'; exit(); }

load_settings();
$output = print_stats_top();
async_opp($output);
crawler_loop(microtime(true), microtime(true));

/**
 * Set scripts variables.
 *
 * Raises PHP's memory and execution-time limits, asks for output buffering
 * to be off, and assigns the global configuration values listed below.
 * $wait_time is declared global but left unset because its assignment is
 * commented out; $system_php_interperter and $system_cwd are declared global
 * but are not assigned in this function.
 */
function init_variables() {
  // Set PHP INI Variables
  ini_set('memory_limit', '256M');
  ini_set('max_execution_time', 2000);
  ini_set('output_buffering', 'off');

  global $url_base, $url_prefix, $webroot_subdir, $ip_address_bypass, $script_name, $script_subdir, $system_php_interperter, $system_cwd, $system_temp_file, $system_log_file, $php_timeout, $wait_time;

  // example.com/ no http:// no www. must have trailing /
  $url_base = 'example.com/';
  // www. or ___. or leave blank
  $url_prefix = 'www.';
  // drupal/ or leave blank must have trailing /
  $webroot_subdir = '';
  // allow this ip to run script: 127.0.0.1
  $ip_address_bypass = '127.0.0.1';
  // name of this file
  $script_name = 'boost_crawler.php';
  // sub directory on server, leave blank if root
  $script_subdir = 'cache/';
  // name of temp file to hold array
  $system_temp_file = 'temp_crawler_settings.txt';
  // name of log file
  $system_log_file = 'log.txt';
  // PHP Script timeout
  $php_timeout = 120;
  // Throttle: 1,000,000 = 1 second
  //$wait_time = 25000;
}

/**
 * Read in temp file.
 *
 * Restores $urls_all (the URL list) and $script_settings (the run
 * statistics) from the serialized temp file when it exists and holds a
 * usable payload. Otherwise both are initialised for a fresh crawl that
 * starts at the site's root URL.
 */
function load_settings() {
  // variables needed
  global $system_temp_file, $url_base, $url_prefix, $webroot_subdir;
  // variables set
  global $urls_all, $script_settings;

  // State used when there is nothing (usable) to resume from.
  $fresh_settings = array('count' => 0, 'time' => microtime(true), 'iterations' => 0, 'max' => 0, 'max_name' => '');

  if (file_exists($system_temp_file)) {
    $raw = @file_get_contents($system_temp_file);
    // SECURITY NOTE(review): write_temp_file() makes this file world-writable
    // (0666) and unserialize() will instantiate objects found in it. Consider
    // a data-only format or restricting allowed classes.
    $saved = ($raw === FALSE) ? FALSE : @unserialize($raw);

    // unserialize() returns FALSE for unreadable or corrupt data; only accept
    // the two-element payload that write_temp_file() produces.
    if (is_array($saved) && count($saved) >= 2) {
      $saved_urls = array_shift($saved);
      $saved_settings = array_shift($saved);
      if (is_array($saved_urls) && is_array($saved_settings)) {
        $urls_all = $saved_urls;
        // Fill in any statistic missing from the saved state.
        $script_settings = $saved_settings + $fresh_settings;
        return;
      }
    }
  }

  // No usable saved state: "prime the pump" with the site's root URL.
  $urls_all = array("http://" . $url_prefix . $url_base . $webroot_subdir);
  $script_settings = $fresh_settings;
}

/**
 * Check that script should start to crawl.
*/
function check_credentials() {
  global $ip_address_bypass;

  // Call self via URL if loaded via system call: without both addresses in
  // $_SERVER there is no requester to authorise, so request this script over
  // HTTP and stop this process.
  if (!isset($_SERVER['REMOTE_ADDR']) || !isset($_SERVER['SERVER_ADDR'])) {
    @call_self_via_url();
    exit();
  }

  // Allowed when the request comes from the server itself or from the
  // configured bypass address; returns TRUE in that case, FALSE otherwise.
  $remote = $_SERVER['REMOTE_ADDR'];
  return ($remote == $_SERVER['SERVER_ADDR']) || ($ip_address_bypass == $remote);
}

/**
 * Output useful info.
 *
 * Builds the status text shown to the requester: the temp file in use, the
 * time elapsed since the crawl started and the crawled/total page counts
 * (same order as the line end_of_crawler() writes to the log).
 *
 * @return string
 *   The status text.
 */
function print_stats_top() {
  global $system_temp_file, $urls_all, $script_settings;

  $elapsed = round(microtime(true) - $script_settings['time'], 2);

  $output = '
';
  $output .= 'Get Data From: ' . $system_temp_file . "\n";
  $output .= 'Total Crawler Time So Far: ' . $elapsed . ' Seconds. ' . $script_settings['count'] . '/' . count($urls_all) . ' Pages Crawled.' . "\n\n";
  return $output;
}

/**
 * Loop that crawls site.
 *
 * Visits every URL in $urls_all that has not been crawled yet, and keeps
 * going pass after pass until the number of crawled pages equals the number
 * of known URLs, at which point end_of_crawler() is called.
 *
 * @param float $start
 *   Timestamp against which the run time of this process is measured.
 * @param float $end
 *   Timestamp of the latest measurement; print_crawler_stats() refreshes it
 *   by reference after each page.
 */
function crawler_loop($start, $end) {
  global $urls_all, $php_timeout, $script_settings, $wait_time;

  while (TRUE) {
    // The first $already_done entries of $urls_all have been crawled; this
    // pass handles everything after them.
    $already_done = $script_settings['count'];
    $pending = array_slice($urls_all, $already_done);
    $processed = 0;

    foreach ($pending as $target) {
      $processed++;

      // Crawl the page, then update the timers and slowest-page statistics.
      get_new_urls($target);
      print_crawler_stats($start, $end, $processed, $already_done, $target);

      // Reset any time limit on php script
      set_time_limit(0);

      // Run time is over the limit: save state and hand over to a new request.
      $elapsed = round($end - $start, 2);
      if ($elapsed >= $php_timeout) {
        $script_settings['iterations']++;
        write_temp_file();
        restart_script();
      }
      // Throttling is switched off; it used to be: usleep($wait_time);

      // Checkpoint the state every 100 pages.
      if ($processed % 100 == 0) {
        write_temp_file();
      }
    }
    write_temp_file();

    // Every known URL has been crawled: finish. Otherwise start another pass
    // over the URLs discovered during this one.
    if (count($urls_all) == $script_settings['count']) {
      end_of_crawler();
      return;
    }
  }
}

/**
 * Crawl one page and queue the links found on it.
 *
 * Increments the crawled-page counter in $script_settings and appends every
 * link returned by load_page() to the end of $urls_all.
 *
 * @param string $url
 *   Address of the page to crawl.
 */
function get_new_urls($url) {
  global $urls_all, $script_settings;

  // Fetch the page and collect its links.
  $found = load_page($url);

  // The page now counts as crawled.
  $script_settings['count']++;

  // Put each discovered link on the "to-do" list.
  foreach ($found as $link) {
    $urls_all[] = $link;
  }
}

/**
 * Advance the lap timer and track the slowest page.
 *
 * Sets $end to the current time and, when the time since the previous value
 * of $end is greater than the recorded maximum, stores that duration and the
 * (twice URL-decoded) page address in $script_settings.
 *
 * @param float $start
 *   Not used by this function.
 * @param float $end
 *   In: previous timestamp. Out: current timestamp.
 * @param int $loop_counter
 *   Not used by this function.
 * @param int $starting_count
 *   Not used by this function.
 * @param string $url
 *   Address of the page that was just crawled.
 */
function print_crawler_stats(&$start, &$end, $loop_counter, $starting_count, $url) {
  global $script_settings;

  // Time spent since the previous measurement.
  $previous = $end;
  $end = microtime(true);
  $lap = $end - $previous;

  // Nothing to record unless this page is the slowest so far.
  if ($lap <= $script_settings['max']) {
    return;
  }

  $script_settings['max'] = $lap;
  $script_settings['max_name'] = urldecode(urldecode($url));
}

/**
 * Save the crawl state to the temp file.
 *
 * De-duplicates $urls_all in place, then serializes the URL list together
 * with $script_settings into $system_temp_file. A failed write is reported
 * with an E_USER_WARNING instead of being ignored.
 */
function write_temp_file() {
  global $urls_all, $script_settings, $system_temp_file;

  $urls_all = array_unique($urls_all);

  // Payload layout read back by load_settings(): [url list, settings].
  $state = serialize(array($urls_all, $script_settings));

  if (@file_put_contents($system_temp_file, $state) === FALSE) {
    // Without this check the failure only surfaced as a chmod() warning
    // about a missing file.
    trigger_error('Unable to write crawler state to ' . $system_temp_file, E_USER_WARNING);
    return;
  }

  // NOTE(review): 0666 leaves the state file world-writable, and its content
  // is later passed to unserialize(); confirm this permission is required.
  chmod($system_temp_file, 0666);
}

/**
 * Start a new request to this script, then end the current process.
 */
function restart_script() {
  // Request this script again over HTTP.
  call_self_via_url();

  // Kill this process.
  exit;
}

/**
 * Calls self via url.
 *
 * Builds this script's own HTTP address from the configured globals, adds
 * the "?async" query string and requests it.
 *
 * @return string|false
 *   The value returned by file_get_contents() for that request.
 */
function call_self_via_url() {
  global $url_prefix, $url_base, $script_subdir, $script_name, $webroot_subdir;

  $location = implode('', array($url_prefix, $url_base, $webroot_subdir, $script_subdir, $script_name));
  $address = 'http://' . $location . '?async';

  return file_get_contents($address);
}

/**
 * Finish the crawl: log a summary line, delete the temp file and exit.
 *
 * The summary (timestamp, crawled/total counts, total minutes, number of
 * script iterations, slowest page) is appended to $system_log_file as one
 * tab-separated line.
 */
function end_of_crawler() {
  global $urls_all, $script_settings, $system_temp_file, $system_log_file;

  // Crawling of the site is complete; collect the summary fields.
  $fields = array(
    '[' . date(DATE_RFC822) . ']',
    'Crawled:' . $script_settings['count'] . '/' . count($urls_all),
    'Total Time: ' . round((microtime(true) - $script_settings['time'])/60.0,2) . ' Minutes',
    'Script Iterations: ' . $script_settings['iterations'],
    'Slowest Page: ' . number_format($script_settings['max'], 2, '.', '') . " Seconds " . $script_settings['max_name'],
  );
  $line = implode("\t", $fields) . "\n";

  // Write to log.
  @file_put_contents($system_log_file, $line, FILE_APPEND);

  // Indexing complete, remove the saved state.
  @unlink($system_temp_file);

  // End of program.
  exit;
}

/**
 * Output text & set php in async mode.
 *
 * Sends $output to the client with headers that ask it to close the
 * connection, and tells PHP to keep running after the client disconnects,
 * so the work done after this call happens in the background.
 *
 * @param string $output
 *   Text to send to the client.
 */
function async_opp($output) {
  // Prime php for background operations: drop anything already buffered.
  // ob_end_clean() raises a notice when no buffer is active, so check first.
  if (ob_get_level() > 0) {
    ob_end_clean();
  }
  // Without an argument ignore_user_abort() only reports the current
  // setting; pass TRUE to actually keep running after a disconnect.
  ignore_user_abort(TRUE);

  // Output text
  ob_start();
  header("Content-type: text/html");
  header("Expires: Wed, 11 Nov 1998 11:11:11 GMT");
  // One combined header: a second Cache-Control header() call would replace
  // the first instead of adding to it.
  header("Cache-Control: no-cache, must-revalidate");
  // NOTE(review): the declared length is one byte shorter than $output;
  // kept as in the original (clamped at 0) - confirm this is intentional.
  header("Content-Length: " . max(0, strlen($output) - 1));
  header("Connection: close");
  print($output);
  ob_end_flush();
  flush();

  // Text returned and connection closed.
  // Do background processing. Time taken after this point should not affect
  // page load times.
}

//Web Crawler & Link Get
/**
 * Download a page and return the internal links found in it.
 *
 * Each href is reduced to the part after $url_base when it contains the
 * site address. Links that still look external, or that point to images,
 * mail addresses, anchors or XML, are dropped. The rest are returned as
 * absolute URLs on 'http://' . $url_prefix . $url_base.
 *
 * @param string $url
 *   Address (or path) of the page to download.
 *
 * @return array
 *   Unique absolute URLs; empty when the page could not be downloaded or
 *   contains no usable links.
 */
function load_page($url) {
  global $url_base, $url_prefix;

  $site_root = "http://" . $url_prefix . $url_base;

  //download file
  $html = @file_get_contents($url);
  if ($html === FALSE || $html === '') {
    // Nothing downloaded, so there is nothing to scan.
    return array();
  }

  //get all links
  preg_match_all("/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $html, $found);
  if (empty($found[1])) {
    return array();
  }

  // Any href containing one of these is treated as external or as a
  // non-page resource.
  $skip_markers = array('http://', 'www.', 'mailto:', '.jpg', '.JPG', '.gif', '.png', '#', '.xml');

  //process links
  $links = array();
  foreach ($found[1] as $page) {
    //kill http://www.yoursite.com/ ($url_base)
    $tail = strstr($page, $url_base);
    if ($tail !== FALSE) {
      $relative = str_replace($url_base, "", $tail);
      if (strlen($relative) > 0) {
        $page = $relative;
      }
    }

    //kill any external links & pics
    $skip = FALSE;
    foreach ($skip_markers as $marker) {
      if (strpos($page, $marker) !== FALSE) {
        $skip = TRUE;
        break;
      }
    }
    if ($skip) {
      continue;
    }

    $absolute = $site_root . $page;

    //Round 2 of link checking: drop anything carrying a second scheme
    if (substr_count($absolute, '://') > 1) {
      continue;
    }

    // Collapse the double slash that a leading "/" in the href produces.
    $links[] = str_replace($url_base . "/", $url_base, $absolute);
  }

  //remove duplicate links
  return array_unique($links);
}