'; exit(); }

// --- Script entry point: restore state, print header, start crawling. ---
load_settings();
print_stats_top();
crawler_loop(microtime(true), microtime(true), 0);

/**
 * Set script variables.
 *
 * Configures PHP runtime limits and defines the global configuration used
 * throughout the crawler (site URL parts, paths, timeouts, temp/log files).
 */
function init_variables() {
  // Set PHP INI Variables
  ini_set('memory_limit', '256M');
  ini_set('max_execution_time', 360);
  ini_set('output_buffering', 'off');

  Global $url_base, $url_prefix, $ip_address_bypass, $script_name, $script_subdir,
         $system_php_interperter, $system_cwd, $system_temp_file, $system_log_file,
         $php_timeout, $cpu_timeout;

  // www. or ___. or leave blank
  $url_prefix = 'www.';
  // example.com/ no http:// no www. must have trailing /
  $url_base = 'example.com/';
  // sub directory on server, leave blank if this script is in root
  $script_subdir = 'cache/';
  // name of this file
  $script_name = 'boost_crawler.php';
  // location of php interperter: /web/cgi-bin/php5 /usr/bin/php
  $system_php_interperter = '/web/cgi-bin/php5';
  // current working dir of webroot: $HOME/html/ /var/www/vhosts/example.com/
  // (single-quoted on purpose: "$HOME" is expanded later by the shell, not by PHP)
  $system_cwd = '$HOME/html/';
  // allow this ip to run script: 127.0.0.1
  $ip_address_bypass = '127.0.0.1';
  // name of temp file to hold array
  $system_temp_file = 'temp_crawler_settings.txt';
  // name of log file
  $system_log_file = 'log.txt';
  // PHP Script timeout (wall-clock seconds before the script restarts itself)
  $php_timeout = 120;
  // max execution time of Script (CPU seconds before the script restarts itself)
  $cpu_timeout = 120;
}

/**
 * Read in temp file.
 *
 * Restores crawler state ($urls_all, $urls_crawled, $script_settings) from
 * the serialized temp file. Falls back to fresh "start of crawl" state when
 * the file is missing, unreadable, or corrupt (previously a corrupt file
 * produced warnings and null state instead of a clean restart).
 */
function load_settings() {
  // variables needed
  Global $system_temp_file, $url_base, $url_prefix;
  // variables set
  Global $urls_all, $urls_crawled, $script_settings;

  $temp_array = false;
  if (file_exists($system_temp_file)) {
    $raw = file_get_contents($system_temp_file);
    if ($raw !== false) {
      // NOTE(review): unserialize() is unsafe on attacker-controlled data;
      // this file is written locally by write_temp_file(), so it is trusted here.
      $temp_array = unserialize($raw);
    }
  }

  if (is_array($temp_array)) {
    // Same order as write_temp_file() stored them.
    $urls_all = array_shift($temp_array);
    $urls_crawled = array_shift($temp_array);
    $script_settings = array_shift($temp_array);
  }
  else {
    $urls_all = array("http://" . $url_prefix . $url_base); //"prime the pump"
    $urls_crawled = array();
    $script_settings = array('time' => microtime(true), 'iterations' => 0, 'max' => 0, 'max_name' => '');
  }
}

/**
 * Check that script should start to crawl.
*/ function check_credentials() { Global $url_prefix, $url_base, $script_subdir, $script_name, $ip_address_bypass; if (isset($_SERVER['REMOTE_ADDR']) && isset($_SERVER['SERVER_ADDR'])) { //remote & local IP is set if (($_SERVER['REMOTE_ADDR'] == $_SERVER['SERVER_ADDR']) || ($ip_address_bypass == $_SERVER['REMOTE_ADDR'])) { return TRUE; } else { return FALSE; } } //Call Self via URL if loaded via system call. else { file_get_contents('http://' . $url_prefix . $url_base . $script_subdir . $script_name); exit(); } } /** * Output usefull info. */ function print_stats_top() { Global $system_temp_file, $urls_all, $urls_crawled, $script_settings; echo '
';
  echo 'Get Data From: ' . $system_temp_file . "\n";
  echo 'Total Crawler Time So Far: ' . round(microtime(true) - $script_settings['time'],2) . ' Seconds. ' . count($urls_all) . '/' . count($urls_crawled) . ' Pages Crawled.' . "\n\n";
}

/**
 * Main crawl loop.
 *
 * Works through every URL in $urls_all that has not been crawled yet. Each
 * pass crawls one page, reports progress, and periodically persists state.
 * When the wall-clock or CPU budget is exhausted the script saves state and
 * respawns itself; when the to-do and done lists match, crawling is finished.
 */
function crawler_loop($start, $end, $t_total) {
  Global $urls_all, $urls_crawled, $php_timeout, $cpu_timeout, $script_settings;

  // Everything before this offset has already been crawled.
  $already_done = count($urls_crawled);
  // Remaining work: the uncrawled tail of the full URL list.
  $pending = array_slice($urls_all, $already_done);

  $processed = 0;
  foreach ($pending as $target) {
    $processed++;

    // Crawl this page and harvest any new links from it.
    get_new_urls($target);

    // Report progress for this page (updates $start/$end/$t_total by reference).
    print_crawler_stats($start, $end, $t_total, $processed, $already_done, $target);

    $cpu_exceeded  = round($t_total / 1000000.0, 4) >= $cpu_timeout;
    $wall_exceeded = round($end - $start, 2) >= $php_timeout;
    if ($cpu_exceeded || $wall_exceeded) {
      // Budget exhausted: persist state, then hand off to a fresh process.
      $script_settings['iterations']++;
      write_temp_file();
      restart_script();
    }
    else {
      // Pause 0.1 seconds between requests.
      usleep(100000);
    }

    // Persist state every 100 urls crawled.
    if (($processed % 100) == 0) {
      write_temp_file();
    }
  }
  write_temp_file();

  // New links were discovered while crawling this batch; go around again.
  if (count($urls_all) != count($urls_crawled)) {
    crawler_loop($start, $end, $t_total);
  }
  else {
    end_of_crawler();
  }
}

/**
 * Crawl one URL and record the results.
 *
 * Marks $url as crawled and appends every link found on the page to the
 * master to-do list ($urls_all). Duplicates are allowed to accumulate here;
 * they are pruned later by write_temp_file().
 */
function get_new_urls($url) {
  Global $urls_all, $urls_crawled;

  // Fetch the page and extract its links.
  $found = load_page($url);

  // This page is now done.
  $urls_crawled[] = $url;

  // Queue everything we found for a future pass.
  foreach ($found as $link) {
    $urls_all[] = $link;
  }
}

/**
 * Outputs current stats to screen.
 *
 * Updates the shared timers (passed by reference so crawler_loop sees them),
 * prints one progress line for $url, and tracks the slowest page crawled so
 * far in $script_settings.
 *
 * @param float  $start          wall-clock time the current run began
 * @param float  $end            wall-clock time of the previous iteration (updated here)
 * @param int    $t_total        cumulative user CPU time in microseconds (updated here)
 * @param int    $loop_counter   pages crawled during this run
 * @param int    $starting_count pages crawled by previous runs
 * @param string $url            page just crawled
 */
function print_crawler_stats(&$start, &$end, &$t_total, $loop_counter, $starting_count, $url) {
  Global $script_settings;

  // Timers
  $beg = $end;
  $end = microtime(true); //stop clock
  $dat = getrusage(); //stop clock
  // BUGFIX: the old code read only ru_utime.tv_usec (the sub-second remainder
  // of user CPU time) and summed successive absolute snapshots, so whole CPU
  // seconds were discarded and $t_total was meaningless as a CPU-timeout
  // gauge. Use the full cumulative user CPU time in microseconds instead.
  $t_total = $dat['ru_utime.tv_sec'] * 1000000 + $dat['ru_utime.tv_usec'];

  // Output Info: total pages, wall time, per-page time, CPU time, URL.
  echo ($loop_counter+$starting_count) . ", ";
  echo number_format($end - $start, 2, '.', ''). ", ";
  echo number_format($end - $beg, 3, '.', ''). ", ";
  echo number_format($t_total/1000000.0, 2, '.', '') . " seconds, \t";
  echo "Crawling " .urldecode(urldecode($url)). "\n";

  // Track the slowest page seen so far.
  if (($end-$beg) > $script_settings['max']) {
    $script_settings['max'] = ($end-$beg);
    $script_settings['max_name'] = urldecode(urldecode($url));
  }
  flush();
  set_time_limit(0); //reset any time limit on php script
}

/**
 * Persists crawler state to the temp file.
 *
 * De-duplicates both URL lists, then serializes [urls_all, urls_crawled,
 * script_settings] — in that order, because load_settings() shifts them back
 * out in the same order — into $system_temp_file.
 */
function write_temp_file() {
  Global $urls_all, $urls_crawled, $script_settings, $php_timeout, $cpu_timeout, $system_temp_file;

  echo 'writing';

  // Drop duplicate URLs before saving.
  $urls_all = array_unique($urls_all);
  $urls_crawled = array_unique($urls_crawled);

  // Order matters: load_settings() array_shift()s these back in sequence.
  $payload = serialize(array($urls_all, $urls_crawled, $script_settings));
  echo '... ';
  $bytes = file_put_contents($system_temp_file, $payload);
  echo 'wrote ' . $bytes . " bytes to \t" . $system_temp_file . "\n";
}

/**
 * Restarts the crawler in a fresh process.
 *
 * Waits one second, launches another copy of this script in the background
 * via the shell, then terminates the current process.
 */
function restart_script() {
  global $system_php_interperter, $system_cwd, $script_subdir, $script_name;

  // Give the system a beat before respawning.
  sleep(1);

  // Background invocation; "$HOME" in $system_cwd is expanded by the shell.
  $cmd = $system_php_interperter . ' "' . $system_cwd . $script_subdir . $script_name . '" > /dev/null &';
  call_self_via_system($cmd); //Call Self

  exit(); //Kill This Process
}

/**
 * Calls self via system call.
 *
 * Runs $command through the shell and prints timing diagnostics. A very
 * short elapsed time suggests the trailing "&" detached the child process
 * and the async execution worked.
 */
function call_self_via_system($command) {
  $dump = array();
  $start = microtime(true);
  exec($command, $dump);
  $end = microtime(true);

  echo "\n" . $command;
  echo "\nTime to Run Command " . round($end - $start,4) . " seconds. Low number (less then a second) means async execution worked most likely.";
  echo "\n" . print_r($dump, true);
  echo "\n";
}

/**
 * Outputs final stats and writes to log file.
 *
 * Called when every discovered URL has been crawled: prints summary stats,
 * appends one tab-separated log line, deletes the temp state file, and ends
 * the program.
 */
function end_of_crawler() {
  Global $urls_all, $urls_crawled, $script_settings, $cpu_timeout, $system_temp_file, $system_log_file;

  // Crawling of site is complete; output interesting stats.
  echo count($urls_all) . ", " . count($urls_crawled) . "\n";
  echo "Total Time: " . round((microtime(true) - $script_settings['time'])/60.0,2) . " Minutes\n";
  echo '';
  ob_flush(); //Flush Buffers

  // Write To Log
  $txt = '[' . date(DATE_RFC822) . ']';
  $txt .= "\t" . 'Crawled:' . count($urls_crawled) . '/' . count($urls_all);
  $txt .= "\t" . 'Total Time: ' . round((microtime(true) - $script_settings['time'])/60.0,2) . ' Minutes';
  $txt .= "\t" . 'Script Iterations: ' . $script_settings['iterations'];
  $txt .= "\t" . 'Slowest Page: ' . number_format($script_settings['max'], 2, '.', '') . " Seconds " . $script_settings['max_name'] . "\n";
  file_put_contents($system_log_file, $txt, FILE_APPEND);

  @unlink($system_temp_file); //Indexing Complete, Kill Temp Files
  exit(); //End Of Program
}

/**
 * Web Crawler.
 *
 * Downloads $url, extracts every <a href="..."> link, keeps only same-site
 * page links (external hosts, mailto:, images, anchors and .xml files are
 * discarded), and returns them as absolute URLs with duplicates removed.
 */
function load_page($url) {
  Global $url_base, $url_prefix;

  //download file
  $var = file_get_contents($url);
  if ($var === false) {
    // BUGFIX: fetch failed — return no links instead of letting
    // preg_match_all() run on false and emit warnings.
    return array();
  }

  //get all links
  preg_match_all("/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $var, $matches);

  //process links
  $matches = $matches[1];
  $temp_list = array();
  foreach($matches as $page) {
    //strip the leading http://www.example.com/ ($url_base) so on-site links become relative
    $temp = str_replace($url_base,"",strstr($page, $url_base));
    if (strlen($temp)>0) { $page = $temp; }

    //kill any external links & pics
    if (!(
      strlen(strstr($page, 'http://'))>0
      || strlen(strstr($page, 'www.'))>0
      || strlen(strstr($page, 'mailto:'))>0
      || strlen(strstr($page, '.jpg'))>0
      || strlen(strstr($page, '.JPG'))>0
      || strlen(strstr($page, '.gif'))>0
      || strlen(strstr($page, '.png'))>0
      || strlen(strstr($page, '#'))>0
      || strlen(strstr($page, '.xml'))>0
      )) {
      //store url in array (rebuilt as an absolute same-site URL)
      $temp_list[] = "http://" . $url_prefix . $url_base . $page;
    }
  }

  //Round 2 of link checking: drop anything with more than one scheme marker.
  // BUGFIX: the old test referenced the undefined variable $base_url, and its
  // strlen(...)<0 clause could never be true (strlen is never negative), so
  // only the substr_count() check ever mattered; it is kept as the sole
  // condition — behavior is unchanged, the undefined-variable notice is gone.
  $matches = array();
  foreach($temp_list as $page) {
    if (substr_count($page, '://') <= 1) {
      // Collapse an accidental double slash right after the site root.
      $page = str_replace($url_base . "/", $url_base, $page);
      $matches[] = $page;
    }
  }

  //remove duplicate links
  return array_unique($matches);
}