'; exit(); } /* NOTE(review): tail of a statement/definition that begins before this excerpt — left untouched. */
// Top-level driver: restore saved state, emit the stats page, detach from the
// HTTP client, then start (or resume) crawling.
load_settings();
$output = print_stats_top();
async_opp($output);
crawler_loop(microtime(true), microtime(true));
/**
 * Set scripts variables.
 *
 * Raises PHP memory/time limits and defines the configuration globals read by
 * the rest of the script. Edit the assignments below to match the site.
 */
function init_variables() {
    // Set PHP INI Variables
    ini_set('memory_limit', '256M');
    ini_set('max_execution_time', 2000);
    ini_set('output_buffering', 'off');
    Global $url_base, $url_prefix, $webroot_subdir, $ip_address_bypass, $script_name, $script_subdir, $system_php_interperter, $system_cwd, $system_temp_file, $system_log_file, $php_timeout, $wait_time;
    // example.com/ no http:// no www. must have trailing /
    $url_base = 'example.com/';
    // www. or ___. or leave blank
    $url_prefix = 'www.';
    // drupal/ or leave blank must have trailing /
    $webroot_subdir = '';
    // allow this ip to run script: 127.0.0.1
    $ip_address_bypass = '127.0.0.1';
    // name of this file
    $script_name = 'boost_crawler.php';
    // sub directory on server, leave blank if root
    $script_subdir = 'cache/';
    // name of temp file to hold array
    $system_temp_file = 'temp_crawler_settings.txt';
    // name of log file
    $system_log_file = 'log.txt';
    // PHP Script timeout (seconds): one crawler_loop() run restarts itself after this long
    $php_timeout = 120;
    // Throttle: 1,000,000 = 1 second (left disabled; the usleep call in crawler_loop() is commented out too)
    //$wait_time = 25000;
}
/**
 * Read in temp file.
 *
 * Restores $urls_all (the crawl queue) and $script_settings (the counters)
 * from the serialized temp file; on a fresh start, seeds the queue with the
 * site root URL and zeroed counters.
 * NOTE(review): unserialize() of a file in a web-accessible directory is a
 * known object-injection risk — confirm the temp file cannot be written by
 * untrusted users.
 */
function load_settings() {
    // variables needed
    Global $system_temp_file, $url_base, $url_prefix, $webroot_subdir;
    // variables set
    Global $urls_all, $script_settings;
    $temp_array = array();
    if (file_exists($system_temp_file)) {
        $temp_array = unserialize(@file_get_contents($system_temp_file));
        $urls_all = array_shift($temp_array);
        $script_settings = array_shift($temp_array);
    } else {
        $urls_all = array("http://" . $url_prefix . $url_base . $webroot_subdir); //"prime the pump"
        $script_settings = array('count' => 0, 'time' => microtime(true), 'iterations' => 0, 'max' => 0, 'max_name' => '');
    }
}
/**
 * Check that script should start to crawl.
 *
 * Returns TRUE when the request comes from the server itself or the bypass
 * IP, FALSE for any other web visitor. When no request addresses are set
 * (loaded via a system call rather than HTTP), it re-invokes itself over
 * HTTP with ?async and exits.
 */
function check_credentials() {
    Global $url_prefix, $url_base, $script_subdir, $script_name, $ip_address_bypass, $webroot_subdir;
    if (isset($_SERVER['REMOTE_ADDR']) && isset($_SERVER['SERVER_ADDR'])) {
        //remote & local IP is set
        if (($_SERVER['REMOTE_ADDR'] == $_SERVER['SERVER_ADDR']) || ($ip_address_bypass == $_SERVER['REMOTE_ADDR'])) {
            return TRUE;
        } else {
            return FALSE;
        }
    }
    //Call Self via URL if loaded via system call.
    else {
        @file_get_contents('http://' . $url_prefix . $url_base . $webroot_subdir . $script_subdir . $script_name . '?async');
        exit();
    }
}
/**
 * Output useful info.
 *
 * Builds the human-readable status header sent to the caller: where the
 * state file lives, elapsed crawl time, and crawled/total page counts.
 */
function print_stats_top() {
    Global $system_temp_file, $urls_all, $urls_crawled, $script_settings;
    $output = '
';
    $output .= 'Get Data From: ' . $system_temp_file . "\n";
    $output .= 'Total Crawler Time So Far: ' . round(microtime(true) - $script_settings['time'],2) . ' Seconds. ' . count($urls_all) . '/' . $script_settings['count'] . ' Pages Crawled.' . "\n\n";
    return $output;
}
/**
 * Loop that crawls site.
 *
 * Walks $urls_all from the last crawled position, fetching each page and
 * queueing newly discovered links. State is checkpointed every 100 pages and
 * whenever elapsed time exceeds $php_timeout — in that case the script
 * restarts itself via restart_script() and never returns. When the queue
 * stops growing, end_of_crawler() writes the log and exits.
 *
 * @param float $start microtime(true) stamp of when this run began.
 * @param float $end   Rolling microtime(true) stamp; advanced *by reference*
 *                     inside print_crawler_stats() each iteration, which is
 *                     what keeps the timeout comparison below current.
 */
function crawler_loop($start, $end) {
Global $urls_all, $php_timeout, $script_settings, $wait_time;
// set counter: resume from the number of pages already crawled
$starting_count = $script_settings['count'];
//slice the urls_all array: only the not-yet-crawled tail
$sliced_urls_list = array_slice($urls_all, $starting_count);
$loop_counter = 0;
foreach ($sliced_urls_list as $url) {
$loop_counter++;
// crawl url (appends new links to $urls_all and bumps 'count')
get_new_urls($url);
// Print output to screen (also refreshes $end by reference)
print_crawler_stats($start, $end, $loop_counter, $starting_count, $url);
// Reset any time limit on php script
set_time_limit(0);
//Timer has gone over the limit; end this run and restart
if ((round($end - $start,2) >= $php_timeout)) {
$script_settings['iterations']++;
write_temp_file();
restart_script(); // exits — nothing below runs in this process
}
else {
// Wait so we don't melt the server.
//usleep($wait_time);
}
// write file every 100 urls crawled
if(($loop_counter % 100) == 0) {
write_temp_file();
}
}
write_temp_file();
// if array count doesn't match, there's more to crawl.
// NOTE(review): write_temp_file() runs array_unique() on $urls_all, which can
// shrink count($urls_all); if it ever drops below 'count', this recursion
// would repeat over an empty slice forever — confirm against real crawl data.
if (count($urls_all) != $script_settings['count']) {
crawler_loop($start, $end);
}
else {
end_of_crawler();
}
}
/**
 * Crawl a single page and append any discovered links to the queue.
 *
 * Fetches $url through load_page(), bumps the global crawled-page counter,
 * and pushes every discovered link onto the global $urls_all to-do list
 * (de-duplication happens later, in write_temp_file()).
 */
function get_new_urls($url) {
	global $urls_all, $script_settings;
	// Fetch the page and extract its internal links.
	$discovered = load_page($url);
	// One more page has been processed.
	$script_settings['count']++;
	// Queue each discovered link for a later pass.
	foreach ($discovered as $link) {
		array_push($urls_all, $link);
	}
}
/**
 * Update the crawl timers and record the slowest page seen so far.
 *
 * Advances $end (by reference) to the current microtime so the caller's
 * timeout check stays accurate; the previous $end becomes this page's start
 * time. $start is accepted by reference for signature symmetry but is never
 * modified here. The slowest page's double-urldecoded URL is kept in
 * $script_settings['max_name'].
 */
function print_crawler_stats(&$start, &$end, $loop_counter, $starting_count, $url) {
	global $script_settings;
	// The previous stop time is this page's start time.
	$previous = $end;
	$end = microtime(true); // stop the clock for this page
	$elapsed = $end - $previous;
	// Track the single slowest page (URL stored human-readable).
	if ($elapsed > $script_settings['max']) {
		$script_settings['max'] = $elapsed;
		$script_settings['max_name'] = urldecode(urldecode($url));
	}
}
/**
 * Persist crawler state (URL queue + counters) to the temp file.
 *
 * De-duplicates the global queue first, then serializes [urls, settings] in
 * the order load_settings() array_shift()s them back out, and marks the file
 * world-writable so later runs (possibly under a different user) can update
 * it.
 *
 * Fix: chmod() is now only attempted when the write succeeded — the original
 * chmod'd unconditionally and emitted a warning whenever the file could not
 * be created (the write itself was already @-suppressed best-effort).
 */
function write_temp_file() {
	Global $urls_all, $script_settings, $php_timeout, $system_temp_file;
	// Drop duplicate URLs. Keys may become non-contiguous; callers slice by
	// position (array_slice), so that is harmless.
	$urls_all = array_unique($urls_all);
	$temp_array = array();
	$temp_array[] = $urls_all;
	$temp_array[] = $script_settings;
	$s = serialize($temp_array);
	// Best-effort persistence: tolerate write failure, but never chmod a file
	// that was never written.
	if (@file_put_contents($system_temp_file, $s) !== false) {
		@chmod($system_temp_file, 0666);
	}
}
/**
 * Hand off to a fresh copy of this script, then terminate this process.
 *
 * The self-request spawns the next crawl iteration; exiting immediately
 * afterwards keeps any single PHP process from running past its timeout.
 * Never returns.
 */
function restart_script() {
	call_self_via_url(); // spawn the successor request
	exit; // kill this process
}
/**
 * Calls self via url.
 *
 * Issues an HTTP GET to this script's own address with the ?async flag set
 * (the components come from the configuration globals) and returns whatever
 * file_get_contents() produced.
 */
function call_self_via_url() {
	global $url_prefix, $url_base, $script_subdir, $script_name, $webroot_subdir;
	$self = sprintf(
		'http://%s%s%s%s%s?async',
		$url_prefix,
		$url_base,
		$webroot_subdir,
		$script_subdir,
		$script_name
	);
	return file_get_contents($self);
}
/**
 * Finish up: append a summary record to the log, remove the state file, exit.
 *
 * Runs once the queue length matches the crawled count (site crawl is
 * complete). Writes one tab-separated log line with totals, elapsed minutes,
 * restart iterations, and the slowest page. Never returns.
 */
function end_of_crawler() {
	global $urls_all, $script_settings, $system_temp_file, $system_log_file;
	// One tab-separated summary line per completed crawl.
	$entry = '[' . date(DATE_RFC822) . ']';
	$entry .= sprintf("\tCrawled:%s/%s", $script_settings['count'], count($urls_all));
	$entry .= sprintf("\tTotal Time: %s Minutes", round((microtime(true) - $script_settings['time']) / 60.0, 2));
	$entry .= sprintf("\tScript Iterations: %s", $script_settings['iterations']);
	$entry .= sprintf("\tSlowest Page: %s Seconds %s\n", number_format($script_settings['max'], 2, '.', ''), $script_settings['max_name']);
	@file_put_contents($system_log_file, $entry, FILE_APPEND);
	// Indexing complete — the saved queue is no longer needed.
	@unlink($system_temp_file);
	exit; // end of program
}
/**
 * Output text & set php in async mode.
 *
 * Sends $output to the client with an explicit Content-Length and
 * "Connection: close", flushes, and keeps the process alive afterwards
 * (ignore_user_abort) so crawling continues in the background without
 * holding up the page load.
 *
 * Fixes:
 * - Content-Length was declared as strlen($output) - 1, one byte short of the
 *   body actually printed, causing clients to drop the final character (or
 *   mis-handle the close). It now matches the printed byte count exactly.
 * - ob_end_clean() is only called when a buffer is actually open, avoiding a
 *   notice when output buffering is off (as this script configures it).
 */
function async_opp($output) {
	// Prime php for background operations
	if (ob_get_level() > 0) {
		ob_end_clean(); // discard anything already buffered
	}
	header("Connection: close");
	ignore_user_abort();
	// Output text
	ob_start();
	header("Content-type: text/html");
	header("Expires: Wed, 11 Nov 1998 11:11:11 GMT");
	header("Cache-Control: no-cache");
	header("Cache-Control: must-revalidate");
	// Declared length must equal the bytes printed below, or the client
	// truncates/waits instead of closing cleanly.
	header("Content-Length: " . strlen($output));
	header("Connection: close");
	print($output);
	ob_end_flush();
	flush();
	// text returned and connection closed.
	// Do background processing. Time taken after should not effect page load times.
}
/**
 * Web crawler & link get: fetch a page and return its internal links.
 *
 * Downloads $url, scrapes every <a href="..."> value, converts site-internal
 * paths to absolute "http://{$url_prefix}{$url_base}{path}" URLs, and filters
 * out external links, images, mailto:, fragment links and .xml files.
 *
 * Fixes:
 * - Round 2 referenced the undefined variable $base_url (the global is named
 *   $url_base), which raised an undefined-variable notice/warning on every
 *   call; it now uses $url_base, matching the prefix built in round 1.
 * - The guard "strlen(strstr(...)) < 0" could never be true (strlen() is
 *   never negative); the intended check — reject entries that are missing the
 *   canonical prefix — is now an explicit strstr(...) === false test. Every
 *   entry in $temp_list is constructed with that prefix, so accepted output
 *   is unchanged.
 *
 * @param string $url Address (or any path readable by file_get_contents()).
 * @return array Unique absolute internal URLs (array_unique preserves keys).
 */
function load_page($url) {
	Global $url_base, $url_prefix;
	// download file (best-effort: failures simply yield no links)
	$var = @file_get_contents($url);
	// get all links
	preg_match_all("/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $var, $matches);
	// process links
	$matches = $matches[1];
	$temp_list = array();
	foreach ($matches as $page) {
		// kill http://www.yoursite.com/ ($url_base): strip everything up to and
		// including the site base, leaving a site-relative path
		$temp = str_replace($url_base, "", strstr($page, $url_base));
		if (strlen($temp) > 0) {
			$page = $temp;
		}
		// kill any external links & pics
		if (!( strlen(strstr($page, 'http://'))>0
			|| strlen(strstr($page, 'www.'))>0
			|| strlen(strstr($page, 'mailto:'))>0
			|| strlen(strstr($page, '.jpg'))>0
			|| strlen(strstr($page, '.JPG'))>0
			|| strlen(strstr($page, '.gif'))>0
			|| strlen(strstr($page, '.png'))>0
			|| strlen(strstr($page, '#'))>0
			|| strlen(strstr($page, '.xml'))>0
		)) {
			// store absolute url in array
			$temp_list[] = "http://" . $url_prefix . $url_base . $page;
		}
	}
	// Round 2 of link checking: drop malformed entries
	$matches = array();
	foreach ($temp_list as $page) {
		// reject entries missing the canonical prefix or carrying a second scheme
		if (!(strstr($page, 'http://' . $url_prefix . $url_base) === false || substr_count($page, '://') > 1)) {
			// collapse the double slash left when the scraped path began with "/"
			$page = str_replace($url_base . "/", $url_base, $page);
			$matches[] = $page;
		}
	}
	// remove duplicate links
	return array_unique($matches);
}