';
  exit();
}

load_settings();
print_stats_top();
crawler_loop(microtime(true), microtime(true), 0);

/**
 * Set script variables.
 */
function init_variables() {
  // Set PHP INI variables.
  ini_set('memory_limit', '256M');
  ini_set('max_execution_time', 360);
  ini_set('output_buffering', 'off');
  global $url_base, $url_prefix, $ip_address_bypass, $script_name, $script_subdir, $system_php_interpreter, $system_cwd, $system_temp_file, $system_log_file, $php_timeout, $cpu_timeout;
  // www. or ___. or leave blank.
  $url_prefix = 'www.';
  // example.com/ - no http://, no www., must have a trailing /.
  $url_base = 'example.com/';
  // Subdirectory on the server; leave blank if this script is in the root.
  $script_subdir = 'cache/';
  // Name of this file.
  $script_name = 'boost_crawler.php';
  // Location of the PHP interpreter: /web/cgi-bin/php5 or /usr/bin/php.
  $system_php_interpreter = '/web/cgi-bin/php5';
  // Current working dir of the web root: $HOME/html/ or /var/www/vhosts/example.com/.
  $system_cwd = '$HOME/html/';
  // Allow this IP to run the script: 127.0.0.1.
  $ip_address_bypass = '127.0.0.1';
  // Name of the temp file that holds the crawl arrays.
  $system_temp_file = 'temp_crawler_settings.txt';
  // Name of the log file.
  $system_log_file = 'log.txt';
  // PHP script timeout (wall-clock seconds).
  $php_timeout = 120;
  // Max execution time of the script (CPU seconds).
  $cpu_timeout = 120;
}

/**
 * Read in the temp file.
 */
function load_settings() {
  // Variables needed.
  global $system_temp_file, $url_base, $url_prefix;
  // Variables set.
  global $urls_all, $urls_crawled, $script_settings;
  $temp_array = array();
  if (file_exists($system_temp_file)) {
    $temp_array = unserialize(file_get_contents($system_temp_file));
    $urls_all = array_shift($temp_array);
    $urls_crawled = array_shift($temp_array);
    $script_settings = array_shift($temp_array);
  }
  else {
    // "Prime the pump" with the site's front page.
    $urls_all = array("http://" . $url_prefix . $url_base);
    $urls_crawled = array();
    $script_settings = array('time' => microtime(true), 'iterations' => 0, 'max' => 0, 'max_name' => '');
  }
}

/**
 * Check that the script should start to crawl.
 */
function check_credentials() {
  global $url_prefix, $url_base, $script_subdir, $script_name, $ip_address_bypass;
  // Remote & local IP are set: only allow the server itself or the bypass IP.
  if (isset($_SERVER['REMOTE_ADDR']) && isset($_SERVER['SERVER_ADDR'])) {
    if (($_SERVER['REMOTE_ADDR'] == $_SERVER['SERVER_ADDR']) || ($ip_address_bypass == $_SERVER['REMOTE_ADDR'])) {
      return TRUE;
    }
    else {
      return FALSE;
    }
  }
  // Loaded via a system call: call self via URL instead.
  else {
    file_get_contents('http://' . $url_prefix . $url_base . $script_subdir . $script_name);
    exit();
  }
}

/**
 * Output useful info.
 */
function print_stats_top() {
  global $system_temp_file, $urls_all, $urls_crawled, $script_settings;
  echo "\n";
  echo 'Get Data From: ' . $system_temp_file . "\n";
  echo 'Total Crawler Time So Far: ' . round(microtime(true) - $script_settings['time'], 2) . ' Seconds. ' . count($urls_crawled) . '/' . count($urls_all) . ' Pages Crawled.' . "\n\n";
}
/**
 * Loop that crawls the site.
 */
function crawler_loop($start, $end, $t_total) {
  global $urls_all, $urls_crawled, $php_timeout, $cpu_timeout, $script_settings;
  // Set counter.
  $starting_count = count($urls_crawled);
  // Slice the $urls_all array down to the URLs not yet crawled.
  $sliced_urls_list = array_slice($urls_all, $starting_count);
  $loop_counter = 0;
  foreach ($sliced_urls_list as $url) {
    $loop_counter++;
    // Crawl the URL.
    get_new_urls($url);
    // Print output to screen.
    print_crawler_stats($start, $end, $t_total, $loop_counter, $starting_count, $url);
    // Timer has gone over the limit; end this run and restart.
    if ((round($t_total / 1000000.0, 4) >= $cpu_timeout) || (round($end - $start, 2) >= $php_timeout)) {
      $script_settings['iterations']++;
      write_temp_file();
      restart_script();
    }
    else {
      // Wait for 0.1 seconds.
      usleep(100000);
    }
    // Write the file every 100 URLs crawled.
    if (($loop_counter % 100) == 0) {
      write_temp_file();
    }
  }
  write_temp_file();
  // If the array counts don't match, there's more to crawl.
  if (count($urls_all) != count($urls_crawled)) {
    crawler_loop($start, $end, $t_total);
  }
  else {
    end_of_crawler();
  }
}
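
// Note on the resume logic above: new URLs are only appended to $urls_all and
// pages are crawled in order, so array_slice($urls_all, count($urls_crawled))
// is exactly the "to-do" list. A minimal sketch with illustrative values:
//
//   $urls_all     = array('a', 'b', 'c', 'd', 'e');
//   $urls_crawled = array('a', 'b');
//   $todo = array_slice($urls_all, count($urls_crawled)); // array('c', 'd', 'e')
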
/**
 * Adds new URLs to the arrays.
 */
function get_new_urls($url) {
  global $urls_all, $urls_crawled;
  // Crawl the URL.
  $newlinks = load_page($url);
  // Add the crawled URL to the done list.
  $urls_crawled[] = $url;
  foreach ($newlinks as $newurl) {
    // Add links to the "to-do" array.
    $urls_all[] = $newurl;
  }
}
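
// get_new_urls() appends without checking for duplicates; $urls_all is only
// de-duplicated later, by write_temp_file(). For example, array_unique()
// keeps the first occurrence of each value:
//
//   $urls_all = array('a', 'b', 'a');
//   $urls_all = array_unique($urls_all); // array('a', 'b')
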
/**
 * Outputs current stats to screen.
 */
function print_crawler_stats(&$start, &$end, &$t_total, $loop_counter, $starting_count, $url) {
  global $script_settings;
  // Timers.
  $beg = $end;
  $end = microtime(true); // Stop the wall clock.
  $dat = getrusage(); // Snapshot CPU usage.
  $t_after = $dat['ru_utime.tv_usec'];
  // Accumulate the microseconds component of user CPU time (rough heuristic).
  $t_total += $t_after;
  // Output info.
  echo ($loop_counter + $starting_count) . ", ";
  echo number_format($end - $start, 2, '.', '') . ", ";
  echo number_format($end - $beg, 3, '.', '') . ", ";
  echo number_format($t_total / 1000000.0, 2, '.', '') . " seconds, \t";
  echo "Crawling " . urldecode(urldecode($url)) . "\n";
  // Record the slowest page so far.
  if (($end - $beg) > $script_settings['max']) {
    $script_settings['max'] = ($end - $beg);
    $script_settings['max_name'] = urldecode(urldecode($url));
  }
  flush();
  set_time_limit(0); // Reset any time limit on the PHP script.
}
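
// Note on the CPU timer above: getrusage() splits user CPU time into
// 'ru_utime.tv_sec' (whole seconds) and 'ru_utime.tv_usec' (the 0-999999
// microseconds remainder). Summing only the tv_usec snapshots is a rough
// heuristic, not an exact total; a sketch that reads both fields would be:
//
//   $dat = getrusage();
//   $user_cpu_seconds = $dat['ru_utime.tv_sec'] + $dat['ru_utime.tv_usec'] / 1000000.0;
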
/**
 * Writes settings to a local file.
 */
function write_temp_file() {
  global $urls_all, $urls_crawled, $script_settings, $php_timeout, $cpu_timeout, $system_temp_file;
  echo 'writing';
  // Remove duplicates before saving.
  $urls_all = array_unique($urls_all);
  $urls_crawled = array_unique($urls_crawled);
  $temp_array = array();
  $temp_array[] = $urls_all;
  $temp_array[] = $urls_crawled;
  $temp_array[] = $script_settings;
  $s = serialize($temp_array);
  echo '... ';
  echo 'wrote ' . file_put_contents($system_temp_file, $s) . " bytes to \t" . $system_temp_file . "\n";
}
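
// The temp file is one serialize()d array of three elements, in this order:
// the full URL list, the crawled URL list, and the settings array. A sketch
// of inspecting it by hand (path per $system_temp_file above):
//
//   $state = unserialize(file_get_contents('temp_crawler_settings.txt'));
//   list($all, $crawled, $settings) = $state;
//   echo count($crawled) . ' of ' . count($all) . " URLs crawled\n";
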
/**
 * Restarts script.
 */
function restart_script() {
  global $system_php_interpreter, $system_cwd, $script_subdir, $script_name;
  sleep(1); // Wait one second.
  $command = $system_php_interpreter . ' "' . $system_cwd . $script_subdir . $script_name . '" > /dev/null &';
  call_self_via_system($command); // Call self.
  exit(); // Kill this process.
}
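
// With the default settings above, the composed command looks like:
//
//   /web/cgi-bin/php5 "$HOME/html/cache/boost_crawler.php" > /dev/null &
//
// The trailing "&" backgrounds the new process so this one can exit cleanly.
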
/**
 * Calls self via system call.
 */
function call_self_via_system($command) {
  $dump = array();
  $start = microtime(true);
  exec($command, $dump);
  $end = microtime(true);
  echo "\n" . $command;
  echo "\nTime to Run Command: " . round($end - $start, 4) . " seconds. A low number (less than a second) most likely means async execution worked.";
  echo "\n" . print_r($dump, true);
  echo '';
}
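
// Why the "> /dev/null" in the command matters: exec() blocks until the
// command's output stream closes, so even a backgrounded ("&") command must
// have its output redirected or PHP will wait for it to finish. The timing
// printed above is a cheap check that the new process actually detached.
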
/**
 * Outputs final stats and writes to log file.
 */
function end_of_crawler() {
  global $urls_all, $urls_crawled, $script_settings, $cpu_timeout, $system_temp_file, $system_log_file;
  // Loop done; crawling of the site is complete.
  // Output interesting stats.
  echo count($urls_all) . ", " . count($urls_crawled) . "\n";
  echo "Total Time: " . round((microtime(true) - $script_settings['time']) / 60.0, 2) . " Minutes\n";
  echo '';
  ob_flush(); // Flush buffers.
  // Write to log.
  $txt = '[' . date(DATE_RFC822) . ']';
  $txt .= "\t" . 'Crawled:' . count($urls_crawled) . '/' . count($urls_all);
  $txt .= "\t" . 'Total Time: ' . round((microtime(true) - $script_settings['time']) / 60.0, 2) . ' Minutes';
  $txt .= "\t" . 'Script Iterations: ' . $script_settings['iterations'];
  $txt .= "\t" . 'Slowest Page: ' . number_format($script_settings['max'], 2, '.', '') . " Seconds " . $script_settings['max_name'] . "\n";
  file_put_contents($system_log_file, $txt, FILE_APPEND);
  @unlink($system_temp_file); // Indexing complete; kill the temp file.
  exit(); // End of program.
}
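
// Each completed crawl appends one tab-separated line to the log. With
// illustrative values (DATE_RFC822 format) it looks like:
//
//   [Mon, 05 Jan 09 12:00:00 +0000]  Crawled:350/350  Total Time: 12.5 Minutes  Script Iterations: 3  Slowest Page: 4.20 Seconds http://www.example.com/about
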
/**
 * Web crawler: downloads a page and extracts its internal links.
 */
function load_page($url) {
  global $url_base, $url_prefix;
  // Download the file.
  $var = file_get_contents($url);
  // Get all links.
  preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>" . "([^<]+|.*?)?<\/a>/", $var, $matches);
  // Process links.
  $matches = $matches[1];
  $temp_list = array();
  foreach ($matches as $page) {
    // Strip http://www.example.com/ ($url_base) so internal absolute links become relative.
    $temp = str_replace($url_base, "", strstr($page, $url_base));
    if (strlen($temp) > 0) {
      $page = $temp;
    }
    // Kill any external links, pics & anchors.
    if (!(strlen(strstr($page, 'http://')) > 0
      || strlen(strstr($page, 'www.')) > 0
      || strlen(strstr($page, 'mailto:')) > 0
      || strlen(strstr($page, '.jpg')) > 0
      || strlen(strstr($page, '.JPG')) > 0
      || strlen(strstr($page, '.gif')) > 0
      || strlen(strstr($page, '.png')) > 0
      || strlen(strstr($page, '#')) > 0
      || strlen(strstr($page, '.xml')) > 0
      )) {
      // Store the URL in the array as an absolute URL.
      $temp_list[] = "http://" . $url_prefix . $url_base . $page;
    }
  }
  // Round 2 of link checking: keep only URLs on this site with a single scheme.
  $matches = array();
  foreach ($temp_list as $page) {
    if (!(strstr($page, 'http://' . $url_prefix . $url_base) === FALSE || substr_count($page, '://') > 1)) {
      // Collapse a doubled slash after the base.
      $page = str_replace($url_base . "/", $url_base, $page);
      $matches[] = $page;
    }
  }
  // Remove duplicate links.
  return array_unique($matches);
}
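
// A walk-through of the filtering above with the default settings
// ($url_prefix = 'www.', $url_base = 'example.com/') and illustrative hrefs:
//
//   'about'                      -> http://www.example.com/about   (kept)
//   'http://example.com/contact' -> http://www.example.com/contact (base stripped, kept)
//   'http://other.org/'          -> dropped (external link)
//   'logo.png'                   -> dropped (image)
//   'faq#top'                    -> dropped (fragment)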