/dev/null &"; //linux only http://www.sitecrafting.com/blog/to-run-php-code-in/ $command = $system_php . $script_location . $async_call; //Runline /** * Crawler Variables */ $url_list = array("http://".$base_url); //"prime the pump" $url_done = array(); $url_count = array('count' => 0, 'time' => microtime(true), 'iterations' => 0, 'max' => 0, 'max_name' => ''); //timer for total run $t_after = 0; $t_total = 0; /** * File Variables */ $url_crawler_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "temp_junk_crawler.txt"; $log_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "log.txt"; /*** Start To Do Stuff! ***/ //access control if (isset($_SERVER['REMOTE_ADDR']) && isset($_SERVER['SERVER_ADDR'])) { //remote & local IP is set if (($_SERVER['REMOTE_ADDR'] == $_SERVER['SERVER_ADDR']) || ($bypass_ip == $_SERVER['REMOTE_ADDR'])) { //run crawler, script called from self or white listed ip ReadFileIntoArrays($url_list, $url_done, $url_count, $url_crawler_filename); //Read Previous Run In //Output Some Info ob_start(); $end = microtime(true); echo '
';
//Report where saved state was loaded from and how long the whole crawl
//(across all script iterations) has been running.
echo "Read Data Files From: {$url_crawler_filename}\n";
echo 'Total Time: ' . round($end - $url_count['time'], 2) . " Seconds. URL's in Queue: " . count($url_list) . "\n\n";
ob_flush(); //push the progress output to the client right away
//Wall-clock start of THIS process; compared against $php_timeout below.
$start = microtime(true);
//Do The List
//Run though every value in array; works with a growing array
//Work through every URL in $url_list. The list can GROW while we iterate,
//because each crawled page appends its newly discovered links.
//FIX: the original used each(), which was deprecated in PHP 7.2 and removed
//in PHP 8.0. An index-based scan is equivalent here: array_unique() always
//keeps the FIRST occurrence, so already-visited positions in $url_list stay
//stable across the merge/unique below, and new links only ever append.
$queue_pos = 0;
while ($queue_pos < count($url_list)) {
	//reindex: array_unique() can leave gaps in the numeric keys
	$queue = array_values($url_list);
	$value = $queue[$queue_pos];
	$queue_pos++;
	//URL has not been crawled (possibly in a previous iteration of this script)
	if (in_array($value, $url_done, true) == FALSE) {
		//Crawl the URL and fold its outbound links into the queue
		$url_list = array_unique(array_merge($url_list, GetLinksFromURL($value, $base_url)));
		$url_done[] = $value; //add crawled url to done list
		$url_done = array_unique($url_done); //remove dups from crawled list
		//timers
		$beg = $end;
		$end = microtime(true); //stop clock
		$dat = getrusage();
		//NOTE(review): ru_utime.tv_usec is only the microsecond COMPONENT of
		//user CPU time (wraps at 1s), so $t_total is a rough proxy at best.
		//Left as-is to preserve the tuned $system_timeout behaviour.
		$t_after = $dat['ru_utime.tv_usec'];
		$t_total += $t_after;
		//Progress line: count, total secs, per-page secs, cpu proxy, url
		echo $url_count['count'] . ", ";
		echo number_format($end - $start, 2, '.', ''). ", ";
		echo number_format($end - $beg, 3, '.', ''). ", ";
		echo number_format($t_total/1000000.0, 2, '.', '') . " seconds, \t";
		echo "Crawling " .urldecode(urldecode($value)). "\n";
		ob_flush(); //Flush Buffers
		$url_count['count']++; //Increment URL Crawled Counter
		//Track the slowest page seen so far (>= keeps the tie behaviour of
		//the original max()/== pair: the latest equal time wins the name)
		if (($end - $beg) >= $url_count['max']) {
			$url_count['max'] = $end - $beg;
			$url_count['max_name'] = urldecode(urldecode($value));
		}
		set_time_limit(0); //reset any time limit on php script
		usleep(100000); //politeness delay: wait 0.1s between requests
	}
	//Time budget exceeded: persist state, relaunch self asynchronously, exit.
	if ((round($t_total/1000000.0, 4) >= $system_timeout) || (round($end - $start,2) >= $php_timeout)) {
		$url_count['iterations']++;
		StoreArrays($url_list, $url_done, $url_count, $url_crawler_filename); //Write State To Files
		sleep(1); //give the filesystem a beat before the child reads the state
		CallSelfSystem($command); //Call Self
		ob_flush(); //Flush Buffers
		ob_end_flush(); //End Output
		exit(); //Kill This Process
	}
}
//While loop done; crawling of the site is complete.
//Print summary stats: done count, queue size, crawl counter.
echo count($url_done) .", ". count($url_list) .", ". $url_count['count']. "\n";
echo "Total Time: " . round(($end - $url_count['time'])/60.0,2) . " Minutes\n";
echo '';
ob_flush(); //flush buffered output to the client
//Append a single tab-separated summary line to the log file.
$log_parts = array(
	'Crawled:' .count($url_done). '/' .$url_count['count']. '/' .count($url_list),
	'Total Time: ' .round(($end - $url_count['time'])/60.0,2). ' Minutes',
	'Script Iterations: ' .$url_count['iterations'],
	'Slowest Page: ' .number_format($url_count['max'], 2, '.', ''). " Seconds " .$url_count['max_name']. "\n",
);
$txt = '[' .date(DATE_RFC822). ']' . "\t" . implode("\t", $log_parts);
file_put_contents($log_filename, $txt, FILE_APPEND);
@unlink($url_crawler_filename); //crawl finished: resume file no longer needed
ob_end_flush(); //End Output
exit(); //End Of Program
}
else {
	//Request came from an ordinary browser / non-whitelisted IP: refuse.
	echo "CRON Script can only be run via system";
	exit();
}
}
else {
	//REMOTE_ADDR not set: invoked locally (CLI/cron); bootstrap via HTTP.
	FirstRun($script_url);
}
exit();
/**
* List of functions below
**/
//Web Crawler & Link Get
//Fetch a page and return the internal links found on it, normalised to
//absolute "http://<base_url>..." form. External links, www./mailto: links,
//images (.jpg/.JPG/.gif/.png), fragment URLs (#) and .xml files are
//discarded; duplicates are removed.
//@param string $url      page to fetch (any wrapper file_get_contents accepts)
//@param string $base_url site host the crawler is bound to
//@return array unique list of internal URLs (empty array when fetch fails)
function GetLinksFromURL($url, $base_url) {
	//download page; file_get_contents() returns FALSE on failure
	$html = file_get_contents($url);
	if ($html === FALSE) {
		return array(); //unreachable page yields no links instead of warnings
	}
	//extract all <a ... href="..."> targets
	preg_match_all("/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $html, $matches);
	$hrefs = $matches[1];
	$temp_list = array();
	foreach ($hrefs as $href) {
		//strip scheme+host from links that contain $base_url, keeping the path
		//(strstr() returns FALSE when absent -> $path becomes '' -> keep $href)
		$path = str_replace($base_url, "", strstr($href, $base_url));
		if (strlen($path) > 0) {
			$href = $path;
		}
		//drop external links, pictures, mail links, anchors and xml feeds
		if (strpos($href, 'http://') !== FALSE) { continue; }
		if (strpos($href, 'www.') !== FALSE) { continue; }
		if (strpos($href, 'mailto:') !== FALSE) { continue; }
		if (strpos($href, '.jpg') !== FALSE) { continue; }
		if (strpos($href, '.JPG') !== FALSE) { continue; }
		if (strpos($href, '.gif') !== FALSE) { continue; }
		if (strpos($href, '.png') !== FALSE) { continue; }
		if (strpos($href, '#') !== FALSE) { continue; }
		if (strpos($href, '.xml') !== FALSE) { continue; }
		//rebuild as an absolute URL rooted at our site
		$temp_list[] = "http://" . $base_url . $href;
	}
	//Round 2 of link checking: sanity-check and normalise the rebuilt URLs.
	$links = array();
	foreach ($temp_list as $candidate) {
		//skip anything not rooted at our site.
		//FIX: the original tested strlen(strstr(...)) < 0, which is always
		//false (strlen never returns a negative); this is the evident intent
		//and cannot change output since every entry above starts with it.
		if (strpos($candidate, 'http://' . $base_url) === FALSE) { continue; }
		//skip malformed URLs carrying more than one scheme separator
		if (substr_count($candidate, '://') > 1) { continue; }
		//collapse "<base>/" back to "<base>" (de-doubles slashes when
		//$base_url carries a trailing slash -- presumably it does; verify)
		$links[] = str_replace($base_url . "/", $base_url, $candidate);
	}
	//remove duplicate links
	return array_unique($links);
}
//Call Self via exec Call
//Relaunch the crawler by exec()'ing the prepared shell command.
//The command is expected to background itself ("... > /dev/null &") so that
//exec() returns immediately; the elapsed time printed below tells the reader
//whether the asynchronous hand-off actually worked.
function CallSelfSystem($command) {
	$captured = array();
	$before = microtime(true);
	exec($command, $captured);
	$elapsed = microtime(true) - $before;
	echo "\n" . $command . "\nTime to Run Command " . round($elapsed, 4) . " seconds. Low number (0.01) means async execution worked.";
}
//Call Self via URL
//Bootstrap helper for the first (local/CLI) invocation: announce the URL,
//fetch it over HTTP so the web server starts the crawler, echo whatever the
//server replied, then terminate this jump-start process.
function FirstRun($script_url) {
	echo "Running " . $script_url . "\n";
	$response = file_get_contents($script_url);
	echo $response;
	exit(); //end of jumpstart script
}
//File Functions
//Persist crawler state (queue, done list, counters) to the resume file as a
//single serialized 3-element array; ReadFileIntoArrays() restores it.
//Echoes the byte count written for the progress log.
function StoreArrays ($url_list, $url_done, $url_count, $url_crawler_filename) {
	$payload = serialize(array($url_list, $url_done, $url_count));
	$bytes = file_put_contents($url_crawler_filename, $payload);
	echo "\nwrote " . $bytes . " bytes to \t" . $url_crawler_filename . "\n";
}
//Restore crawler state previously written by StoreArrays().
//Loads [$url_list, $url_done, $url_count] from $url_crawler_filename into the
//by-reference arguments. On a missing, unreadable or corrupted file the
//arguments are left at their caller-supplied defaults so the crawl starts
//fresh (the original would clobber them with NULL via array_shift(FALSE)).
//NOTE(review): the file is produced locally by StoreArrays(); unserialize()
//must never be pointed at untrusted data.
function ReadFileIntoArrays(&$url_list, &$url_done, &$url_count, $url_crawler_filename) {
	if (!file_exists($url_crawler_filename)) {
		return; //no previous run: keep defaults
	}
	$raw = file_get_contents($url_crawler_filename);
	if ($raw === FALSE) {
		return; //unreadable file: keep defaults
	}
	$temp_array = @unserialize($raw);
	//reject corrupted payloads instead of shifting off FALSE
	if (!is_array($temp_array) || count($temp_array) < 3) {
		return;
	}
	$url_list = array_shift($temp_array);
	$url_done = array_shift($temp_array);
	$url_count = array_shift($temp_array);
}