/dev/null &"; //linux only http://www.sitecrafting.com/blog/to-run-php-code-in/ $command = $system_php . $script_location . $async_call; //Runline /** * Crawler Variables */ $url_list = array("http://".$base_url); //"prime the pump" $url_done = array(); $url_count = array('count' => 0, 'time' => my_microtime(), 'iterations' => 0, 'max' => 0, 'max_name' => ''); //timer for total run $t_after = 0; $t_total = 0; /** * File Variables */ $url_done_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "temp_junk__url_done.txt"; $url_list_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir ."temp_junk__url_list.txt"; $url_count_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "temp_junk__url_count.txt"; $log_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "log.txt"; //access control if (isset($_SERVER['REMOTE_ADDR']) && isset($_SERVER['SERVER_ADDR'])) { //remote & local IP is set if (($_SERVER['REMOTE_ADDR'] == $_SERVER['SERVER_ADDR']) || ($bypass_ip == $_SERVER['REMOTE_ADDR'])) { //run crawler, script called from self or white listed ip /*** Start To Do Stuff! ***/ ReadFilesIntoArrays($url_list,$url_list_filename, $url_done,$url_done_filename, $url_count, $url_count_filename); //Read Previous Run In //Output Some Info ob_start(); $end = my_microtime(); echo '
';
echo 'Read Data Files From: ' . $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "\n";
echo 'Total Time: ' . round($end - $url_count['time'],2). " Seconds. URL's in Queue: " .count($url_list). "\n\n";
tolet(); //Flush Buffers
$start = my_microtime(); //start timer for this run
//Do The List
//Run through every value in array; works with a growing array
while (list(, $value) = each($url_list)) {
//URL has not been crawled
if (in_array($value, $url_done) == FALSE) {
$temp_list = GetLinksFromURL($value, $base_url); //return array of links from url
$tempA = array_merge($url_list, $temp_list); //merge arrays
unset($url_list); //clear old array
$url_list = KillDuplicates($tempA); //remove dups in array
$url_done[] = $value; //add crawled url to done list
$url_done = KillDuplicates($url_done); //remove dups from crawled list
$beg = $end;
$end = my_microtime(); //stop clock
$dat = getrusage(); //stop clock
$t_after = $dat['ru_utime.tv_usec'];
$t_total += $t_after;
//Output Info
echo $url_count['count'] . ", ";
echo number_format($end - $start, 2, '.', ''). ", ";
echo number_format($end - $beg, 3, '.', ''). ", ";
echo number_format($t_total/1000000.0, 2, '.', '') . " seconds, \t";
echo "Crawling " .urldecode(urldecode($value)). "\n";
tolet(); //Flush Buffers
$url_count['count']++; //Increment URL Crawled Counter
$url_count['max'] = max($url_count['max'], $end - $beg);
if ($url_count['max'] == ($end-$beg)) {
$url_count['max_name'] = urldecode(urldecode($value));
}
set_time_limit(0); //reset any time limit on php script
usleep(100000); // wait for 0.1 seconds
}
//Timer has gone over the limit; end this run and restart
if ((round($t_total/1000000.0, 4) >= $system_timeout) || (round($end - $start,2) >= $php_timeout)) {
$url_count['iterations']++;
StoreArrays($url_list,$url_list_filename, $url_done,$url_done_filename, $url_count,$url_count_filename); //Write State To Files
Sleep(1); //Wait One Second
CallSelfSystem($command); //Call Self
tolet(); //Flush Buffers
ob_end_flush(); //End Output
exit(); //Kill This Process
}
}
//While Loop Done; Crawling of site is Complete
//Output Interesting Stats
echo count($url_done) .", ". count($url_list) .", ". $url_count['count']. "\n";
echo "Total Time: " . round(($end - $url_count['time'])/60.0,2) . " Minutes\n";
echo '';
tolet();//Flush Buffers
//Write To Log
$txt = '';
$txt .= '[' .date(DATE_RFC822). ']';
$txt .= "\t".'Crawled:' .count($url_done). '/' .$url_count['count']. '/' .count($url_list);
$txt .= "\t".'Total Time: ' .round(($end - $url_count['time'])/60.0,2). ' Minutes';
$txt .= "\t".'Script Iterations: ' .$url_count['iterations'];
$txt .= "\t".'Slowest Page: ' .number_format($url_count['max'], 2, '.', ''). " Seconds " .$url_count['max_name']. "\n";
file_put_contents($log_filename, $txt, FILE_APPEND);
DeleteTempFiles($url_list_filename, $url_done_filename, $url_count_filename); //Indexing Complete, Kill Temp Files
ob_end_flush(); //End Output
exit(); //End Of Program
}
else {
//exit, script was called from user browser
echo "CRON Script can only be run via system";
exit();
}
}
else {
//remote IP not set, called local
FirstRun($script_url);
}
/**
* List of functions below
**/
/**
 * Web Crawler & Link Get.
 *
 * Downloads $url, pulls the href value out of every <a ...>...</a> element and
 * returns the links that stay inside the crawled site as absolute URLs of the
 * form "http://" . $base_url . <path>.
 *
 * @param string $url      Page to download; anything file_get_contents() can open.
 * @param string $base_url Site host without the scheme. NOTE(review): the slash
 *                         handling below looks like it expects a trailing "/"
 *                         (e.g. "www.yoursite.com/") - confirm against the config.
 * @return array Unique absolute URLs. Keys are whatever KillDuplicates() leaves
 *               behind (not re-indexed). Empty when the download fails or the
 *               page has no usable links.
 */
function GetLinksFromURL($url,$base_url) {
    //download file; a failed download simply contributes no links
    $html = file_get_contents($url);
    if ($html === false) {
        return array();
    }

    //get all links
    preg_match_all("/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $html, $found);
    if (empty($found[1])) {
        return array();
    }

    //substrings that mark a link as external, non-page content or an in-page anchor;
    //matched case-insensitively so ".PNG" / ".GIF" are skipped just like ".JPG"
    $skip_markers = array('http://', 'www.', 'mailto:', '.jpg', '.gif', '.png', '#', '.xml');

    $links = array();
    foreach ($found[1] as $href) {
        //kill http://www.yoursite.com/ ($base_url): keep only what follows the base
        $tail = strstr($href, $base_url);
        if ($tail !== false) {
            $local = str_replace($base_url, "", $tail);
            if (strlen($local) > 0) {
                $href = $local;
            }
        }

        //kill any external links & pics
        $skip = false;
        foreach ($skip_markers as $marker) {
            if (stripos($href, $marker) !== false) {
                $skip = true;
                break;
            }
        }
        if ($skip) {
            continue;
        }

        $candidate = "http://".$base_url.$href;

        //a second "://" means an absolute URL with another scheme (e.g. an
        //external https:// link) got glued behind our own prefix; drop it
        if (substr_count($candidate, '://') > 1) {
            continue;
        }

        //collapse the "//" that appears when both $base_url and the link carry a slash
        $links[] = str_replace($base_url."/", $base_url, $candidate);
    }

    //remove duplicate links
    return KillDuplicates($links);
}
/**
 * Destroys duplicate entries in an array.
 *
 * Thin wrapper around array_unique(): the first occurrence of each value is
 * kept together with its original key, so the result may have gaps in its keys.
 *
 * @param array $array Values to de-duplicate.
 * @return array The input without repeated values.
 */
function KillDuplicates($array) {
    $without_repeats = array_unique($array);
    return $without_repeats;
}
/**
 * System Timer.
 *
 * @param int $precision Number of decimal places to keep (default 4).
 * @return float Current Unix timestamp in seconds, as reported by
 *               microtime(true), rounded to $precision decimals.
 */
function my_microtime($precision = 4) {
    $now = microtime(true);
    return round($now, $precision);
}
/**
 * Clear Buffers.
 *
 * Flushes the active output buffer with ob_flush(). When no output buffer is
 * active the call is skipped, because ob_flush() raises a notice in that case.
 */
function tolet() {
    if (ob_get_level() > 0) {
        ob_flush();
    }
}
/**
 * Call Self via exec Call.
 *
 * Runs $command through exec(), then echoes the command line and how long the
 * exec() call took. Output lines produced by the command are collected into a
 * local array and discarded.
 *
 * @param string $command Complete shell command line to execute.
 */
function CallSelfSystem($command) {
    $output_lines = array();

    //time only the exec() call itself
    $launched_at = my_microtime();
    exec($command, $output_lines);
    $elapsed = round(my_microtime() - $launched_at, 4);

    echo "\n" . $command;
    echo "\nTime to Run Command " . $elapsed . " seconds. Low number (0.01) means async execution worked.";
}
/**
 * Call Self via URL.
 *
 * Announces the script URL, fetches it with file_get_contents(), echoes
 * whatever came back and then terminates the current process.
 *
 * @param string $script_url URL of the script to request.
 */
function FirstRun($script_url) {
    //Output Info
    echo "Running " . $script_url . "\n";

    $response = file_get_contents($script_url);
    echo $response;

    //end of jumpstart script
    exit();
}
//File Functions
/**
 * Writes the crawler state to disk.
 *
 * Each array is serialize()d into its file, in the order list, done, count,
 * and one "wrote N bytes to <file>" line is echoed per file. The first line
 * is preceded by an extra newline.
 *
 * @param array  $url_list           URLs queued for crawling.
 * @param string $url_list_filename  File receiving $url_list.
 * @param array  $url_done           URLs already crawled.
 * @param string $url_done_filename  File receiving $url_done.
 * @param array  $url_count          Run counters.
 * @param string $url_count_filename File receiving $url_count.
 */
function StoreArrays ($url_list,$url_list_filename, $url_done,$url_done_filename, $url_count,$url_count_filename) {
    $targets = array(
        array($url_list_filename, $url_list),
        array($url_done_filename, $url_done),
        array($url_count_filename, $url_count),
    );

    $lead = "\n"; //only the first report line starts with a blank line
    foreach ($targets as $target) {
        $bytes = file_put_contents($target[0], serialize($target[1]));
        echo $lead . "wrote " . $bytes . " bytes to \t" . $target[0] . "\n";
        $lead = "";
    }
}
/**
 * Restores the crawler state written by a previous run.
 *
 * For each of the three state files: when the file exists and holds a
 * serialized array, that array replaces the caller's variable. When the file
 * is missing, unreadable, empty, corrupt or holds something other than an
 * array, the caller's current value is left untouched, so a damaged state file
 * can no longer turn a working array into FALSE.
 *
 * @param array  $url_list           (by reference) URLs queued for crawling.
 * @param string $url_list_filename  File holding the serialized queue.
 * @param array  $url_done           (by reference) URLs already crawled.
 * @param string $url_done_filename  File holding the serialized done list.
 * @param array  $url_count          (by reference) Run counters.
 * @param string $url_count_filename File holding the serialized counters.
 */
function ReadFilesIntoArrays(&$url_list,$url_list_filename, &$url_done,$url_done_filename, &$url_count,$url_count_filename) {
    $url_list = ReadSerializedArrayFile($url_list_filename, $url_list);
    $url_done = ReadSerializedArrayFile($url_done_filename, $url_done);
    $url_count = ReadSerializedArrayFile($url_count_filename, $url_count);
}

//Returns the array serialized in $filename, or $fallback when the file does not yield an array
function ReadSerializedArrayFile($filename, $fallback) {
    if (!file_exists($filename)) {
        return $fallback;
    }

    $raw = file_get_contents($filename);
    if ($raw === false || $raw === '') {
        return $fallback;
    }

    //NOTE(review): unserialize() can instantiate arbitrary objects; these files
    //must only ever be written by this script. Consider restricting classes
    //(PHP 7+) or switching to JSON if anything else can write to them.
    $data = unserialize($raw);

    return is_array($data) ? $data : $fallback;
}
/**
 * Removes the three temporary state files.
 *
 * Deletion is best effort: unlink() warnings (for example when a file is
 * already gone) are deliberately suppressed.
 *
 * @param string $url_list_filename  Queue state file.
 * @param string $url_done_filename  Done-list state file.
 * @param string $url_count_filename Counter state file.
 */
function DeleteTempFiles($url_list_filename, $url_done_filename, $url_count_filename) {
    $stale_files = array($url_list_filename, $url_done_filename, $url_count_filename);
    foreach ($stale_files as $stale_file) {
        @unlink($stale_file);
    }
}