/dev/null &"; //linux only http://www.sitecrafting.com/blog/to-run-php-code-in/ $command = $system_php . $script_location . $async_call; //Runline /** * Crawler Variables */ $url_list = array("http://".$base_url); //"prime the pump" $url_done = array(); $url_count = array('count' => 0, 'time' => my_microtime()); //timer for total run /** * File Variables */ $url_done_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "temp_junk__url_done.txt"; $url_list_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir ."temp_junk__url_list.txt"; $url_count_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "temp_junk__url_count.txt"; $log_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "log.txt"; //Make script more robust ignore_user_abort(1); // run script in background set_time_limit(0); // run script forever //access control if (isset($_SERVER['REMOTE_ADDR']) && isset($_SERVER['SERVER_ADDR'])) { //remote & local IP is set if (($_SERVER['REMOTE_ADDR'] == $_SERVER['SERVER_ADDR']) || ($bypass_ip == $_SERVER['REMOTE_ADDR'])) { //run crawler, script called from self or white listed ip /*** Start To Do Stuff! ***/ ReadFilesIntoArrays($url_list,$url_list_filename, $url_done,$url_done_filename, $url_count, $url_count_filename); //Read Previous Run In //Output Some Info ob_start(); $end = my_microtime(); echo '
';
    //Report where the saved state lives and how much work is queued
    echo 'Read Data Files From: ' . $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "\n";
    echo 'Total Time: ' . round($end - $url_count['time'],2). " Seconds. URL's in Queue: " .count($url_list). "\n";
    tolet(); //Flush Buffers


    $start = my_microtime(); //start timer for this run
    //Do The List
    //Walk the queue with the array's internal pointer. each() was deprecated
    //in PHP 7.2 and removed in PHP 8.0, so the "read the current element,
    //then advance" step is spelled out with key()/current()/next() instead.
    reset($url_list);
    while (key($url_list) !== null) {
      $value = current($url_list);
      next($url_list);

      //URL has not been crawled
      if (!in_array($value, $url_done, true)) {
        $temp_list = GetLinksFromURL($value, $base_url); //return array of links from url
        $url_list = KillDuplicates(array_merge($url_list, $temp_list)); //merge in new links, remove dups
        //The queue array was rebuilt, so scanning restarts at its first entry;
        //URLs that are already crawled are skipped by the in_array() check above.
        reset($url_list);
        $url_done[] = $value; //add crawled url to done list
        $url_done = KillDuplicates($url_done); //remove dups from crawled list
        $end = my_microtime(); //stop clock

        //Output Info
        echo $url_count['count'] . ", ";
        echo round($end - $start,2) . " seconds, \t";
        echo "Crawling " .urldecode(urldecode($value)). "\n";

        tolet(); //Flush Buffers
        $url_count['count']++; //Increment URL Crawled Counter
        usleep(100000); // wait for 0.1 seconds
      }

      //Timer has gone over the limit; end this run and restart.
      //$end is only refreshed after a crawl, so skipping finished URLs does not move it.
      if (round($end - $start,2) > $php_timeout) {
        StoreArrays($url_list,$url_list_filename, $url_done,$url_done_filename, $url_count,$url_count_filename); //Write State To Files
        Sleep(1); //Wait One Second

        CallSelfSystem($command); //Call Self

        tolet(); //Flush Buffers
        ob_end_flush(); //End Output
        exit(); //Kill This Process
      }
    }
    //While Loop Done; Crawling of site is Complete
    DelTempFiles($url_list_filename, $url_done_filename, $url_count_filename); //Indexing Complete, Kill Temp Files

    //Output Interesting Stats: crawled URLs, queue size, crawl counter
    echo count($url_done) .", ". count($url_list) .", ". $url_count['count']. "\n";
    echo "Total Time: " . round(($end - $url_count['time'])/60.0,2) . " Minutes\n";
    echo '
';
    tolet(); //Flush Buffers

    //Write To Log: timestamp, crawled/counter/queue sizes and total run time
    $txt = '';
    $txt .= '[' .date(DATE_RFC822). ']';
    $txt .= "\t".'Crawled:' .count($url_done). '/' .$url_count['count']. '/' .count($url_list);
    $txt .= "\t".'Total Time: ' .round(($end - $url_count['time'])/60.0,2). ' Minutes'."\n";
    WriteToLog($log_filename, $txt);

    ob_end_flush(); //End Output
    exit(); //End Of Program
  }
  else {
    //exit, script was called from user browser
    echo "CRON Script can only be run via system";
    exit();
  }
}
else {
  //remote IP not set, called local
  FirstRun($script_url);
}

/**
 * List of functions below
 **/

/**
 * Web Crawler & Link Get: download one page and return the on-site links found in it.
 *
 * Every href captured by the anchor regex has $base_url stripped (so absolute
 * on-site links become relative), is dropped when it contains one of the
 * skip markers (external links, mail links, images, fragments, xml), and is
 * otherwise rebuilt as "http://" . $base_url . <href>.
 *
 * NOTE(review): the final str_replace collapses "$base_url/" to "$base_url",
 * which looks like it assumes $base_url ends with a slash - confirm against
 * the configuration.
 *
 * @param string $url      Page to download.
 * @param string $base_url Host (and path prefix) of the site being crawled, without scheme.
 * @return array Unique absolute URLs; empty when the download fails or nothing matches.
 */
function GetLinksFromURL($url,$base_url) {
  //download file; a failed download simply yields no links
  $html = file_get_contents($url);
  if ($html === false) {
    return array();
  }

  //get all links
  $found = array();
  preg_match_all("/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $html, $found);
  //group 1 holds the href values; it is missing when the regex engine fails
  $hrefs = isset($found[1]) ? $found[1] : array();

  //kill any external links & pics: an href containing any of these is skipped
  $skip_markers = array('http://', 'www.', 'mailto:', '.jpg', '.JPG', '.gif', '.png', '#', '.xml');

  //process links
  $temp_list = array();
  foreach ($hrefs as $href) {
    //kill http://www.yoursite.com/ ($base_url)
    $local = str_replace($base_url, "", strstr($href, $base_url));
    if (strlen($local) > 0) {
      $href = $local;
    }

    $skip = false;
    foreach ($skip_markers as $marker) {
      if (strpos($href, $marker) !== false) {
        $skip = true;
        break;
      }
    }

    //store url in array
    if (!$skip) {
      $temp_list[] = "http://".$base_url.$href;
    }
  }

  //Round 2 of link checking
  $links = array();
  foreach ($temp_list as $link) {
    //must point at this site (the old strlen(...) < 0 test could never be true)
    if (strpos($link, 'http://'.$base_url) === false) {
      continue;
    }
    //more than one scheme separator means another URL is embedded in the link
    if (substr_count($link, '://') > 1) {
      continue;
    }
    $links[] = str_replace($base_url."/", $base_url, $link);
  }

  //remove duplicate links
  return KillDuplicates($links);
}

/**
 * Destroys duplicate entries in array.
 *
 * @param array $array Input values.
 * @return array The input with later duplicates removed; array_unique() keeps the original keys.
 */
function KillDuplicates($array) {
  return array_unique($array);
}

//System
/**
 * System Timer: the current Unix time in seconds, rounded.
 *
 * @param int $precision Number of decimal places to keep (default 4).
 * @return float Value of microtime(true) rounded to $precision places.
 */
function my_microtime($precision = 4) {
  return round(microtime(true),$precision);
}

/**
 * Clear Buffers: push the contents of the active output buffer downstream.
 *
 * Does nothing when no output buffer is active, instead of raising the
 * "no buffer to flush" notice.
 */
function tolet() {
  if (ob_get_level() > 0) {
    ob_flush();
  }
}

/**
 * Call Self via browser.
 *
 * Both echoed strings are empty in this copy of the file, so nothing is
 * output. NOTE(review): presumably the original markup was lost - confirm.
 */
function CallSelfBrowser() {
  echo '';
  echo "";
}

/**
 * Call Self via exec Call: run $command and print how long the call took.
 *
 * @param string $command Complete shell command line to execute.
 */
function CallSelfSystem($command) {
  $dump = array(); //receives the command's output lines; not used afterwards
  $start = my_microtime();
  exec($command, $dump);
  $end = my_microtime();

  echo "\n" . $command;
  echo "\nTime to Run Command " . round($end - $start,4) . " seconds. Low number (0.01) means async execution worked.";
  echo '';
}

/**
 * Call Self via URL: fetch $script_url, echo the response, then end the process.
 *
 * @param string $script_url URL of this script.
 */
function FirstRun($script_url) {
  //Output Info
  echo "Running " .$script_url. "\n";
  echo file_get_contents($script_url);
  //end of jumpstart script
  exit();
}

//File Functions

/**
 * Write the crawler state (queue, done list, counters) to its three files.
 *
 * Each value is serialize()d and replaces the previous file contents. All
 * three writes are attempted even when an earlier one fails.
 *
 * @return bool True when every file was written, false otherwise.
 */
function StoreArrays ($url_list,$url_list_filename, $url_done,$url_done_filename, $url_count,$url_count_filename) {
  $targets = array(
    array($url_list_filename, $url_list),
    array($url_done_filename, $url_done),
    array($url_count_filename, $url_count),
  );

  $all_written = true;
  foreach ($targets as $target) {
    if (file_put_contents($target[0], serialize($target[1])) === false) {
      $all_written = false;
    }
  }
  return $all_written;
}

/**
 * Read one serialized array from disk.
 *
 * NOTE(review): unserialize() is only safe on trusted data; these state
 * files are written by StoreArrays(), but make sure nothing else can write
 * to them.
 *
 * @param string $filename File to read.
 * @param array  $default  Returned when the file is missing, unreadable,
 *                         empty, or does not contain a serialized array.
 * @return array
 */
function ReadSerializedArray($filename, $default) {
  if (!is_file($filename)) {
    return $default;
  }
  $raw = file_get_contents($filename);
  if ($raw === false || $raw === '') {
    return $default;
  }
  $data = unserialize($raw);
  return is_array($data) ? $data : $default;
}

/**
 * Load the crawler state saved by StoreArrays() into the by-reference arguments.
 *
 * An argument keeps the value it had on entry when its file is missing,
 * empty or unusable, so a damaged state file can no longer replace an array
 * with false.
 */
function ReadFilesIntoArrays(&$url_list,$url_list_filename, &$url_done,$url_done_filename, &$url_count,$url_count_filename) {
  $url_list = ReadSerializedArray($url_list_filename, $url_list);
  $url_done = ReadSerializedArray($url_done_filename, $url_done);
  $url_count = ReadSerializedArray($url_count_filename, $url_count);
}

/**
 * Delete the three state files; files that do not exist are skipped.
 */
function DelTempFiles($url_list_filename, $url_done_filename, $url_count_filename) {
  foreach (array($url_list_filename, $url_done_filename, $url_count_filename) as $filename) {
    if (file_exists($filename)) {
      unlink($filename);
    }
  }
}

/**
 * Append $txt to the log file, creating the file when needed.
 *
 * @param string $log_filename Path of the log file.
 * @param string $txt          Text to append, written as-is.
 * @return bool True when the text was written, false otherwise.
 */
function WriteToLog($log_filename, $txt) {
  return file_put_contents($log_filename, $txt, FILE_APPEND) !== false;
}