/dev/null &"; //linux only http://www.sitecrafting.com/blog/to-run-php-code-in/ $command = $system_php . $script_location . $async_call; //Runline /** * Crawler Variables */ $url_list = array("http://".$base_url); //"prime the pump" $url_done = array(); $url_count = array('count' => 0, 'time' => my_microtime(), 'iterations' => 0, 'max' => 0, 'max_name' => ''); //timer for total run $t_after = 0; $t_total = 0; /** * File Variables */ $url_done_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "temp_junk__url_done.txt"; $url_list_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir ."temp_junk__url_list.txt"; $url_count_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "temp_junk__url_count.txt"; $log_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "log.txt"; //access control if (isset($_SERVER['REMOTE_ADDR']) && isset($_SERVER['SERVER_ADDR'])) { //remote & local IP is set if (($_SERVER['REMOTE_ADDR'] == $_SERVER['SERVER_ADDR']) || ($bypass_ip == $_SERVER['REMOTE_ADDR'])) { //run crawler, script called from self or white listed ip /*** Start To Do Stuff! ***/ ReadFilesIntoArrays($url_list,$url_list_filename, $url_done,$url_done_filename, $url_count, $url_count_filename); //Read Previous Run In //Output Some Info ob_start(); $end = my_microtime(); echo '
';
    echo 'Read Data Files From: ' . $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "\n";
    echo 'Total Time: ' . round($end - $url_count['time'],2). " Seconds. URL's in Queue: " .count($url_list). "\n\n";
    tolet(); //Flush Buffers


    $start = my_microtime(); //start timer for this run

    //Do The List
    //Walk the queue through the array's internal pointer (key/current/next)
    //instead of each(), which was deprecated in PHP 7.2 and removed in 8.0.
    //Every crawl replaces $url_list with a newly built array whose pointer
    //sits on its first element, so the scan restarts from the top and relies
    //on $url_done to skip pages already fetched; that is what lets this loop
    //work with a growing array.
    reset($url_list);
    while (key($url_list) !== null) {
      $value = current($url_list);
      next($url_list);

      //URL has not been crawled
      if (!in_array($value, $url_done, true)) {
        $temp_list = GetLinksFromURL($value, $base_url); //return array of links from url
        $url_list = KillDuplicates(array_merge($url_list, $temp_list)); //add new links to the queue, remove dups
        $url_done[] = $value; //add crawled url to done list
        $url_done = KillDuplicates($url_done); //remove dups from crawled list
        $beg = $end;
        $end = my_microtime(); //stop clock

        //getrusage() reports this process's cumulative user CPU time split
        //into whole seconds (tv_sec) and a microsecond remainder (tv_usec).
        //Combine both parts; the reading is already a running total, so it
        //replaces $t_total rather than being added to it.
        $dat = getrusage();
        $t_after = $dat['ru_utime.tv_sec'] * 1000000 + $dat['ru_utime.tv_usec'];
        $t_total = $t_after;

        //Output Info: pages crawled, seconds in this run, seconds for this
        //page, user CPU seconds, then the URL itself
        echo $url_count['count'] . ", ";
        echo number_format($end - $start, 2, '.', ''). ", ";
        echo number_format($end - $beg, 3, '.', ''). ", ";
        echo number_format($t_total/1000000.0, 2, '.', '') . " seconds, \t";
        echo "Crawling " .urldecode(urldecode($value)). "\n";

        tolet(); //Flush Buffers
        $url_count['count']++; //Increment URL Crawled Counter

        //Remember the slowest page seen so far (reported in the log)
        $url_count['max'] = max($url_count['max'], $end - $beg);
        if ($url_count['max'] == ($end-$beg)) {
          $url_count['max_name'] = urldecode(urldecode($value));
        }
        set_time_limit(0); //reset any time limit on php script
        usleep(100000); // wait for 0.1 seconds
      }

      //Timer has gone over the limit; end this run and restart
      if ((round($t_total/1000000.0, 4) >= $system_timeout) || (round($end - $start,2) >= $php_timeout)) {
        $url_count['iterations']++;
        StoreArrays($url_list,$url_list_filename, $url_done,$url_done_filename, $url_count,$url_count_filename); //Write State To Files
        sleep(1); //Wait One Second

        CallSelfSystem($command); //Call Self

        tolet(); //Flush Buffers
        ob_end_flush(); //End Output
        exit(); //Kill This Process
      }
    }
    //While Loop Done; Crawling of site is Complete

    //Output Interesting Stats
    echo count($url_done) .", ". count($url_list) .", ". $url_count['count']. "\n";
    echo "Total Time: " . round(($end - $url_count['time'])/60.0,2) . " Minutes\n";
    echo '
';
    tolet();//Flush Buffers

    //Write To Log
    $txt = '';
    $txt .= '[' .date(DATE_RFC822). ']';
    $txt .= "\t".'Crawled:' .count($url_done). '/' .$url_count['count']. '/' .count($url_list);
    $txt .= "\t".'Total Time: ' .round(($end - $url_count['time'])/60.0,2). ' Minutes';
    $txt .= "\t".'Script Iterations: ' .$url_count['iterations'];
    $txt .= "\t".'Slowest Page: ' .number_format($url_count['max'], 2, '.', ''). " Seconds " .$url_count['max_name']. "\n";
    file_put_contents($log_filename, $txt, FILE_APPEND);

    DeleteTempFiles($url_list_filename, $url_done_filename, $url_count_filename); //Indexing Complete, Kill Temp Files
    ob_end_flush(); //End Output
    exit(); //End Of Program
  }
  else {
    //exit, script was called from user browser
    echo "CRON Script can only be run via system";
    exit();
  }
}
else {
  //remote IP not set, called local
  FirstRun($script_url);
}

/**
 * List of functions below
 **/

/**
 * Web Crawler & Link Get: download a page and return the same-site links in it.
 *
 * Each href is reduced to the part after $base_url (when it contains it),
 * dropped if it still looks external, is a mailto/anchor/xml link or an image,
 * and otherwise rebuilt as "http://" . $base_url . <path>.
 *
 * @param string $url      Address of the page to download.
 * @param string $base_url Site address without the scheme. The clean-up step
 *                         collapses $base_url . "/" back to $base_url, which
 *                         suggests the value is expected to end in "/" --
 *                         TODO confirm against the configuration.
 * @return array Unique rebuilt URLs (keys are not re-indexed); empty when the
 *               page could not be downloaded or holds no usable links.
 */
function GetLinksFromURL($url,$base_url) {
  //download file; a failed fetch yields no links instead of passing FALSE on
  $html = file_get_contents($url);
  if ($html === false) {
    return array();
  }

  //get all links
  $found = array();
  preg_match_all("/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $html, $found);
  $hrefs = isset($found[1]) ? $found[1] : array(); //group 1 is the href value

  //kill any external links & pics: an href containing one of these is skipped.
  //Matched case-insensitively so upper-case variants (.JPG, .GIF, .PNG, ...) are caught too.
  $skip_markers = array('http://', 'www.', 'mailto:', '.jpg', '.gif', '.png', '#', '.xml');

  //process links
  $temp_list = array();
  foreach ($hrefs as $href) {
    //kill http://www.yoursite.com/ ($base_url): keep only what follows it
    $from_base = strstr($href, $base_url);
    if ($from_base !== false) {
      $remainder = str_replace($base_url, "", $from_base);
      if (strlen($remainder) > 0) {
        $href = $remainder;
      }
    }

    $skip = false;
    foreach ($skip_markers as $marker) {
      if (stripos($href, $marker) !== false) {
        $skip = true;
        break;
      }
    }

    //store url in array
    if (!$skip) {
      $temp_list[] = "http://".$base_url.$href;
    }
  }

  //Round 2 of link checking
  //(every entry was built with the "http://".$base_url prefix above, so only
  //the doubled-scheme test can reject anything here)
  $links = array();
  foreach ($temp_list as $link) {
    if (substr_count($link, '://') > 1) {
      continue; //another scheme was glued onto the base, e.g. an https:// link
    }
    $links[] = str_replace($base_url."/", $base_url, $link);
  }

  //remove duplicate links
  return KillDuplicates($links);
}

/**
 * Destroys duplicate entries in array.
 *
 * @param array $array List to de-duplicate.
 * @return array The first occurrence of each value, original keys preserved.
 */
function KillDuplicates($array) {
  return array_unique($array);
}

/**
 * System Timer.
 *
 * @param int $precision Number of decimal places to keep.
 * @return float Current Unix timestamp in seconds, rounded to $precision places.
 */
function my_microtime($precision = 4) {
  return round(microtime(true),$precision);
}

/**
 * Clear Buffers: push everything echoed so far out of PHP.
 *
 * ob_flush() only hands the active output buffer to the next layer, so it is
 * followed by flush(); it is skipped when no buffer is active because PHP
 * raises a notice in that case.
 */
function tolet() {
  if (ob_get_level() > 0) {
    ob_flush();
  }
  flush();
}

/**
 * Call Self via exec Call, and print how long the call took.
 *
 * @param string $command Complete shell command line to execute.
 */
function CallSelfSystem($command) {
  $dump = array(); //receives the command's output lines; not used afterwards
  $start = my_microtime();
  exec($command, $dump);
  $end = my_microtime();
  echo "\n" . $command;
  echo "\nTime to Run Command " . round($end - $start,4) . " seconds. Low number (0.01) means async execution worked.";
  echo '';
}

/**
 * Call Self via URL: fetch the script over HTTP, echo the response and stop.
 *
 * @param string $script_url URL to request.
 */
function FirstRun($script_url) {
  //Output Info
  echo "Running " .$script_url. "\n";
  echo file_get_contents($script_url);
  //end of jumpstart script
  exit();
}

/**
 * File Functions: serialize the crawler state to its three files and print
 * one status line per file.
 *
 * @param array  $url_list           Queue of URLs.
 * @param string $url_list_filename  File that receives $url_list.
 * @param array  $url_done           URLs already crawled.
 * @param string $url_done_filename  File that receives $url_done.
 * @param array  $url_count          Counters and timers.
 * @param string $url_count_filename File that receives $url_count.
 */
function StoreArrays ($url_list,$url_list_filename, $url_done,$url_done_filename, $url_count,$url_count_filename) {
  $targets = array(
    array($url_list_filename, $url_list),
    array($url_done_filename, $url_done),
    array($url_count_filename, $url_count),
  );

  echo "\n";
  foreach ($targets as $target) {
    $bytes = file_put_contents($target[0], serialize($target[1]));
    if ($bytes === false) {
      //say so explicitly; FALSE would otherwise print as an empty byte count
      echo "FAILED to write \t" . $target[0] . "\n";
    } else {
      echo "wrote " . $bytes . " bytes to \t" . $target[0] . "\n";
    }
  }
}

/**
 * Load crawler state saved by StoreArrays() back into the given variables.
 *
 * A variable keeps the value it was passed in with when its file is missing,
 * unreadable, or does not unserialize to an array, so a damaged state file
 * cannot replace the queue with FALSE.
 *
 * NOTE(review): unserialize() must only ever see trusted data; these files
 * are built under DOCUMENT_ROOT -- confirm they are not writable from the web.
 *
 * @param array  $url_list           Receives the saved queue (by reference).
 * @param string $url_list_filename  File holding the queue.
 * @param array  $url_done           Receives the saved crawled list (by reference).
 * @param string $url_done_filename  File holding the crawled list.
 * @param array  $url_count          Receives the saved counters (by reference).
 * @param string $url_count_filename File holding the counters.
 */
function ReadFilesIntoArrays(&$url_list,$url_list_filename, &$url_done,$url_done_filename, &$url_count,$url_count_filename) {
  if (file_exists($url_list_filename)) {
    $data = unserialize(file_get_contents($url_list_filename));
    if (is_array($data)) {
      $url_list = $data;
    }
  }
  if (file_exists($url_done_filename)) {
    $data = unserialize(file_get_contents($url_done_filename));
    if (is_array($data)) {
      $url_done = $data;
    }
  }
  if (file_exists($url_count_filename)) {
    $data = unserialize(file_get_contents($url_count_filename));
    if (is_array($data)) {
      $url_count = $data;
    }
  }
}

/**
 * Remove the three state files; a file that cannot be removed is ignored.
 *
 * @param string $url_list_filename  Queue file.
 * @param string $url_done_filename  Crawled-list file.
 * @param string $url_count_filename Counters file.
 */
function DeleteTempFiles($url_list_filename, $url_done_filename, $url_count_filename) {
  foreach (array($url_list_filename, $url_done_filename, $url_count_filename) as $filename) {
    @unlink($filename); //best effort: the file may already be gone
  }
}