/dev/null &"; //linux only http://www.sitecrafting.com/blog/to-run-php-code-in/ $command = $system_php . $script_location . $async_call; //Runline /** * Crawler Variables */ $url_list = array("http://".$base_url); //"prime the pump" $url_done = array(); $url_count = array('count' => 0, 'time' => microtime(true), 'iterations' => 0, 'max' => 0, 'max_name' => ''); //timer for total run $t_after = 0; $t_total = 0; /** * File Variables */ $url_crawler_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "temp_junk_crawler.txt"; $log_filename = $_SERVER['DOCUMENT_ROOT'] . "/" .$dir. "log.txt"; /*** Start To Do Stuff! ***/ //access control if (isset($_SERVER['REMOTE_ADDR']) && isset($_SERVER['SERVER_ADDR'])) { //remote & local IP is set if (($_SERVER['REMOTE_ADDR'] == $_SERVER['SERVER_ADDR']) || ($bypass_ip == $_SERVER['REMOTE_ADDR'])) { //run crawler, script called from self or white listed ip ReadFileIntoArrays($url_list, $url_done, $url_count, $url_crawler_filename); //Read Previous Run In //Output Some Info ob_start(); $end = microtime(true); echo '
';
//Report where saved state was loaded from and how long the whole crawl
//(across all script iterations) has been running.
echo "Read Data Files From: {$url_crawler_filename}\n";
echo 'Total Time: ' . round($end - $url_count['time'], 2) . " Seconds. URL's in Queue: " . count($url_list) . "\n\n";
ob_flush(); //push the progress output to the client right away
//Wall-clock start of THIS process; compared against $php_timeout below.
$start = microtime(true);
//Do The List
//Run though every value in array; works with a growing array
//Work through every URL in $url_list. The list can GROW while we iterate,
//because each crawled page appends its newly discovered links.
//FIX: the original used each(), which was deprecated in PHP 7.2 and removed
//in PHP 8.0. An index-based scan is equivalent here: array_unique() always
//keeps the FIRST occurrence, so already-visited positions in $url_list stay
//stable across the merge/unique below, and new links only ever append.
$queue_pos = 0;
while ($queue_pos < count($url_list)) {
	//reindex: array_unique() can leave gaps in the numeric keys
	$queue = array_values($url_list);
	$value = $queue[$queue_pos];
	$queue_pos++;
	//URL has not been crawled (possibly in a previous iteration of this script)
	if (in_array($value, $url_done, true) == FALSE) {
		//Crawl the URL and fold its outbound links into the queue
		$url_list = array_unique(array_merge($url_list, GetLinksFromURL($value, $base_url)));
		$url_done[] = $value; //add crawled url to done list
		$url_done = array_unique($url_done); //remove dups from crawled list
		//timers
		$beg = $end;
		$end = microtime(true); //stop clock
		$dat = getrusage();
		//NOTE(review): ru_utime.tv_usec is only the microsecond COMPONENT of
		//user CPU time (wraps at 1s), so $t_total is a rough proxy at best.
		//Left as-is to preserve the tuned $system_timeout behaviour.
		$t_after = $dat['ru_utime.tv_usec'];
		$t_total += $t_after;
		//Progress line: count, total secs, per-page secs, cpu proxy, url
		echo $url_count['count'] . ", ";
		echo number_format($end - $start, 2, '.', ''). ", ";
		echo number_format($end - $beg, 3, '.', ''). ", ";
		echo number_format($t_total/1000000.0, 2, '.', '') . " seconds, \t";
		echo "Crawling " .urldecode(urldecode($value)). "\n";
		ob_flush(); //Flush Buffers
		$url_count['count']++; //Increment URL Crawled Counter
		//Track the slowest page seen so far (>= keeps the tie behaviour of
		//the original max()/== pair: the latest equal time wins the name)
		if (($end - $beg) >= $url_count['max']) {
			$url_count['max'] = $end - $beg;
			$url_count['max_name'] = urldecode(urldecode($value));
		}
		set_time_limit(0); //reset any time limit on php script
		usleep(100000); //politeness delay: wait 0.1s between requests
	}
	//Time budget exceeded: persist state, relaunch self asynchronously, exit.
	if ((round($t_total/1000000.0, 4) >= $system_timeout) || (round($end - $start,2) >= $php_timeout)) {
		$url_count['iterations']++;
		StoreArrays($url_list, $url_done, $url_count, $url_crawler_filename); //Write State To Files
		sleep(1); //give the filesystem a beat before the child reads the state
		CallSelfSystem($command); //Call Self
		ob_flush(); //Flush Buffers
		ob_end_flush(); //End Output
		exit(); //Kill This Process
	}
}
//While loop done; crawling of the site is complete.
//Print summary stats: done count, queue size, crawl counter.
echo count($url_done) .", ". count($url_list) .", ". $url_count['count']. "\n";
echo "Total Time: " . round(($end - $url_count['time'])/60.0,2) . " Minutes\n";
echo '';
ob_flush(); //flush buffered output to the client
//Append a single tab-separated summary line to the log file.
$log_parts = array(
	'Crawled:' .count($url_done). '/' .$url_count['count']. '/' .count($url_list),
	'Total Time: ' .round(($end - $url_count['time'])/60.0,2). ' Minutes',
	'Script Iterations: ' .$url_count['iterations'],
	'Slowest Page: ' .number_format($url_count['max'], 2, '.', ''). " Seconds " .$url_count['max_name']. "\n",
);
$txt = '[' .date(DATE_RFC822). ']' . "\t" . implode("\t", $log_parts);
file_put_contents($log_filename, $txt, FILE_APPEND);
@unlink($url_crawler_filename); //crawl finished: resume file no longer needed
ob_end_flush(); //End Output
exit(); //End Of Program
}
else {
	//Request came from an ordinary browser / non-whitelisted IP: refuse.
	echo "CRON Script can only be run via system";
	exit();
}
}
else {
	//REMOTE_ADDR not set: invoked locally (CLI/cron); bootstrap via HTTP.
	FirstRun($script_url);
}
exit();
/**
* List of functions below
**/
//Web Crawler & Link Get
//Fetch a page and return the internal links found on it, normalised to
//absolute "http://<base_url>..." form. External links, www./mailto: links,
//images (.jpg/.JPG/.gif/.png), fragment URLs (#) and .xml files are
//discarded; duplicates are removed.
//@param string $url      page to fetch (any wrapper file_get_contents accepts)
//@param string $base_url site host the crawler is bound to
//@return array unique list of internal URLs (empty array when fetch fails)
function GetLinksFromURL($url, $base_url) {
	//download page; file_get_contents() returns FALSE on failure
	$html = file_get_contents($url);
	if ($html === FALSE) {
		return array(); //unreachable page yields no links instead of warnings
	}
	//extract all <a ... href="..."> targets
	preg_match_all("/a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $html, $matches);
	$hrefs = $matches[1];
	$temp_list = array();
	foreach ($hrefs as $href) {
		//strip scheme+host from links that contain $base_url, keeping the path
		//(strstr() returns FALSE when absent -> $path becomes '' -> keep $href)
		$path = str_replace($base_url, "", strstr($href, $base_url));
		if (strlen($path) > 0) {
			$href = $path;
		}
		//drop external links, pictures, mail links, anchors and xml feeds
		if (strpos($href, 'http://') !== FALSE) { continue; }
		if (strpos($href, 'www.') !== FALSE) { continue; }
		if (strpos($href, 'mailto:') !== FALSE) { continue; }
		if (strpos($href, '.jpg') !== FALSE) { continue; }
		if (strpos($href, '.JPG') !== FALSE) { continue; }
		if (strpos($href, '.gif') !== FALSE) { continue; }
		if (strpos($href, '.png') !== FALSE) { continue; }
		if (strpos($href, '#') !== FALSE) { continue; }
		if (strpos($href, '.xml') !== FALSE) { continue; }
		//rebuild as an absolute URL rooted at our site
		$temp_list[] = "http://" . $base_url . $href;
	}
	//Round 2 of link checking: sanity-check and normalise the rebuilt URLs.
	$links = array();
	foreach ($temp_list as $candidate) {
		//skip anything not rooted at our site.
		//FIX: the original tested strlen(strstr(...)) < 0, which is always
		//false (strlen never returns a negative); this is the evident intent
		//and cannot change output since every entry above starts with it.
		if (strpos($candidate, 'http://' . $base_url) === FALSE) { continue; }
		//skip malformed URLs carrying more than one scheme separator
		if (substr_count($candidate, '://') > 1) { continue; }
		//collapse "<base>/" back to "<base>" (de-doubles slashes when
		//$base_url carries a trailing slash -- presumably it does; verify)
		$links[] = str_replace($base_url . "/", $base_url, $candidate);
	}
	//remove duplicate links
	return array_unique($links);
}
//Call Self via exec Call
//Relaunch the crawler by exec()'ing the prepared shell command.
//The command is expected to background itself ("... > /dev/null &") so that
//exec() returns immediately; the elapsed time printed below tells the reader
//whether the asynchronous hand-off actually worked.
function CallSelfSystem($command) {
	$captured = array();
	$before = microtime(true);
	exec($command, $captured);
	$elapsed = microtime(true) - $before;
	echo "\n" . $command . "\nTime to Run Command " . round($elapsed, 4) . " seconds. Low number (0.01) means async execution worked.";
}
//Call Self via URL
//Bootstrap helper for the first (local/CLI) invocation: announce the URL,
//fetch it over HTTP so the web server starts the crawler, echo whatever the
//server replied, then terminate this jump-start process.
function FirstRun($script_url) {
	echo "Running " . $script_url . "\n";
	$response = file_get_contents($script_url);
	echo $response;
	exit(); //end of jumpstart script
}
//File Functions
//Persist crawler state (queue, done list, counters) to the resume file as a
//single serialized 3-element array; ReadFileIntoArrays() restores it.
//Echoes the byte count written for the progress log.
function StoreArrays ($url_list, $url_done, $url_count, $url_crawler_filename) {
	$payload = serialize(array($url_list, $url_done, $url_count));
	$bytes = file_put_contents($url_crawler_filename, $payload);
	echo "\nwrote " . $bytes . " bytes to \t" . $url_crawler_filename . "\n";
}
//Restore crawler state previously written by StoreArrays().
//Loads [$url_list, $url_done, $url_count] from $url_crawler_filename into the
//by-reference arguments. On a missing, unreadable or corrupted file the
//arguments are left at their caller-supplied defaults so the crawl starts
//fresh (the original would clobber them with NULL via array_shift(FALSE)).
//NOTE(review): the file is produced locally by StoreArrays(); unserialize()
//must never be pointed at untrusted data.
function ReadFileIntoArrays(&$url_list, &$url_done, &$url_count, $url_crawler_filename) {
	if (!file_exists($url_crawler_filename)) {
		return; //no previous run: keep defaults
	}
	$raw = file_get_contents($url_crawler_filename);
	if ($raw === FALSE) {
		return; //unreadable file: keep defaults
	}
	$temp_array = @unserialize($raw);
	//reject corrupted payloads instead of shifting off FALSE
	if (!is_array($temp_array) || count($temp_array) < 3) {
		return;
	}
	$url_list = array_shift($temp_array);
	$url_done = array_shift($temp_array);
	$url_count = array_shift($temp_array);
}