';
  exit();
}

load_settings();
print_stats_top();
crawler_loop(microtime(true), microtime(true), 0);

/**
 * Set script variables.
 */
function init_variables() {
  // Set PHP INI variables.
  ini_set('memory_limit', '256M');
  ini_set('max_execution_time', 360);
  ini_set('output_buffering', 'off');
  global $url_base, $url_prefix, $ip_address_bypass, $script_name, $script_subdir, $system_php_interpreter, $system_cwd, $system_temp_file, $system_log_file, $php_timeout, $cpu_timeout;
  // www. or ___. or leave blank.
  $url_prefix = 'www.';
  // example.com/ - no http://, no www., must have a trailing /.
  $url_base = 'example.com/';
  // Subdirectory on the server; leave blank if this script is in the root.
  $script_subdir = 'cache/';
  // Name of this file.
  $script_name = 'boost_crawler.php';
  // Location of the PHP interpreter: /web/cgi-bin/php5 or /usr/bin/php.
  $system_php_interpreter = '/web/cgi-bin/php5';
  // Current working dir of the web root: $HOME/html/ or /var/www/vhosts/example.com/.
  $system_cwd = '$HOME/html/';
  // Allow this IP to run the script: 127.0.0.1.
  $ip_address_bypass = '127.0.0.1';
  // Name of the temp file that holds the crawl arrays.
  $system_temp_file = 'temp_crawler_settings.txt';
  // Name of the log file.
  $system_log_file = 'log.txt';
  // PHP script timeout (wall-clock seconds).
  $php_timeout = 120;
  // Max execution time of the script (CPU seconds).
  $cpu_timeout = 120;
}

/**
 * Read in the temp file.
 */
function load_settings() {
  // Variables needed.
  global $system_temp_file, $url_base, $url_prefix;
  // Variables set.
  global $urls_all, $urls_crawled, $script_settings;
  $temp_array = array();
  if (file_exists($system_temp_file)) {
    $temp_array = unserialize(file_get_contents($system_temp_file));
    $urls_all = array_shift($temp_array);
    $urls_crawled = array_shift($temp_array);
    $script_settings = array_shift($temp_array);
  }
  else {
    // "Prime the pump" with the site's front page.
    $urls_all = array("http://" . $url_prefix . $url_base);
    $urls_crawled = array();
    $script_settings = array('time' => microtime(true), 'iterations' => 0, 'max' => 0, 'max_name' => '');
  }
}

/**
 * Check that the script should start to crawl.
 */
function check_credentials() {
  global $url_prefix, $url_base, $script_subdir, $script_name, $ip_address_bypass;
  // Remote & local IP are set: only allow the server itself or the bypass IP.
  if (isset($_SERVER['REMOTE_ADDR']) && isset($_SERVER['SERVER_ADDR'])) {
    if (($_SERVER['REMOTE_ADDR'] == $_SERVER['SERVER_ADDR']) || ($ip_address_bypass == $_SERVER['REMOTE_ADDR'])) {
      return TRUE;
    }
    else {
      return FALSE;
    }
  }
  // Loaded via a system call: call self via URL instead.
  else {
    file_get_contents('http://' . $url_prefix . $url_base . $script_subdir . $script_name);
    exit();
  }
}

/**
 * Output useful info.
 */
function print_stats_top() {
  global $system_temp_file, $urls_all, $urls_crawled, $script_settings;
  echo "\n";
  echo 'Get Data From: ' . $system_temp_file . "\n";
  echo 'Total Crawler Time So Far: ' . round(microtime(true) - $script_settings['time'], 2) . ' Seconds. ' . count($urls_crawled) . '/' . count($urls_all) . ' Pages Crawled.' . "\n\n";
}
/**
 * Loop that crawls the site.
 */
function crawler_loop($start, $end, $t_total) {
  global $urls_all, $urls_crawled, $php_timeout, $cpu_timeout, $script_settings;
  // Set counter.
  $starting_count = count($urls_crawled);
  // Slice the $urls_all array down to the URLs not yet crawled.
  $sliced_urls_list = array_slice($urls_all, $starting_count);
  $loop_counter = 0;
  foreach ($sliced_urls_list as $url) {
    $loop_counter++;
    // Crawl the URL.
    get_new_urls($url);
    // Print output to screen.
    print_crawler_stats($start, $end, $t_total, $loop_counter, $starting_count, $url);
    // Timer has gone over the limit; end this run and restart.
    if ((round($t_total / 1000000.0, 4) >= $cpu_timeout) || (round($end - $start, 2) >= $php_timeout)) {
      $script_settings['iterations']++;
      write_temp_file();
      restart_script();
    }
    else {
      // Wait for 0.1 seconds.
      usleep(100000);
    }
    // Write the file every 100 URLs crawled.
    if (($loop_counter % 100) == 0) {
      write_temp_file();
    }
  }
  write_temp_file();
  // If the array counts don't match, there's more to crawl.
  if (count($urls_all) != count($urls_crawled)) {
    crawler_loop($start, $end, $t_total);
  }
  else {
    end_of_crawler();
  }
}
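
// Note on the resume logic above: new URLs are only appended to $urls_all and
// pages are crawled in order, so array_slice($urls_all, count($urls_crawled))
// is exactly the "to-do" list. A minimal sketch with illustrative values:
//
//   $urls_all     = array('a', 'b', 'c', 'd', 'e');
//   $urls_crawled = array('a', 'b');
//   $todo = array_slice($urls_all, count($urls_crawled)); // array('c', 'd', 'e')
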
/**
 * Adds new URLs to the arrays.
 */
function get_new_urls($url) {
  global $urls_all, $urls_crawled;
  // Crawl the URL.
  $newlinks = load_page($url);
  // Add the crawled URL to the done list.
  $urls_crawled[] = $url;
  foreach ($newlinks as $newurl) {
    // Add links to the "to-do" array.
    $urls_all[] = $newurl;
  }
}
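
// get_new_urls() appends without checking for duplicates; $urls_all is only
// de-duplicated later, by write_temp_file(). For example, array_unique()
// keeps the first occurrence of each value:
//
//   $urls_all = array('a', 'b', 'a');
//   $urls_all = array_unique($urls_all); // array('a', 'b')
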
/**
 * Outputs current stats to screen.
 */
function print_crawler_stats(&$start, &$end, &$t_total, $loop_counter, $starting_count, $url) {
  global $script_settings;
  // Timers.
  $beg = $end;
  $end = microtime(true); // Stop the wall clock.
  $dat = getrusage(); // Snapshot CPU usage.
  $t_after = $dat['ru_utime.tv_usec'];
  // Accumulate the microseconds component of user CPU time (rough heuristic).
  $t_total += $t_after;
  // Output info.
  echo ($loop_counter + $starting_count) . ", ";
  echo number_format($end - $start, 2, '.', '') . ", ";
  echo number_format($end - $beg, 3, '.', '') . ", ";
  echo number_format($t_total / 1000000.0, 2, '.', '') . " seconds, \t";
  echo "Crawling " . urldecode(urldecode($url)) . "\n";
  // Record the slowest page so far.
  if (($end - $beg) > $script_settings['max']) {
    $script_settings['max'] = ($end - $beg);
    $script_settings['max_name'] = urldecode(urldecode($url));
  }
  flush();
  set_time_limit(0); // Reset any time limit on the PHP script.
}
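
// Note on the CPU timer above: getrusage() splits user CPU time into
// 'ru_utime.tv_sec' (whole seconds) and 'ru_utime.tv_usec' (the 0-999999
// microseconds remainder). Summing only the tv_usec snapshots is a rough
// heuristic, not an exact total; a sketch that reads both fields would be:
//
//   $dat = getrusage();
//   $user_cpu_seconds = $dat['ru_utime.tv_sec'] + $dat['ru_utime.tv_usec'] / 1000000.0;
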
/**
 * Writes settings to a local file.
 */
function write_temp_file() {
  global $urls_all, $urls_crawled, $script_settings, $php_timeout, $cpu_timeout, $system_temp_file;
  echo 'writing';
  // Remove duplicates before saving.
  $urls_all = array_unique($urls_all);
  $urls_crawled = array_unique($urls_crawled);
  $temp_array = array();
  $temp_array[] = $urls_all;
  $temp_array[] = $urls_crawled;
  $temp_array[] = $script_settings;
  $s = serialize($temp_array);
  echo '... ';
  echo 'wrote ' . file_put_contents($system_temp_file, $s) . " bytes to \t" . $system_temp_file . "\n";
}
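
// The temp file is one serialize()d array of three elements, in this order:
// the full URL list, the crawled URL list, and the settings array. A sketch
// of inspecting it by hand (path per $system_temp_file above):
//
//   $state = unserialize(file_get_contents('temp_crawler_settings.txt'));
//   list($all, $crawled, $settings) = $state;
//   echo count($crawled) . ' of ' . count($all) . " URLs crawled\n";
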
/**
 * Restarts script.
 */
function restart_script() {
  global $system_php_interpreter, $system_cwd, $script_subdir, $script_name;
  sleep(1); // Wait one second.
  $command = $system_php_interpreter . ' "' . $system_cwd . $script_subdir . $script_name . '" > /dev/null &';
  call_self_via_system($command); // Call self.
  exit(); // Kill this process.
}
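
// With the default settings above, the composed command looks like:
//
//   /web/cgi-bin/php5 "$HOME/html/cache/boost_crawler.php" > /dev/null &
//
// The trailing "&" backgrounds the new process so this one can exit cleanly.
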
/**
 * Calls self via system call.
 */
function call_self_via_system($command) {
  $dump = array();
  $start = microtime(true);
  exec($command, $dump);
  $end = microtime(true);
  echo "\n" . $command;
  echo "\nTime to Run Command: " . round($end - $start, 4) . " seconds. A low number (less than a second) most likely means async execution worked.";
  echo "\n" . print_r($dump, true);
  echo '';
}
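
// Why the "> /dev/null" in the command matters: exec() blocks until the
// command's output stream closes, so even a backgrounded ("&") command must
// have its output redirected or PHP will wait for it to finish. The timing
// printed above is a cheap check that the new process actually detached.
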
/**
 * Outputs final stats and writes to log file.
 */
function end_of_crawler() {
  global $urls_all, $urls_crawled, $script_settings, $cpu_timeout, $system_temp_file, $system_log_file;
  // Loop done; crawling of the site is complete.
  // Output interesting stats.
  echo count($urls_all) . ", " . count($urls_crawled) . "\n";
  echo "Total Time: " . round((microtime(true) - $script_settings['time']) / 60.0, 2) . " Minutes\n";
  echo '';
  ob_flush(); // Flush buffers.
  // Write to log.
  $txt = '[' . date(DATE_RFC822) . ']';
  $txt .= "\t" . 'Crawled:' . count($urls_crawled) . '/' . count($urls_all);
  $txt .= "\t" . 'Total Time: ' . round((microtime(true) - $script_settings['time']) / 60.0, 2) . ' Minutes';
  $txt .= "\t" . 'Script Iterations: ' . $script_settings['iterations'];
  $txt .= "\t" . 'Slowest Page: ' . number_format($script_settings['max'], 2, '.', '') . " Seconds " . $script_settings['max_name'] . "\n";
  file_put_contents($system_log_file, $txt, FILE_APPEND);
  @unlink($system_temp_file); // Indexing complete; kill the temp file.
  exit(); // End of program.
}
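
// Each completed crawl appends one tab-separated line to the log. With
// illustrative values (DATE_RFC822 format) it looks like:
//
//   [Mon, 05 Jan 09 12:00:00 +0000]  Crawled:350/350  Total Time: 12.5 Minutes  Script Iterations: 3  Slowest Page: 4.20 Seconds http://www.example.com/about
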
/**
 * Web crawler: downloads a page and extracts its internal links.
 */
function load_page($url) {
  global $url_base, $url_prefix;
  // Download the file.
  $var = file_get_contents($url);
  // Get all links.
  preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+(.*?)[\"\']+.*?>" . "([^<]+|.*?)?<\/a>/", $var, $matches);
  // Process links.
  $matches = $matches[1];
  $temp_list = array();
  foreach ($matches as $page) {
    // Strip http://www.example.com/ ($url_base) so internal absolute links become relative.
    $temp = str_replace($url_base, "", strstr($page, $url_base));
    if (strlen($temp) > 0) {
      $page = $temp;
    }
    // Kill any external links, pics & anchors.
    if (!(strlen(strstr($page, 'http://')) > 0
      || strlen(strstr($page, 'www.')) > 0
      || strlen(strstr($page, 'mailto:')) > 0
      || strlen(strstr($page, '.jpg')) > 0
      || strlen(strstr($page, '.JPG')) > 0
      || strlen(strstr($page, '.gif')) > 0
      || strlen(strstr($page, '.png')) > 0
      || strlen(strstr($page, '#')) > 0
      || strlen(strstr($page, '.xml')) > 0
      )) {
      // Store the URL in the array as an absolute URL.
      $temp_list[] = "http://" . $url_prefix . $url_base . $page;
    }
  }
  // Round 2 of link checking: keep only URLs on this site with a single scheme.
  $matches = array();
  foreach ($temp_list as $page) {
    if (!(strstr($page, 'http://' . $url_prefix . $url_base) === FALSE || substr_count($page, '://') > 1)) {
      // Collapse a doubled slash after the base.
      $page = str_replace($url_base . "/", $url_base, $page);
      $matches[] = $page;
    }
  }
  // Remove duplicate links.
  return array_unique($matches);
}
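
// A walk-through of the filtering above with the default settings
// ($url_prefix = 'www.', $url_base = 'example.com/') and illustrative hrefs:
//
//   'about'                      -> http://www.example.com/about   (kept)
//   'http://example.com/contact' -> http://www.example.com/contact (base stripped, kept)
//   'http://other.org/'          -> dropped (external link)
//   'logo.png'                   -> dropped (image)
//   'faq#top'                    -> dropped (fragment)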