--- /dev/null 2010-01-28 00:00:21.605006597 -0600
+++ usage/project-usage-common.inc 2010-02-03 13:06:16.000000000 -0600
@@ -0,0 +1,182 @@
+<?php
+
+$vars = array(
+  'DRUPAL_ROOT' => DRUPAL_ROOT,
+  'SITE_NAME' => SITE_NAME,
+  'UPDATES_URL' => UPDATES_URL,
+  'UPDATES_GLOB' => UPDATES_GLOB,
+  'UPDATES_LOADER' => UPDATES_LOADER,
+  'STATS_EXPIRE_WEEKS' => STATS_EXPIRE_WEEKS,
+);
+$fatal_err = FALSE;
+foreach ($vars as $name => $val) {
+  if (empty($val)) {
+    print "ERROR: \"$name\" constant not defined, aborting\n";
+    $fatal_err = TRUE;
+  }
+}
+if ($fatal_err) {
+  exit(1);
+}
+
+$script_name = $argv[0];
+
+// Set up variables for the Drupal bootstrap.
+$_SERVER['HTTP_HOST'] = SITE_NAME;
+$_SERVER['REMOTE_ADDR'] = '127.0.0.1';
+$_SERVER['REQUEST_URI'] = '/' . $script_name;
+$_SERVER['SCRIPT_NAME'] = '/' . $script_name;
+$_SERVER['PHP_SELF'] = '/' . $script_name;
+$_SERVER['SCRIPT_FILENAME'] = $_SERVER['PWD'] .'/'. $script_name;
+$_SERVER['PATH_TRANSLATED'] = $_SERVER['SCRIPT_FILENAME'];
+
+if (!chdir(DRUPAL_ROOT)) {
+  print "ERROR: Can't chdir(DRUPAL_ROOT), aborting.\n";
+  exit(1);
+}
+// Make sure our umask is sane for generating directories and files.
+umask(022);
+
+require_once 'includes/bootstrap.inc';
+
+drupal_bootstrap(DRUPAL_BOOTSTRAP_FULL);
+//drupal_bootstrap(DRUPAL_BOOTSTRAP_DATABASE);
+
+if (!module_exists('project_usage')) {
+  watchdog('project_usage', 'project_usage module is not active, aborting statistics processing!', array(), WATCHDOG_ERROR);
+  print t('project_usage module is not active, aborting statistics processing!');
+  exit(1);
+}
+
+// Load the API functions we need for manipulating dates and timestamps.
+module_load_include('inc', 'project_usage', 'includes/date_api');
+
+/////////////////// COMMON FUNCTIONS ///////////////////
+
+/**
+ * Get a MongoDB reference.
+ */
+function project_usage_mongo() {
+  static $db;
+  if (!$db) {
+    $conn = new Mongo(); // @@@ Ask nnewton about enabling unix domain socket.
+    $db = $conn->selectDB("update-statistics");
+    // @@@ Check for success?
+  }
+  return $db;
+}
+
+/**
+ * Get a lookup table.
+ *
+ * Information about cores, projects, and releases is loaded once and cached
+ * in memory for speed.
+ */
+function project_usage_lookup($type = 'terms') {
+  static $data = array();
+
+  if (empty($data)) {
+    $db = project_usage_mongo();
+
+    // Terms.
+    $c = $db->selectCollection('terms');
+    $cursor = $c->find();
+    foreach ($cursor as $row) {
+      $data['terms'][$row['tid']] = $row['name'];
+      $data['rterms'][$row['name']] = $row['tid'];
+    }
+
+    // Projects.
+    $c = $db->selectCollection('projects');
+    $cursor = $c->find();
+    foreach ($cursor as $row) {
+      $data['projdata'][$row['uri']] = $row['pid'];
+      $data['rprojdata'][$row['pid']] = $row['uri'];
+    }
+
+    // Releases.
+    $c = $db->selectCollection('releases');
+    $cursor = $c->find();
+    foreach ($cursor as $r) {
+      $row = (object) $r;
+      $data['releasedata']["{$row->uri}:{$row->version}"] = array($row->pid, $row->nid, $row->tid);
+      $data['rreleasedata']["{$row->pid}:{$row->nid}"] = $row->version;
+    }
+  }
+  return isset($data[$type]) ? $data[$type] : FALSE;
+}
+
+/**
+ * Feed a batch of processed data into MongoDB.
+ */
+function project_usage_process_batch(&$c, $buffer, &$st) {
+  // The buffer is keyed by site so we don't have to jump back and forth
+  // between sites while talking to MongoDB; for the most part we can work
+  // one site at a time.
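+  //
+  // For illustration, a buffer entry is assumed to look roughly like this
+  // (the values are just the ones from the example row in the loaders below):
+  //   $buffer['00000000000000000000000000000000'][] = array(
+  //     'site_key' => '00000000000000000000000000000000',
+  //     'ip' => '10.111.222.77',
+  //     'project' => 106016,       // pid
+  //     'core' => 87,              // core API tid
+  //     'timestamp' => 1255824000,
+  //     'release' => 290380,       // release nid
+  //   );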
+  foreach ($buffer as $site_key => $data) {
+    $lasthit = $data[count($data) - 1];
+    $changed = FALSE;
+    $record = $c->findOne(array('site_key' => $site_key));
+    if (!$record) {
+      $record = array(
+        'site_key' => $site_key,
+        'modules' => array(),
+      );
+      $changed = TRUE;
+    }
+
+    // IP accounting is minimal; it exists so we can clean up after a remote
+    // site that calls in repeatedly with different site_keys. It does not
+    // track multiple IPs, and is only updated along with the rest of the
+    // record.
+    $record['ip'] = $lasthit['ip'];
+
+    // Core is tracked the same way, so the "correct" core can be determined
+    // at the end of the week.
+    $record['core'] = $lasthit['core'];
+
+    foreach ($data as $entry) {
+      if (!isset($record['modules'][$entry['project']])) {
+        // First hit for this project this week: just add it.
+        $record['modules'][$entry['project']] = array(
+          'timestamp' => $entry['timestamp'],
+          'release' => $entry['release'],
+          'core' => $entry['core'],
+        );
+        $changed = TRUE;
+      }
+      elseif ($record['modules'][$entry['project']]['timestamp'] < $entry['timestamp']) {
+        // The entry on file is older than the incoming one, so replace it.
+        $record['modules'][$entry['project']] = array(
+          'timestamp' => $entry['timestamp'],
+          'release' => $entry['release'],
+          'core' => $entry['core'],
+        );
+        $changed = TRUE;
+      }
+    }
+    if ($changed) {
+      $c->save($record);
+      $st['change']++;
+    }
+    else {
+      $st['nochange']++;
+    }
+  }
+}
--- /dev/null 2010-01-28 00:00:21.605006597 -0600
+++ usage/project-usage-config.inc 2010-02-03 13:55:07.000000000 -0600
@@ -0,0 +1,30 @@
+  $st = array('change' => 0, 'nochange' => 0, 'wrongcore' => 0, 'invalid' => 0, 'noproject' => 0, 'noversion' => 0, 'nullversion' => 0);
+  $buffer = array();
+  $bcnt = 0;
+  while ($line = fgets($handle)) {
+    // timestamp site_key pid nid tid ip_addr
+    // INSERT INTO project_usage_day VALUES (1255824000,'00000000000000000000000000000000',106016,290380,87,' 10.111.222.77');
+    if (preg_match('@VALUES \(([^\)]+)\)@', $line, $matches)) {
+      $qdata = array();
+      list($time, $qdata['site_key'], $tproject, $treleasenid, $tcore, $ip) = explode(',', $matches[1]);
+
+      $qdata['site_key'] = trim($qdata['site_key'], "'");
+      $ip = trim($ip, "' ");
+
+      $project = isset($rprojdata[(int) $tproject]) ? $rprojdata[(int) $tproject] : FALSE;
+      if (!$project) {
+        // Unknown project ID: count it and skip the row.
+        $st['noproject']++;
+        continue;
+      }
+      $qdata['version'] = isset($rreleasedata["{$tproject}:{$treleasenid}"]) ? $rreleasedata["{$tproject}:{$treleasenid}"] : FALSE;
+      if (!$qdata['version']) {
+        if ($treleasenid == 0) {
+          $st['nullversion']++;
+        }
+        else {
+          $st['noversion']++;
+        }
+        continue;
+      }
+
+      // We are cheating a bit on the parsing for speed reasons; a simple
+      // regex and explode() instead of real SQL parsing. We have a lot of
+      // log to get through as fast as possible.
+
+      // This cache is awesome. Hit rate was ~92% on a one-day logfile.
+      // On my laptop, it reduces execution time of the parse loop from 170
+      // seconds to 70 seconds. I feel I will be unable to top that. ~bdragon
+      if ($timestr != $time) {
+        $timestr = $time;
+        $timestamp = (int) $time; // The dump already holds a Unix timestamp, no strtotime() needed.
+        $week = project_usage_weekly_timestamp($timestamp); // 75 seconds saved(!)
+        // Change collections on a week boundary.
+        if ($ccol != $week) {
+          echo "Week change.\n";
+          $ccol = $week;
+          if ($bcnt > 0) {
+            // We need to process the buffer before switching weeks.
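+            // At this point $c still points at the previous week's
+            // collection, so the buffered hits (all parsed before the
+            // boundary) land in the correct week before we switch.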
+            project_usage_process_batch($c, $buffer, $st);
+            $bcnt = 0;
+            $buffer = array();
+          }
+          $c = $db->selectCollection("$week");
+          $c->ensureIndex('site_key', TRUE);
+        }
+      }
+
+      if (!empty($qdata['site_key']) && !empty($qdata['version'])) {
+        $rkey = strtolower($project .':'. $qdata['version']);
+        if (isset($releasedata[$rkey]) && ($tcore == $releasedata[$rkey][2])) {
+          $rd = $releasedata[$rkey]; // array($row->pid, $row->nid, $row->version_api_tid);
+          $entry = array(
+            'site_key' => strtolower($qdata['site_key']),
+            'ip' => $ip,
+            'project' => $rd[0],
+            'core' => $rd[2],
+            'timestamp' => $timestamp,
+            'release' => $rd[1],
+          );
+
+          $bcnt++;
+          $buffer[$qdata['site_key']][] = $entry;
+
+          // Tune the 32768 as necessary. Sites checking in will generally
+          // leave their hits in the logfile in close proximity, so we can
+          // often aggregate several hits from a site into a single update if
+          // they appear in the same "chunk". This is purely an efficiency
+          // measure; using the "wrong" number will at worst slow down
+          // statistics processing. 32768 was found to do reasonably well for
+          // updates.drupal.org.
+          if ($bcnt >= 32768) {
+            project_usage_process_batch($c, $buffer, $st);
+            $bcnt = 0;
+            $buffer = array();
+          }
+        }
+        else {
+          $st['wrongcore']++;
+        }
+      }
+      else {
+        $st['invalid']++;
+      }
+    }
+  }
+  if ($bcnt > 0) {
+    // Process the remaining partial buffer.
+    project_usage_process_batch($c, $buffer, $st);
+  }
+
+  fclose($handle);
+  $time2 = microtime(TRUE);
+
+  echo "Processed file in ". ($time2 - $time1) ." seconds.\n";
+
+  print_r($st);
+
+  echo "\nEnd processing log file.\n";
+
+}
--- /dev/null 2010-01-28 00:00:21.605006597 -0600
+++ usage/project-usage-load-varnish.php 2010-02-03 13:15:44.000000000 -0600
@@ -0,0 +1,125 @@
+  $st = array('change' => 0, 'nochange' => 0, 'wrongcore' => 0, 'invalid' => 0);
+  $buffer = array();
+  $bcnt = 0;
+  while ($line = fgets($handle)) {
+    // 127.0.0.1 - - [02/Nov/2009:11:11:44 -0600] "GET http://updates.drupal.org/release-history/drupal/6.x?site_key=ffffffffffffffffffffffffffffffff&version=6.14 HTTP/1.0" 200 119 "-" "Drupal (+http://drupal.org/)"
+    if (preg_match('@\[([^\]]+)\] "GET '. UPDATES_URL .'([^/]+)/([^\?]+)\?([^ ]+) @', $line, $matches)) {
+      // We are cheating on URL parsing a bit for speed reasons. We have a lot
+      // of log to get through as fast as possible, and none of the "sane"
+      // entries will have things like anchors in the URL.
+
+      list(, $time, $project, $core, $query) = $matches;
+      $ip = substr($line, 0, strpos($line, " "));
+
+      // This cache is awesome. Hit rate was ~92% on a one-day logfile.
+      // On my laptop, it reduces execution time of the parse loop from 170
+      // seconds to 70 seconds. I feel I will be unable to top that. ~bdragon
+      if ($timestr != $time) {
+        $timestr = $time;
+        $timestamp = strtotime($time); // 28 seconds saved in test run.
+        $week = project_usage_weekly_timestamp($timestamp); // 75 seconds saved(!)
+        // Change collections on a week boundary.
+        if ($ccol != $week) {
+          echo "Week change.\n";
+          $ccol = $week;
+          if ($bcnt > 0) {
+            // We need to process the buffer before switching weeks.
+            project_usage_process_batch($c, $buffer, $st);
+            $bcnt = 0;
+            $buffer = array();
+          }
+          $c = $db->selectCollection("$week");
+          $c->ensureIndex('site_key', TRUE);
+        }
+      }
+
+      $qdata = array();
+      foreach (explode('&', $query) as $part) {
+        list($k, $v) = explode('=', $part, 2);
+        $qdata[$k] = urldecode($v);
+      }
+
+      if (!empty($qdata['site_key']) && !empty($qdata['version'])) {
+        $rkey = strtolower($project .':'. $qdata['version']);
+        if (isset($releasedata[$rkey]) && isset($rterms[$core]) && ($rterms[$core] == $releasedata[$rkey][2])) {
+          $rd = $releasedata[$rkey]; // array($row->pid, $row->nid, $row->version_api_tid);
+          $entry = array(
+            'site_key' => strtolower($qdata['site_key']),
+            'ip' => $ip,
+            'project' => $rd[0],
+            'core' => $rd[2],
+            'timestamp' => $timestamp,
+            'release' => $rd[1],
+          );
+
+          $bcnt++;
+          $buffer[$qdata['site_key']][] = $entry;
+
+          // Tune the 32768 as necessary. Sites checking in will generally
+          // leave their hits in the logfile in close proximity, so we can
+          // often aggregate several hits from a site into a single update if
+          // they appear in the same "chunk". This is purely an efficiency
+          // measure; using the "wrong" number will at worst slow down
+          // statistics processing. 32768 was found to do reasonably well for
+          // updates.drupal.org.
+          if ($bcnt >= 32768) {
+            project_usage_process_batch($c, $buffer, $st);
+            $bcnt = 0;
+            $buffer = array();
+          }
+        }
+        else {
+          $st['wrongcore']++;
+        }
+      }
+      else {
+        $st['invalid']++;
+      }
+    }
+  }
+  if ($bcnt > 0) {
+    // Process the remaining partial buffer.
+    project_usage_process_batch($c, $buffer, $st);
+  }
+
+  fclose($handle);
+  $time2 = microtime(TRUE);
+
+  echo "Processed file in ". ($time2 - $time1) ." seconds.\n";
+
+  print_r($st);
+
+  echo "\nEnd processing log file.\n";
+
+}
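For reference, a minimal sketch of reading back the per-week data these loaders write. Assumptions: the same legacy PECL Mongo driver and "update-statistics" database used above, project-usage-common.inc already included so project_usage_mongo() and project_usage_weekly_timestamp() are available, and 106016 is just the example project ID from the dump comment, not a real requirement.

<?php
// Count how many distinct sites reported a given project during the current
// week, using the weekly collections populated by the loaders above.
$pid = 106016;                                   // Example pid only.
$week = project_usage_weekly_timestamp(time());  // Weekly collections are named by the week's starting timestamp.

$db = project_usage_mongo();
$c = $db->selectCollection("$week");

// Each document is one site; its 'modules' member is keyed by project pid.
$count = $c->find(array("modules.$pid" => array('$exists' => TRUE)))->count();
echo "$count sites reported project $pid for week $week.\n";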