--- /dev/null 2010-01-28 00:00:21.605006597 -0600
+++ usage/project-usage-common.inc 2010-02-03 13:06:16.000000000 -0600
@@ -0,0 +1,182 @@
+<?php
+
+$vars = array(
+  'DRUPAL_ROOT' => DRUPAL_ROOT,
+  'SITE_NAME' => SITE_NAME,
+  'UPDATES_URL' => UPDATES_URL,
+  'UPDATES_GLOB' => UPDATES_GLOB,
+  'UPDATES_LOADER' => UPDATES_LOADER,
+  'STATS_EXPIRE_WEEKS' => STATS_EXPIRE_WEEKS,
+);
+$fatal_err = FALSE;
+foreach ($vars as $name => $val) {
+  if (empty($val)) {
+    print "ERROR: \"$name\" constant not defined, aborting\n";
+    $fatal_err = TRUE;
+  }
+}
+if ($fatal_err) {
+  exit(1);
+}
+
+$script_name = $argv[0];
+
+// Set up variables for the Drupal bootstrap.
+$_SERVER['HTTP_HOST'] = SITE_NAME;
+$_SERVER['REMOTE_ADDR'] = '127.0.0.1';
+$_SERVER['REQUEST_URI'] = '/' . $script_name;
+$_SERVER['SCRIPT_NAME'] = '/' . $script_name;
+$_SERVER['PHP_SELF'] = '/' . $script_name;
+$_SERVER['SCRIPT_FILENAME'] = $_SERVER['PWD'] .'/'. $script_name;
+$_SERVER['PATH_TRANSLATED'] = $_SERVER['SCRIPT_FILENAME'];
+
+if (!chdir(DRUPAL_ROOT)) {
+  print "ERROR: Can't chdir(DRUPAL_ROOT), aborting.\n";
+  exit(1);
+}
+// Make sure our umask is sane for generating directories and files.
+umask(022);
+
+require_once 'includes/bootstrap.inc';
+
+drupal_bootstrap(DRUPAL_BOOTSTRAP_FULL);
+//drupal_bootstrap(DRUPAL_BOOTSTRAP_DATABASE);
+
+if (!module_exists('project_usage')) {
+  watchdog('project_usage', 'project_usage module is not active, aborting statistics processing!', array(), WATCHDOG_ERROR);
+  print t('project_usage module is not active, aborting statistics processing!');
+  exit(1);
+}
+
+// Load the API functions we need for manipulating dates and timestamps.
+module_load_include('inc', 'project_usage', 'includes/date_api');
+
+/////////////////// COMMON FUNCTIONS ///////////////////
+
+/**
+ * Get a MongoDB reference.
+ */
+function project_usage_mongo() {
+  static $db;
+  if (!$db) {
+    $conn = new Mongo(); // @@@ Ask nnewton about enabling unix domain socket.
+    $db = $conn->selectDB("update-statistics");
+    // @@@ Check for success?
+  }
+  return $db;
+}
+
+/**
+ * Get a lookup table.
+ *
+ * Information about cores, projects, and releases is loaded once and cached
+ * in memory for speed.
+ */
+function project_usage_lookup($type = 'terms') {
+  static $data = array();
+
+  if (empty($data)) {
+    $db = project_usage_mongo();
+
+    // Terms.
+    $c = $db->selectCollection('terms');
+    $cursor = $c->find();
+    foreach ($cursor as $row) {
+      $data['terms'][$row['tid']] = $row['name'];
+      $data['rterms'][$row['name']] = $row['tid'];
+    }
+
+    // Projects.
+    $c = $db->selectCollection('projects');
+    $cursor = $c->find();
+    foreach ($cursor as $row) {
+      $data['projdata'][$row['uri']] = $row['pid'];
+      $data['rprojdata'][$row['pid']] = $row['uri'];
+    }
+
+    // Releases.
+    $c = $db->selectCollection('releases');
+    $cursor = $c->find();
+    foreach ($cursor as $r) {
+      $row = (object) $r;
+      $data['releasedata']["{$row->uri}:{$row->version}"] = array($row->pid, $row->nid, $row->tid);
+      $data['rreleasedata']["{$row->pid}:{$row->nid}"] = $row->version;
+    }
+  }
+  return isset($data[$type]) ? $data[$type] : FALSE;
+}
+
+/**
+ * Feed a batch of processed data into MongoDB.
+ */
+function project_usage_process_batch(&$c, $buffer, &$st) {
+  // The buffer is keyed by site so we don't have to jump back and forth
+  // between sites while talking to MongoDB; for the most part we can work
+  // one site at a time.
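+  //
+  // For illustration, a buffer entry is assumed to look roughly like this
+  // (the values are just the ones from the example row in the loaders below):
+  //   $buffer['00000000000000000000000000000000'][] = array(
+  //     'site_key' => '00000000000000000000000000000000',
+  //     'ip' => '10.111.222.77',
+  //     'project' => 106016,       // pid
+  //     'core' => 87,              // core API tid
+  //     'timestamp' => 1255824000,
+  //     'release' => 290380,       // release nid
+  //   );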
+  foreach ($buffer as $site_key => $data) {
+    $lasthit = $data[count($data) - 1];
+    $changed = FALSE;
+    $record = $c->findOne(array('site_key' => $site_key));
+    if (!$record) {
+      $record = array(
+        'site_key' => $site_key,
+        'modules' => array(),
+      );
+      $changed = TRUE;
+    }
+
+    // IP accounting is minimal; it exists so we can clean up after a remote
+    // site that calls in repeatedly with different site_keys. It does not
+    // track multiple IPs, and is only updated along with the rest of the
+    // record.
+    $record['ip'] = $lasthit['ip'];
+
+    // Core is tracked the same way, so the "correct" core can be determined
+    // at the end of the week.
+    $record['core'] = $lasthit['core'];
+
+    foreach ($data as $entry) {
+      if (!isset($record['modules'][$entry['project']])) {
+        // First hit for this project this week: just add it.
+        $record['modules'][$entry['project']] = array(
+          'timestamp' => $entry['timestamp'],
+          'release' => $entry['release'],
+          'core' => $entry['core'],
+        );
+        $changed = TRUE;
+      }
+      elseif ($record['modules'][$entry['project']]['timestamp'] < $entry['timestamp']) {
+        // The entry on file is older than the incoming one, so replace it.
+        $record['modules'][$entry['project']] = array(
+          'timestamp' => $entry['timestamp'],
+          'release' => $entry['release'],
+          'core' => $entry['core'],
+        );
+        $changed = TRUE;
+      }
+    }
+    if ($changed) {
+      $c->save($record);
+      $st['change']++;
+    }
+    else {
+      $st['nochange']++;
+    }
+  }
+}
--- /dev/null 2010-01-28 00:00:21.605006597 -0600
+++ usage/project-usage-config.inc 2010-02-03 13:55:07.000000000 -0600
@@ -0,0 +1,30 @@
+  $st = array('change' => 0, 'nochange' => 0, 'wrongcore' => 0, 'invalid' => 0, 'noproject' => 0, 'noversion' => 0, 'nullversion' => 0);
+  $buffer = array();
+  $bcnt = 0;
+  while ($line = fgets($handle)) {
+    // timestamp site_key pid nid tid ip_addr
+    // INSERT INTO project_usage_day VALUES (1255824000,'00000000000000000000000000000000',106016,290380,87,' 10.111.222.77');
+    if (preg_match('@VALUES \(([^\)]+)\)@', $line, $matches)) {
+      $qdata = array();
+      list($time, $qdata['site_key'], $tproject, $treleasenid, $tcore, $ip) = explode(',', $matches[1]);
+
+      $qdata['site_key'] = trim($qdata['site_key'], "'");
+      $ip = trim($ip, "' ");
+
+      $project = isset($rprojdata[(int) $tproject]) ? $rprojdata[(int) $tproject] : FALSE;
+      if (!$project) {
+        // Unknown project ID: count it and skip the row.
+        $st['noproject']++;
+        continue;
+      }
+      $qdata['version'] = isset($rreleasedata["{$tproject}:{$treleasenid}"]) ? $rreleasedata["{$tproject}:{$treleasenid}"] : FALSE;
+      if (!$qdata['version']) {
+        if ($treleasenid == 0) {
+          $st['nullversion']++;
+        }
+        else {
+          $st['noversion']++;
+        }
+        continue;
+      }
+
+      // We are cheating a bit on the parsing for speed reasons; a simple
+      // regex and explode() instead of real SQL parsing. We have a lot of
+      // log to get through as fast as possible.
+
+      // This cache is awesome. Hit rate was ~92% on a one-day logfile.
+      // On my laptop, it reduces execution time of the parse loop from 170
+      // seconds to 70 seconds. I feel I will be unable to top that. ~bdragon
+      if ($timestr != $time) {
+        $timestr = $time;
+        $timestamp = (int) $time; // The dump already holds a Unix timestamp, no strtotime() needed.
+        $week = project_usage_weekly_timestamp($timestamp); // 75 seconds saved(!)
+        // Change collections on a week boundary.
+        if ($ccol != $week) {
+          echo "Week change.\n";
+          $ccol = $week;
+          if ($bcnt > 0) {
+            // We need to process the buffer before switching weeks.
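+            // At this point $c still points at the previous week's
+            // collection, so the buffered hits (all parsed before the
+            // boundary) land in the correct week before we switch.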
+            project_usage_process_batch($c, $buffer, $st);
+            $bcnt = 0;
+            $buffer = array();
+          }
+          $c = $db->selectCollection("$week");
+          $c->ensureIndex('site_key', TRUE);
+        }
+      }
+
+      if (!empty($qdata['site_key']) && !empty($qdata['version'])) {
+        $rkey = strtolower($project .':'. $qdata['version']);
+        if (isset($releasedata[$rkey]) && ($tcore == $releasedata[$rkey][2])) {
+          $rd = $releasedata[$rkey]; // array($row->pid, $row->nid, $row->version_api_tid);
+          $entry = array(
+            'site_key' => strtolower($qdata['site_key']),
+            'ip' => $ip,
+            'project' => $rd[0],
+            'core' => $rd[2],
+            'timestamp' => $timestamp,
+            'release' => $rd[1],
+          );
+
+          $bcnt++;
+          $buffer[$qdata['site_key']][] = $entry;
+
+          // Tune the 32768 as necessary. Sites checking in will generally
+          // leave their hits in the logfile in close proximity, so we can
+          // often aggregate several hits from a site into a single update if
+          // they appear in the same "chunk". This is purely an efficiency
+          // measure; using the "wrong" number will at worst slow down
+          // statistics processing. 32768 was found to do reasonably well for
+          // updates.drupal.org.
+          if ($bcnt >= 32768) {
+            project_usage_process_batch($c, $buffer, $st);
+            $bcnt = 0;
+            $buffer = array();
+          }
+        }
+        else {
+          $st['wrongcore']++;
+        }
+      }
+      else {
+        $st['invalid']++;
+      }
+    }
+  }
+  if ($bcnt > 0) {
+    // Process the remaining partial buffer.
+    project_usage_process_batch($c, $buffer, $st);
+  }
+
+  fclose($handle);
+  $time2 = microtime(TRUE);
+
+  echo "Processed file in ". ($time2 - $time1) ." seconds.\n";
+
+  print_r($st);
+
+  echo "\nEnd processing log file.\n";
+
+}
--- /dev/null 2010-01-28 00:00:21.605006597 -0600
+++ usage/project-usage-load-varnish.php 2010-02-03 13:15:44.000000000 -0600
@@ -0,0 +1,125 @@
+  $st = array('change' => 0, 'nochange' => 0, 'wrongcore' => 0, 'invalid' => 0);
+  $buffer = array();
+  $bcnt = 0;
+  while ($line = fgets($handle)) {
+    // 127.0.0.1 - - [02/Nov/2009:11:11:44 -0600] "GET http://updates.drupal.org/release-history/drupal/6.x?site_key=ffffffffffffffffffffffffffffffff&version=6.14 HTTP/1.0" 200 119 "-" "Drupal (+http://drupal.org/)"
+    if (preg_match('@\[([^\]]+)\] "GET '. UPDATES_URL .'([^/]+)/([^\?]+)\?([^ ]+) @', $line, $matches)) {
+      // We are cheating on URL parsing a bit for speed reasons. We have a lot
+      // of log to get through as fast as possible, and none of the "sane"
+      // entries will have things like anchors in the URL.
+
+      list(, $time, $project, $core, $query) = $matches;
+      $ip = substr($line, 0, strpos($line, " "));
+
+      // This cache is awesome. Hit rate was ~92% on a one-day logfile.
+      // On my laptop, it reduces execution time of the parse loop from 170
+      // seconds to 70 seconds. I feel I will be unable to top that. ~bdragon
+      if ($timestr != $time) {
+        $timestr = $time;
+        $timestamp = strtotime($time); // 28 seconds saved in test run.
+        $week = project_usage_weekly_timestamp($timestamp); // 75 seconds saved(!)
+        // Change collections on a week boundary.
+        if ($ccol != $week) {
+          echo "Week change.\n";
+          $ccol = $week;
+          if ($bcnt > 0) {
+            // We need to process the buffer before switching weeks.
+            project_usage_process_batch($c, $buffer, $st);
+            $bcnt = 0;
+            $buffer = array();
+          }
+          $c = $db->selectCollection("$week");
+          $c->ensureIndex('site_key', TRUE);
+        }
+      }
+
+      $qdata = array();
+      foreach (explode('&', $query) as $part) {
+        list($k, $v) = explode('=', $part, 2);
+        $qdata[$k] = urldecode($v);
+      }
+
+      if (!empty($qdata['site_key']) && !empty($qdata['version'])) {
+        $rkey = strtolower($project .':'. $qdata['version']);
+        if (isset($releasedata[$rkey]) && isset($rterms[$core]) && ($rterms[$core] == $releasedata[$rkey][2])) {
+          $rd = $releasedata[$rkey]; // array($row->pid, $row->nid, $row->version_api_tid);
+          $entry = array(
+            'site_key' => strtolower($qdata['site_key']),
+            'ip' => $ip,
+            'project' => $rd[0],
+            'core' => $rd[2],
+            'timestamp' => $timestamp,
+            'release' => $rd[1],
+          );
+
+          $bcnt++;
+          $buffer[$qdata['site_key']][] = $entry;
+
+          // Tune the 32768 as necessary. Sites checking in will generally
+          // leave their hits in the logfile in close proximity, so we can
+          // often aggregate several hits from a site into a single update if
+          // they appear in the same "chunk". This is purely an efficiency
+          // measure; using the "wrong" number will at worst slow down
+          // statistics processing. 32768 was found to do reasonably well for
+          // updates.drupal.org.
+          if ($bcnt >= 32768) {
+            project_usage_process_batch($c, $buffer, $st);
+            $bcnt = 0;
+            $buffer = array();
+          }
+        }
+        else {
+          $st['wrongcore']++;
+        }
+      }
+      else {
+        $st['invalid']++;
+      }
+    }
+  }
+  if ($bcnt > 0) {
+    // Process the remaining partial buffer.
+    project_usage_process_batch($c, $buffer, $st);
+  }
+
+  fclose($handle);
+  $time2 = microtime(TRUE);
+
+  echo "Processed file in ". ($time2 - $time1) ." seconds.\n";
+
+  print_r($st);
+
+  echo "\nEnd processing log file.\n";
+
+}
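For reference, a minimal sketch of reading back the per-week data these loaders write. Assumptions: the same legacy PECL Mongo driver and "update-statistics" database used above, project-usage-common.inc already included so project_usage_mongo() and project_usage_weekly_timestamp() are available, and 106016 is just the example project ID from the dump comment, not a real requirement.

<?php
// Count how many distinct sites reported a given project during the current
// week, using the weekly collections populated by the loaders above.
$pid = 106016;                                   // Example pid only.
$week = project_usage_weekly_timestamp(time());  // Weekly collections are named by the week's starting timestamp.

$db = project_usage_mongo();
$c = $db->selectCollection("$week");

// Each document is one site; its 'modules' member is keyed by project pid.
$count = $c->find(array("modules.$pid" => array('$exists' => TRUE)))->count();
echo "$count sites reported project $pid for week $week.\n";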