Index: modules/aggregator/aggregator.admin.inc
===================================================================
RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.admin.inc,v
retrieving revision 1.14
diff -u -p -r1.14 aggregator.admin.inc
--- modules/aggregator/aggregator.admin.inc	16 Aug 2008 14:48:17 -0000	1.14
+++ modules/aggregator/aggregator.admin.inc	20 Aug 2008 11:40:56 -0000
@@ -393,33 +393,43 @@ function aggregator_admin_refresh_feed($
  * @see system_settings_form()
  */
 function aggregator_admin_settings() {
-  $items = array(0 => t('none')) + drupal_map_assoc(array(3, 5, 10, 15, 20, 25), '_aggregator_items');
-  $period = drupal_map_assoc(array(3600, 10800, 21600, 32400, 43200, 86400, 172800, 259200, 604800, 1209600, 2419200, 4838400, 9676800), 'format_interval');
-
-  $form['aggregator_allowed_html_tags'] = array(
-    '#type' => 'textfield', '#title' => t('Allowed HTML tags'), '#size' => 80, '#maxlength' => 255,
-    '#default_value' => variable_get('aggregator_allowed_html_tags', '<a> <b> <br> <dd> <dl> <dt> <em> <i> <li> <ol> <p> <strong> <u> <ul>'),
-    '#description' => t('A space-separated list of HTML tags allowed in the content of feed items. (Tags in this list are not removed by Drupal.)'),
-  );
-
-  $form['aggregator_summary_items'] = array(
-    '#type' => 'select', '#title' => t('Items shown in sources and categories pages') ,
-    '#default_value' => variable_get('aggregator_summary_items', 3), '#options' => $items,
-    '#description' => t('Number of feed items displayed in feed and category summary pages.'),
-  );
-
-  $form['aggregator_clear'] = array(
-    '#type' => 'select', '#title' => t('Discard items older than'),
-    '#default_value' => variable_get('aggregator_clear', 9676800), '#options' => $period,
-    '#description' => t('The length of time to retain feed items before discarding. (Requires a correctly configured <a href="@cron">cron maintenance task</a>.)', array('@cron' => url('admin/reports/status'))),
-  );
-
-  $form['aggregator_category_selector'] = array(
-    '#type' => 'radios', '#title' => t('Category selection type'), '#default_value' => variable_get('aggregator_category_selector', 'checkboxes'),
-    '#options' => array('checkboxes' => t('checkboxes'), 'select' => t('multiple selector')),
-    '#description' => t('The type of category selection widget displayed on categorization pages. (For a small number of categories, checkboxes are easier to use, while a multiple selector work well with large numbers of categories.)'),
+  $period = array('-1' => t('none')) + drupal_map_assoc(array(900, 1800, 3600, 7200, 10800, 21600, 32400, 43200, 64800, 86400, 172800, 259200, 604800, 1209600, 2419200), 'format_interval');
+  $parsers = module_implements('aggregator_parse');
+  foreach ($parsers as $k => $v) {
+    $info = module_invoke($v, 'aggregator_parse', 'info');
+    unset($parsers[$k]);
+    $parsers[$v] = $info['title'] . ' <span class="description">' . $info['description'] .'</span>';
+  }
+  $processors = module_implements('aggregator_process');
+  foreach ($processors as $k => $v) {
+    $info = module_invoke($v, 'aggregator_process', 'info');
+    unset($processors[$k]);
+    $processors[$v] = $info['title'] . ' <span class="description">' . $info['description'] .'</span>';
+  }
+  $form['aggregator_parser'] = array(
+    '#type' => 'radios',
+    '#title' => t('Parser'),
+    '#description' => t('Parsers retrieve and parse feed data. Choose one suitable for the type of feeds you would like to aggregate.'),
+    '#options' => $parsers,
+    // Same fallback as aggregator_refresh(), so the form shows the parser that is actually used.
+    '#default_value' => variable_get('aggregator_parser', 'aggregator'),
+  );
+  $form['aggregator_processors'] = array(
+    '#type' => 'checkboxes',
+    '#title' => t('Processors'),
+    '#description' => t('Processors act on parsed feed data, for example they store feed items. Pick the processors suitable for your task.'),
+    '#options' => $processors,
+    // Read the variable this element is saved as by system_settings_form(); the first processor is pre-checked.
+    '#default_value' => variable_get('aggregator_processors', array_slice(array_keys($processors), 0, 1)),
+  );
+  $form['aggregator_refresh'] = array(
+    '#type' => 'select',
+    '#title' => t('Update interval'),
+    '#default_value' => variable_get('aggregator_refresh', 3600),
+    '#options' => $period,
+    '#description' => t('Approximate time between checking feeds. Requires a correctly configured <a href="@cron">cron maintenance task</a>.', array('@cron' => url('admin/reports/status'))),
   );
-
+  $form['modules'] = array();
   return system_settings_form($form);
 }
 
@@ -511,3 +521,51 @@ function aggregator_form_category_submit
     drupal_set_message(t('The category %category has been added.', array('%category' => $form_state['values']['title'])));
   }
 }
+
+/**
+ * Implementation of hook_form_alter().
+ *
+ * Adds the settings of the built-in processor to the aggregator settings form,
+ * in a fieldset below $form['modules'], while this module is enabled.
+ */
+function aggregator_form_alter(&$form, $form_state, $form_id) {
+  if ($form_id != 'aggregator_admin_settings' || !aggregator_is_enabled('aggregator')) {
+    return;
+  }
+  $info = module_invoke('aggregator', 'aggregator_process', 'info');
+  $period = drupal_map_assoc(array(3600, 10800, 21600, 32400, 43200, 86400, 172800, 259200, 604800, 1209600, 2419200, 4838400, 9676800), 'format_interval');
+  $items = array(0 => t('none')) + drupal_map_assoc(array(3, 5, 10, 15, 20, 25), '_aggregator_items');
+
+  $form['modules']['aggregator'] = array(
+    '#type' => 'fieldset',
+    '#title' => t('Advanced Aggregator Light settings'),
+    '#description' => $info['description'],
+    '#collapsible' => TRUE,
+    // This point is only reached while the module is enabled, so start expanded.
+    '#collapsed' => FALSE,
+  );
+
+  $form['modules']['aggregator']['aggregator_summary_items'] = array(
+    '#type' => 'select',
+    '#title' => t('Items shown in sources and categories pages'),
+    '#default_value' => variable_get('aggregator_summary_items', 3),
+    '#options' => $items,
+    '#description' => t('Number of feed items displayed in feed and category summary pages.'),
+  );
+
+  $form['modules']['aggregator']['aggregator_clear'] = array(
+    '#type' => 'select',
+    '#title' => t('Discard items older than'),
+    '#default_value' => variable_get('aggregator_clear', 9676800),
+    '#options' => $period,
+    '#description' => t('The length of time to retain feed items before discarding. (Requires a correctly configured <a href="@cron">cron maintenance task</a>.)', array('@cron' => url('admin/reports/status'))),
+  );
+
+  $form['modules']['aggregator']['aggregator_category_selector'] = array(
+    '#type' => 'radios',
+    '#title' => t('Category selection type'),
+    '#default_value' => variable_get('aggregator_category_selector', 'checkboxes'),
+    '#options' => array('checkboxes' => t('checkboxes'), 'select' => t('multiple selector')),
+    '#description' => t('The type of category selection widget displayed on categorization pages. (For a small number of categories, checkboxes are easier to use, while a multiple selector work well with large numbers of categories.)'),
+  );
+}
\ No newline at end of file
Index: modules/aggregator/aggregator.info
===================================================================
RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.info,v
retrieving revision 1.7
diff -u -p -r1.7 aggregator.info
--- modules/aggregator/aggregator.info	15 May 2008 21:27:32 -0000	1.7
+++ modules/aggregator/aggregator.info	20 Aug 2008 11:40:56 -0000
@@ -8,3 +8,5 @@ core = 7.x
 files[] = aggregator.module
 files[] = aggregator.admin.inc
 files[] = aggregator.pages.inc
+files[] = aggregator.parser.inc
+
Index: modules/aggregator/aggregator.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.module,v
retrieving revision 1.389
diff -u -p -r1.389 aggregator.module
--- modules/aggregator/aggregator.module	16 Aug 2008 14:48:17 -0000	1.389
+++ modules/aggregator/aggregator.module	20 Aug 2008 11:40:58 -0000
@@ -281,9 +281,23 @@ function aggregator_perm() {
  * Checks news feeds for updates once their refresh interval has elapsed.
  */
 function aggregator_cron() {
-  $result = db_query('SELECT * FROM {aggregator_feed} WHERE checked + refresh < %d', time());
-  while ($feed = db_fetch_array($result)) {
-    aggregator_refresh($feed);
+  // Run the expiry callback first, if the code providing it is loaded.
+  if (drupal_function_exists('_aggregator_light_delete_expired')) {
+    _aggregator_light_delete_expired();
+  }
+  // A negative update interval is the "none" choice of the settings form.
+  $refresh = variable_get('aggregator_refresh', 3600);
+  if ($refresh >= 0) {
+    // Refresh every feed whose interval has elapsed, stalest first. Each feed
+    // is tried once per run, and the loop ends when the cron time budget is
+    // used up, but only after at least one feed has been processed.
+    $result = db_query('SELECT * FROM {aggregator_feed} WHERE checked + %d <= %d ORDER BY checked', $refresh, time());
+    while ($feed = db_fetch_array($result)) {
+      aggregator_refresh($feed);
+      if (!_aggregator_cron_time()) {
+        break;
+      }
+    }
   }
 }
 
@@ -359,6 +373,136 @@ function aggregator_block($op = 'list', 
 }
 
 /**
+ * Implementation of hook_aggregator_process().
+ *
+ * @param $op
+ *   'info' (processor metadata), 'unique' (mark items as new or existing),
+ *   'save' (insert or update the items) or 'delete' (remove the feed's items).
+ * @param $feed
+ *   The feed array; the items to handle are read from $feed['items'].
+ */
+function aggregator_aggregator_process($op, $feed = NULL) {
+  switch ($op) {
+    case 'save':
+      $new = FALSE;
+      foreach ($feed['items'] as $item) {
+        // The 'unique' pass stores TRUE for unknown items and the iid otherwise.
+        $new = $new || !is_numeric($item['unique']['aggregator']);
+        aggregator_save_item(array('iid' => $item['unique']['aggregator'], 'fid' => $feed['fid'], 'timestamp' => $item['timestamp'], 'title' => $item['title'], 'link' => $item['link'], 'author' => $item['author'], 'description' => $item['description'], 'guid' => $item['guid']));
+      }
+      if ($new) {
+        watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed['title']));
+        drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title'])));
+      }
+      return $feed;
+    case 'unique':
+      foreach ($feed['items'] as $k => $item) {
+        if (!empty($item['guid'])) {
+          $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND guid = '%s'", $feed['fid'], $item['guid']));
+        }
+        else if ($item['link'] && $item['link'] != $feed['link'] && $item['link'] != $feed['url']) {
+          $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $item['link']));
+        }
+        else {
+          $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $item['title']));
+        }
+        // Set only this module's marker so that markers of other processors survive.
+        $feed['items'][$k]['unique']['aggregator'] = (!isset($entry->iid) ? TRUE : $entry->iid);
+      }
+      return $feed;
+    case 'delete':
+      db_query('DELETE FROM {aggregator_category_item} WHERE iid IN (SELECT iid FROM {aggregator_item} WHERE fid = %d)', $feed['fid']);
+      db_query('DELETE FROM {aggregator_item} WHERE fid = %d', $feed['fid']);
+      return $feed;
+    case 'info':
+      return array('title' => t('Aggregator Light'), 'description' => t('Creates lightweight records of feed items.'));
+  }
+}
+
+/**
+ * Implementation of hook_aggregator_parse().
+ *
+ * @param $op
+ *   'parse': download and parse the feed.
+ *   'info': return metadata about the parser.
+ * @param $feed
+ *   The feed array; its url, title, etag, modified and hash entries are read.
+ * @return
+ *   For 'parse': TRUE if the feed did not change, FALSE on failure, otherwise
+ *   the parsed feed data. For 'info': an array with title and description.
+ */
+function aggregator_aggregator_parse($op, $feed = NULL) {
+  switch ($op) {
+    case 'parse':
+      // Generate conditional GET headers.
+      $headers = array();
+      if (!empty($feed['etag'])) {
+        $headers['If-None-Match'] = $feed['etag'];
+      }
+      if (!empty($feed['modified'])) {
+        $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed['modified']) . ' GMT';
+      }
+
+      // Request feed.
+      $result = drupal_http_request($feed['url'], $headers);
+
+      // Process HTTP response code.
+      switch ($result->code) {
+        case 304:
+          return TRUE;
+        case 301:
+          // Only switch to the new location when the response names one.
+          if (isset($result->redirect_url)) {
+            $feed['url'] = $result->redirect_url;
+            watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url']));
+          }
+          // Do not break here.
+        case 200:
+        case 302:
+        case 307:
+          // We store the md5 hash of feed data in the database. When refreshing a
+          // feed we compare stored hash and new hash calculated from downloaded
+          // data. If both are equal we say that feed is not updated.
+          $feed['md5'] = md5($result->data);
+          if ($feed['hash'] == $feed['md5']) {
+            return TRUE;
+          }
+          $data = @simplexml_load_string($result->data);
+          $format = drupal_function_exists('aggregator_parser_format_detect') ? aggregator_parser_format_detect($data) : FALSE;
+          // Without a detected format and a handler for it there is no parsed
+          // data to return, so report the failure to the caller.
+          if ($format === FALSE || !drupal_function_exists('aggregator_parser_' . $format)) {
+            watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => 'unsupported feed format'), WATCHDOG_WARNING);
+            return FALSE;
+          }
+          $feed_handler = 'aggregator_parser_' . $format;
+          return array_merge(
+            array(
+              // Differs from the requested URL only after a permanent redirect.
+              'url' => $feed['url'],
+              'md5' => $feed['md5'],
+              'modified' => empty($result->headers['Last-Modified']) ? 0 : strtotime($result->headers['Last-Modified']),
+              'etag' => empty($result->headers['ETag']) ? '' : $result->headers['ETag'],
+              'author' => '',
+            ),
+            $feed_handler($data)
+          );
+        default:
+          watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error), WATCHDOG_WARNING);
+          drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error)));
+          module_invoke('system', 'check_http_request');
+          return FALSE;
+      }
+      break;
+    case 'info':
+      return array(
+        'title' => t('Built-in Parser'),
+        'description' => t('Default parser for RSS, Atom and RDF feeds.'),
+      );
+  }
+}
+
+/**
  * Add/edit/delete aggregator categories.
  *
  * @param $edit
@@ -406,16 +550,11 @@ function aggregator_save_feed($edit) {
     db_query("UPDATE {aggregator_feed} SET title = '%s', url = '%s', refresh = %d, block = %d WHERE fid = %d", $edit['title'], $edit['url'], $edit['refresh'], $edit['block'], $edit['fid']);
   }
   elseif (!empty($edit['fid'])) {
-    $items = array();
-    $result = db_query('SELECT iid FROM {aggregator_item} WHERE fid = %d', $edit['fid']);
-    while ($item = db_fetch_object($result)) {
-      $items[] = "iid = $item->iid";
-    }
-    if (!empty($items)) {
-      db_query('DELETE FROM {aggregator_category_item} WHERE ' . implode(' OR ', $items));
+    $processors = variable_get('aggregator_processor', array());
+    foreach ($processors as $processor) {
+      module_invoke($processor, 'aggregator_process', 'delete', $edit);
     }
     db_query('DELETE FROM {aggregator_feed} WHERE fid = %d', $edit['fid']);
-    db_query('DELETE FROM {aggregator_item} WHERE fid = %d', $edit['fid']);
     // Make sure there is no active block for this feed.
     db_query("DELETE FROM {blocks} WHERE module = '%s' AND delta = '%s'", 'aggregator', 'feed-' . $edit['fid']);
   }
@@ -456,203 +595,64 @@ function aggregator_remove($feed) {
 }
 
 /**
- * Callback function used by the XML parser.
- */
-function aggregator_element_start($parser, $name, $attributes) {
-  global $item, $element, $tag, $items, $channel;
-
-  switch ($name) {
-    case 'IMAGE':
-    case 'TEXTINPUT':
-    case 'CONTENT':
-    case 'SUMMARY':
-    case 'TAGLINE':
-    case 'SUBTITLE':
-    case 'LOGO':
-    case 'INFO':
-      $element = $name;
-      break;
-    case 'ID':
-      if ($element != 'ITEM') {
-        $element = $name;
-      }
-    case 'LINK':
-      if (!empty($attributes['REL']) && $attributes['REL'] == 'alternate') {
-        if ($element == 'ITEM') {
-          $items[$item]['LINK'] = $attributes['HREF'];
-        }
-        else {
-          $channel['LINK'] = $attributes['HREF'];
-        }
-      }
-      break;
-    case 'ITEM':
-      $element = $name;
-      $item += 1;
-      break;
-    case 'ENTRY':
-      $element = 'ITEM';
-      $item += 1;
-      break;
-  }
-
-  $tag = $name;
-}
-
-/**
- * Call-back function used by the XML parser.
- */
-function aggregator_element_end($parser, $name) {
-  global $element;
-
-  switch ($name) {
-    case 'IMAGE':
-    case 'TEXTINPUT':
-    case 'ITEM':
-    case 'ENTRY':
-    case 'CONTENT':
-    case 'INFO':
-      $element = '';
-      break;
-    case 'ID':
-      if ($element == 'ID') {
-        $element = '';
-      }
-  }
-}
-
-/**
- * Callback function used by the XML parser.
- */
-function aggregator_element_data($parser, $data) {
-  global $channel, $element, $items, $item, $image, $tag;
-  $items += array($item => array());
-  switch ($element) {
-    case 'ITEM':
-      $items[$item] += array($tag => '');
-      $items[$item][$tag] .= $data;
-      break;
-    case 'IMAGE':
-    case 'LOGO':
-      $image += array($tag => '');
-      $image[$tag] .= $data;
-      break;
-    case 'LINK':
-      if ($data) {
-        $items[$item] += array($tag => '');
-        $items[$item][$tag] .= $data;
-      }
-      break;
-    case 'CONTENT':
-      $items[$item] += array('CONTENT' => '');
-      $items[$item]['CONTENT'] .= $data;
-      break;
-    case 'SUMMARY':
-      $items[$item] += array('SUMMARY' => '');
-      $items[$item]['SUMMARY'] .= $data;
-      break;
-    case 'TAGLINE':
-    case 'SUBTITLE':
-      $channel += array('DESCRIPTION' => '');
-      $channel['DESCRIPTION'] .= $data;
-      break;
-    case 'INFO':
-    case 'ID':
-    case 'TEXTINPUT':
-      // The sub-element is not supported. However, we must recognize
-      // it or its contents will end up in the item array.
-      break;
-    default:
-      $channel += array($tag => '');
-      $channel[$tag] .= $data;
-  }
-}
-
-/**
  * Checks a news feed for new items.
  *
  * @param $feed
  *   An associative array describing the feed to be refreshed.
  */
 function aggregator_refresh($feed) {
-  global $channel, $image;
 
-  // Generate conditional GET headers.
-  $headers = array();
-  if ($feed['etag']) {
-    $headers['If-None-Match'] = $feed['etag'];
+  $parser = variable_get('aggregator_parser', 'aggregator');
+  $channel = module_invoke($parser, 'aggregator_parse', 'parse', $feed);
+  if ($channel === TRUE) {
+    db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']);
+    drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title'])));
+    return;
   }
-  if ($feed['modified']) {
-    $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed['modified']) . ' GMT';
+  if ($channel === FALSE) {
+    return;
   }
+  if (is_array($channel)) {
+    // Parsed values win over the stored row; the configured title is kept for the messages.
+    $channel = array_merge($feed, $channel, array('title' => $feed['title']));
+    $processors = array_filter(variable_get('aggregator_processors', array()));
+    foreach ($processors as $processor) {
+      $channel = module_invoke($processor, 'aggregator_process', 'unique', $channel);
+    }
+    foreach ($processors as $processor) {
+      $channel = module_invoke($processor, 'aggregator_process', 'save', $channel);
+    }
 
-  // Request feed.
-  $result = drupal_http_request($feed['url'], $headers);
-
-  // Process HTTP response code.
-  switch ($result->code) {
-    case 304:
-      db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']);
-      drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title'])));
-      break;
-    case 301:
-      $feed['url'] = $result->redirect_url;
-      // Do not break here.
-    case 200:
-    case 302:
-    case 307:
-      // We store the md5 hash of feed data in the database. When refreshing a
-      // feed we compare stored hash and new hash calculated from downloaded
-      // data. If both are equal we say that feed is not updated.
-      $md5 = md5($result->data);
-      if ($feed['hash'] == $md5) {
-        db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']);
-        drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title'])));
-        break;
-      }
-
-      // Filter the input data.
-      if (aggregator_parse_feed($result->data, $feed)) {
-        $modified = empty($result->headers['Last-Modified']) ? 0 : strtotime($result->headers['Last-Modified']);
-
-        // Prepare the channel data.
-        foreach ($channel as $key => $value) {
-          $channel[$key] = trim($value);
-        }
-
-        // Prepare the image data (if any).
-        foreach ($image as $key => $value) {
-          $image[$key] = trim($value);
-        }
+    $image = (isset($channel['image']) && is_array($channel['image'])) ? $channel['image'] : array();
 
-        if (!empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE'])) {
-          // TODO: we should really use theme_image() here, but that only works with
-          // local images. It won't work with images fetched with a URL unless PHP version > 5.
-          $image = '<a href="' . check_url($image['LINK']) . '" class="feed-image"><img src="' . check_url($image['URL']) . '" alt="' . check_plain($image['TITLE']) . '" /></a>';
-        }
-        else {
-          $image = NULL;
-        }
+    // Prepare the channel data.
+    foreach ($channel as $key => $value) {
+      if (!is_array($value)) {
+        $channel[$key] = trim($value);
+      }
+    }
 
-        $etag = empty($result->headers['ETag']) ? '' : $result->headers['ETag'];
-        // Update the feed data.
-        db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', hash = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel['LINK'], $channel['DESCRIPTION'], $image, $md5, $etag, $modified, $feed['fid']);
+    // Prepare the image data (if any).
+    if (is_array($image)) {
+      foreach ($image as $key => $value) {
+        $image[$key] = trim($value);
+      }
+    }
 
-        // Clear the cache.
-        cache_clear_all();
+    if (!empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE'])) {
+      // TODO: we should really use theme_image() here, but that only works with
+      // local images. It won't work with images fetched with a URL unless PHP version > 5.
+      $image = '<a href="' . check_url($image['LINK']) . '" class="feed-image"><img src="' . check_url($image['URL']) . '" alt="' . check_plain($image['TITLE']) . '" /></a>';
+    }
+    else {
+      $image = NULL;
+    }
 
-        if (isset($result->redirect_url)) {
-          watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url']));
-        }
+    // Update the feed data; the URL is the stored one unless the parser supplied another.
+    db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', hash = '%s', etag = '%s', modified = %d WHERE fid = %d", $channel['url'], time(), $channel['link'], $channel['description'], $image, $channel['md5'], $channel['etag'], $channel['modified'], $feed['fid']);
 
-        watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed['title']));
-        drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title'])));
-      }
-      break;
-    default:
-      watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error), WATCHDOG_WARNING);
-      drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error)));
-      module_invoke('system', 'check_http_request');
+    // Clear the cache.
+    cache_clear_all();
   }
 }
 
@@ -697,148 +697,16 @@ function aggregator_parse_w3cdtf($date_s
 }
 
 /**
- * Parse a feed and store its items.
- *
- * @param $data
- *   The feed data.
- * @param $feed
- *   An associative array describing the feed to be parsed.
- * @return
- *   FALSE on error, TRUE otherwise.
- */
-function aggregator_parse_feed(&$data, $feed) {
-  global $items, $image, $channel;
-
-  // Unset the global variables before we use them.
-  unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
-  $items = array();
-  $image = array();
-  $channel = array();
-
-  // Parse the data.
-  $xml_parser = drupal_xml_parser_create($data);
-  xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
-  xml_set_character_data_handler($xml_parser, 'aggregator_element_data');
-
-  if (!xml_parse($xml_parser, $data, 1)) {
-    watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser)), WATCHDOG_WARNING);
-    drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser))), 'error');
-    return FALSE;
-  }
-  xml_parser_free($xml_parser);
-
-  // We reverse the array such that we store the first item last, and the last
-  // item first. In the database, the newest item should be at the top.
-  $items = array_reverse($items);
-
-  // Initialize variables.
-  $title = $link = $author = $description = $guid = NULL;
-  foreach ($items as $item) {
-    unset($title, $link, $author, $description, $guid);
-
-    // Prepare the item:
-    foreach ($item as $key => $value) {
-      $item[$key] = trim($value);
-    }
-
-    // Resolve the item's title. If no title is found, we use up to 40
-    // characters of the description ending at a word boundary, but not
-    // splitting potential entities.
-    if (!empty($item['TITLE'])) {
-      $title = $item['TITLE'];
-    }
-    elseif (!empty($item['DESCRIPTION'])) {
-      $title = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['DESCRIPTION'], 40));
-    }
-    else {
-      $title = '';
-    }
-
-    // Resolve the items link.
-    if (!empty($item['LINK'])) {
-      $link = $item['LINK'];
-    }
-    else {
-      $link = $feed['link'];
-    }
-    $guid = isset($item['GUID']) ? $item['GUID'] : '';
-
-    // Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag.
-    if (!empty($item['CONTENT:ENCODED'])) {
-      $item['DESCRIPTION'] = $item['CONTENT:ENCODED'];
-    }
-    elseif (!empty($item['SUMMARY'])) {
-      $item['DESCRIPTION'] = $item['SUMMARY'];
-    }
-    elseif (!empty($item['CONTENT'])) {
-      $item['DESCRIPTION'] = $item['CONTENT'];
-    }
-
-    // Try to resolve and parse the item's publication date. If no date is
-    // found, use the current date instead.
-    $date = 'now';
-    foreach (array('PUBDATE', 'DC:DATE', 'DCTERMS:ISSUED', 'DCTERMS:CREATED', 'DCTERMS:MODIFIED', 'ISSUED', 'CREATED', 'MODIFIED', 'PUBLISHED', 'UPDATED') as $key) {
-      if (!empty($item[$key])) {
-        $date = $item[$key];
-        break;
-      }
-    }
-
-    $timestamp = strtotime($date); // As of PHP 5.1.0, strtotime returns FALSE on failure instead of -1.
-
-    if ($timestamp <= 0) {
-      $timestamp = aggregator_parse_w3cdtf($date); // Aggregator_parse_w3cdtf() returns FALSE on failure.
-      if (!$timestamp) {
-        // Better than nothing.
-        $timestamp = time();
-      }
-    }
-
-    // Save this item. Try to avoid duplicate entries as much as possible. If
-    // we find a duplicate entry, we resolve it and pass along its ID is such
-    // that we can update it if needed.
-    if (!empty($guid)) {
-      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND guid = '%s'", $feed['fid'], $guid));
-    }
-    else if ($link && $link != $feed['link'] && $link != $feed['url']) {
-      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $link));
-    }
-    else {
-      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $title));
-    }
-    $item += array('AUTHOR' => '', 'DESCRIPTION' => '');
-    aggregator_save_item(array('iid' => (isset($entry->iid) ? $entry->iid:  ''), 'fid' => $feed['fid'], 'timestamp' => $timestamp, 'title' => $title, 'link' => $link, 'author' => $item['AUTHOR'], 'description' => $item['DESCRIPTION'], 'guid' => $guid));
-  }
-
-  // Remove all items that are older than flush item timer.
-  $age = time() - variable_get('aggregator_clear', 9676800);
-  $result = db_query('SELECT iid FROM {aggregator_item} WHERE fid = %d AND timestamp < %d', $feed['fid'], $age);
-
-  $items = array();
-  $num_rows = FALSE;
-  while ($item = db_fetch_object($result)) {
-    $items[] = $item->iid;
-    $num_rows = TRUE;
-  }
-  if ($num_rows) {
-    db_query('DELETE FROM {aggregator_category_item} WHERE iid IN (' . implode(', ', $items) . ')');
-    db_query('DELETE FROM {aggregator_item} WHERE fid = %d AND timestamp < %d', $feed['fid'], $age);
-  }
-
-  return TRUE;
-}
-
-/**
  * Add/edit/delete an aggregator item.
  *
  * @param $edit
  *   An associative array describing the item to be added/edited/deleted.
  */
 function aggregator_save_item($edit) {
-  if ($edit['iid'] && $edit['title']) {
+  if (is_numeric($edit['iid']) && $edit['title']) {
     db_query("UPDATE {aggregator_item} SET title = '%s', link = '%s', author = '%s', description = '%s', guid = '%s', timestamp = %d WHERE iid = %d", $edit['title'], $edit['link'], $edit['author'], $edit['description'], $edit['guid'], $edit['timestamp'], $edit['iid']);
   }
-  elseif ($edit['iid']) {
+  elseif (is_numeric($edit['iid'])) {
     db_query('DELETE FROM {aggregator_item} WHERE iid = %d', $edit['iid']);
     db_query('DELETE FROM {aggregator_category_item} WHERE iid = %d', $edit['iid']);
   }
@@ -871,6 +739,25 @@ function aggregator_feed_load($fid) {
 }
 
 /**
+ * Tells if the given module is enabled (as parser or as processor).
+ *
+ * @param $module
+ *   The name of the module.
+ * @return
+ *   TRUE if enabled, FALSE if disabled.
+ */
+function aggregator_is_enabled($module) {
+  // Same fallback as aggregator_refresh(): the built-in parser counts as
+  // selected until another one has been saved.
+  if ($module == variable_get('aggregator_parser', 'aggregator')) {
+    return TRUE;
+  }
+  // Strict comparison, so stored entries that are not module names never match.
+  $processors = array_values(variable_get('aggregator_processors', array()));
+  return in_array($module, $processors, TRUE);
+}
+
+/**
  * Load an aggregator category.
  *
  * @param $cid
@@ -937,3 +824,17 @@ function aggregator_filter_xss($value) {
 function _aggregator_items($count) {
   return format_plural($count, '1 item', '@count items');
 }
+
+/**
+ * Returns the seconds left of the cron time budget, a percentage of max_execution_time.
+ */
+function _aggregator_cron_time() {
+  static $time_limit;
+  $execute_percentage = 50;
+  if (!$time_limit) {
+    $time_limit = time() + ($execute_percentage / 100) * ini_get('max_execution_time');
+    // However, check for left time, maybe some other cron processing already occurred.
+    $time_limit = min($time_limit, variable_get('cron_semaphore', 0) + ini_get('max_execution_time'));
+  }
+  return max($time_limit - time(), 0);
+}
Index: modules/aggregator/aggregator.parser.inc
===================================================================
RCS file: modules/aggregator/aggregator.parser.inc
diff -N modules/aggregator/aggregator.parser.inc
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ modules/aggregator/aggregator.parser.inc	20 Aug 2008 11:40:58 -0000
@@ -0,0 +1,271 @@
+<?php
+// $Id$
+
+/**
+ * @file
+ *   Various helper functions for feed parsing
+ */
+
+/**
+ * Identifies the format of a parsed feed document.
+ * Returns 'atom', 'rdf' or 'rss'; FALSE when the format is not recognized.
+ */
+function aggregator_parser_format_detect($data) {
+  if (!is_object($data)) {
+    return FALSE;
+  }
+  $attributes = $data->attributes();
+  $root = strtolower($data->getName());
+  if ($root == 'feed' || isset($data->entry)) {
+    return 'atom';
+  }
+  if ($root == 'rdf' && isset($data->channel)) {
+    return 'rdf';
+  }
+  $is_rss = $root == 'rss' && in_array($attributes['version'], array('0.91', '0.92', '2.0'));
+  return $is_rss ? 'rss' : FALSE;
+}
+
+/**
+ * Parses RSS 2.0, 0.91, 0.92 feeds.
+ *
+ * @param $data
+ *   The feed document as a SimpleXMLElement.
+ * @return
+ *   Array with 'title', 'description', 'link', 'image' and 'items' keys; each
+ *   item has 'guid', 'title', 'description', 'link', 'timestamp', 'categories',
+ *   'namespaces' and 'enclosures' keys.
+ */
+function aggregator_parser_rss(SimpleXMLElement $data) {
+  $feed = array();
+  $dc = $data->channel->children('http://purl.org/dc/elements/1.1/');
+  $feed['title'] = _aggregator_parser_choose("{$data->channel->title}", "{$dc->title}");
+  $feed['description'] = _aggregator_parser_choose("{$data->channel->description}", "{$dc->subject}");
+  $feed['link'] = isset($data->channel->link) ? "{$data->channel->link}" : "";
+  $feed['image'] = isset($data->channel->image->url) ? "{$data->channel->image->url}" : '';
+  $feed['items'] = array();
+  // The document's namespaces are the same for every item, so fetch them once.
+  $namespaces = $data->getNamespaces(TRUE);
+  foreach ($data->xpath('//item') as $news) {
+    $content = $news->children('http://purl.org/rss/1.0/modules/content/');
+    $dc = $news->children('http://purl.org/dc/elements/1.1/');
+    $item = array();
+    $item['guid'] = isset($news->guid) ? "{$news->guid}" : NULL;
+    $item['title'] = _aggregator_parser_choose("{$news->title}", "{$dc->title}");
+    $item['description'] = _aggregator_parser_choose("{$news->description}", "{$news->encoded}", "{$content->encoded}", "{$dc->description}");
+    $item['link'] = _aggregator_parser_choose("{$news->link}");
+    // Fall back to dc:date for feeds that do not provide pubDate.
+    $item['timestamp'] = _aggregator_parse_date(_aggregator_parser_choose("{$news->pubDate}", "{$dc->date}"));
+    $item['categories'] = array();
+    // Iterating a SimpleXMLElement always yields element objects.
+    foreach ($news->category as $cat) {
+      $item['categories'][] = trim(strip_tags("$cat"));
+    }
+    $item['categories'] = array_unique($item['categories']);
+    $item['namespaces'] = aggregator_parser_extract_namespaces($news, $namespaces);
+    $item['enclosures'] = aggregator_parser_extract_enclosures($news);
+    $feed['items'][] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Parses Atom 1.0 feeds.
+ *
+ * @param $data
+ *   The feed document as a SimpleXMLElement.
+ * @return
+ *   Array with 'title', 'description', 'link' and 'items' keys.
+ */
+function aggregator_parser_atom(SimpleXMLElement $data) {
+  $feed = array();
+  $feed['title'] = isset($data->title) ? "{$data->title}" : "";
+  $feed['description'] = isset($data->subtitle) ? "{$data->subtitle}" : "";
+  // Atom carries link targets in the href attribute; use the first <link>.
+  $feed['link'] = isset($data->link['href']) ? "{$data->link['href']}" : '';
+  $feed['items'] = array();
+  $namespaces = $data->getNamespaces(TRUE);
+  foreach ($data->entry as $news) {
+    $item = array('guid' => !empty($news->id) ? "{$news->id}" : NULL);
+    // Prefer the link's href, then its text, then a guid that is a valid URL.
+    $link_guid = valid_url($item['guid']) ? $item['guid'] : '';
+    $item['link'] = _aggregator_parser_choose("{$news->link['href']}", "{$news->link}", $link_guid);
+    $item['title'] = "{$news->title}";
+    $body = '';
+    if (!empty($news->content)) {
+      foreach ($news->content->children() as $child) {
+        $body .= $child->asXML();
+      }
+      $body .= "{$news->content}";
+    }
+    elseif (!empty($news->summary)) {
+      foreach ($news->summary->children() as $child) {
+        $body .= $child->asXML();
+      }
+      $body .= "{$news->summary}";
+    }
+    $item['description'] = $body;
+    // <published> is optional in Atom; fall back to <updated>.
+    $item['timestamp'] = _aggregator_parse_date(_aggregator_parser_choose("{$news->published}", "{$news->updated}"));
+    $item['categories'] = array();
+    foreach ($news->category as $category) {
+      $item['categories'][] = trim(strip_tags("{$category['term']}"));
+    }
+    $item['categories'] = array_unique($item['categories']);
+    $item['namespaces'] = aggregator_parser_extract_namespaces($news, $namespaces);
+    $item['enclosures'] = aggregator_parser_extract_enclosures($news);
+    $feed['items'][] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Parses RDF feeds.
+ *
+ * @param $data
+ *   The feed document as a SimpleXMLElement.
+ * @return
+ *   Array with 'title', 'description', 'link' and 'items' keys.
+ */
+function aggregator_parser_rdf(SimpleXMLElement $data) {
+  $feed = array();
+  $feed['title'] = isset($data->channel->title) ? "{$data->channel->title}" : "";
+  $feed['description'] = isset($data->channel->description) ? "{$data->channel->description}" : "";
+  $feed['link'] = isset($data->channel->link) ? "{$data->channel->link}" : "";
+  $namespaces = $data->getNamespaces(TRUE);
+  // Set category splitter (space is for del.icio.us feed).
+  $category_splitter = ' ';
+  $feed['items'] = array();
+  foreach ($data->item as $news) {
+    // Reset every per-item value so nothing leaks in from the previous item.
+    $id = NULL;
+    $title = $body = $link = $time_in = '';
+    $categories = array();
+    foreach ($namespaces as $ns_link) {
+      // Get about attribute as guid.
+      foreach ($news->attributes($ns_link) as $name => $value) {
+        if ($name == 'about') {
+          $id = "{$value}";
+        }
+      }
+      // Get children for current namespace.
+      $ns = version_compare(phpversion(), '5.1.2', '<') ? (array) $news : (array) $news->children($ns_link);
+
+      // Title
+      if (!empty($ns['title'])) {
+        $title = "{$ns['title']}";
+      }
+
+      // Description or dc:description
+      if (!empty($ns['description']) && $body == '') {
+        $body = "{$ns['description']}";
+      }
+
+      // Link
+      if (!empty($ns['link'])) {
+        $link = "{$ns['link']}";
+      }
+
+      // content:encoded
+      if (!empty($ns['encoded'])) {
+        $body = "{$ns['encoded']}";
+      }
+
+      // pubDate or dc:date; a namespace without a date keeps the one found.
+      if (!empty($ns['pubDate'])) {
+        $time_in = "{$ns['pubDate']}";
+      }
+      elseif (!empty($ns['date'])) {
+        $time_in = "{$ns['date']}";
+      }
+
+      // dc:subject (there can be multiple category tags)
+      if (!empty($ns['subject'])) {
+        if (is_array($ns['subject'])) {
+          foreach ($ns['subject'] as $cat) {
+            if (is_object($cat)) {
+              $categories[] = trim(strip_tags($cat->asXML()));
+            }
+            else {
+              $categories[] = $cat;
+            }
+          }
+        }
+        else { //or single tag
+          $categories = explode($category_splitter, "{$ns['subject']}");
+        }
+      }
+    }
+    $item = array();
+    $item['title'] = $title;
+    $item['description'] = $body;
+    $item['timestamp'] = _aggregator_parse_date($time_in);
+    $item['link'] = $link;
+    $item['guid'] = $id;
+    $item['categories'] = $categories;
+    $item['namespaces'] = aggregator_parser_extract_namespaces($news, $namespaces);
+    $item['enclosures'] = aggregator_parser_extract_enclosures($news);
+    $feed['items'][] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Collects an item's non-empty namespaced children, keyed by non-empty prefix.
+ */
+function aggregator_parser_extract_namespaces(SimpleXMLElement $item, $namespaces) {
+  $extracted = array();
+  foreach ($namespaces as $prefix => $url) {
+    $children = (array) $item->children($url);
+    if (!empty($prefix) && !empty($children)) {
+      $extracted[$prefix] = $children;
+    }
+  }
+  return $extracted;
+}
+
+/**
+ * Extracts all enclosures inside an item, each as an array of its attributes.
+ */
+function aggregator_parser_extract_enclosures(SimpleXMLElement $item) {
+  $result = array();
+  $doc = @simplexml_load_string($item->asXML());
+  // Merge both result lists: "+" would drop <link> matches on index collision.
+  $found = ($doc === FALSE) ? array() : array_merge($doc->xpath("//enclosure"), $doc->xpath("//link[@rel='enclosure']"));
+  foreach ($found as $i => $enc) {
+    $result[$i] = array();
+    foreach ($enc->attributes() as $k => $v) {
+      $result[$i][$k] = "{$v}";
+    }
+  }
+  return $result;
+}
+
+/**
+ * Chooses the first non-empty argument (one character is enough) and returns it.
+ */
+function _aggregator_parser_choose() {
+  $args = func_get_args();
+  foreach ($args as $arg) {
+    if (strlen($arg) > 0) {
+      return $arg;
+    }
+  }
+  return '';
+}
+
+/**
+ * Converts a date string taken from a feed into a timestamp.
+ *
+ * @param $date_str
+ *   The date string, in a format strtotime() or aggregator_parse_w3cdtf() reads.
+ * @return
+ *   The parsed timestamp, or the current time if the string can't be parsed.
+ */
+function _aggregator_parse_date($date_str) {
+  $timestamp = strtotime($date_str);
+  if ($timestamp === FALSE || $timestamp == -1) {
+    $timestamp = aggregator_parse_w3cdtf($date_str);
+  }
+  return ($timestamp !== FALSE) ? $timestamp : time();
+}
