Index: modules/aggregator/aggregator.info
===================================================================
RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.info,v
retrieving revision 1.7
diff -u -p -r1.7 aggregator.info
--- modules/aggregator/aggregator.info	15 May 2008 21:27:32 -0000	1.7
+++ modules/aggregator/aggregator.info	20 Aug 2008 20:29:08 -0000
@@ -8,3 +8,4 @@ core = 7.x
 files[] = aggregator.module
 files[] = aggregator.admin.inc
 files[] = aggregator.pages.inc
+files[] = aggregator.parser.inc
Index: modules/aggregator/aggregator.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.module,v
retrieving revision 1.389
diff -u -p -r1.389 aggregator.module
--- modules/aggregator/aggregator.module	16 Aug 2008 14:48:17 -0000	1.389
+++ modules/aggregator/aggregator.module	20 Aug 2008 20:29:10 -0000
@@ -456,119 +456,6 @@ function aggregator_remove($feed) {
 }
 
 /**
- * Callback function used by the XML parser.
- */
-function aggregator_element_start($parser, $name, $attributes) {
-  global $item, $element, $tag, $items, $channel;
-
-  switch ($name) {
-    case 'IMAGE':
-    case 'TEXTINPUT':
-    case 'CONTENT':
-    case 'SUMMARY':
-    case 'TAGLINE':
-    case 'SUBTITLE':
-    case 'LOGO':
-    case 'INFO':
-      $element = $name;
-      break;
-    case 'ID':
-      if ($element != 'ITEM') {
-        $element = $name;
-      }
-    case 'LINK':
-      if (!empty($attributes['REL']) && $attributes['REL'] == 'alternate') {
-        if ($element == 'ITEM') {
-          $items[$item]['LINK'] = $attributes['HREF'];
-        }
-        else {
-          $channel['LINK'] = $attributes['HREF'];
-        }
-      }
-      break;
-    case 'ITEM':
-      $element = $name;
-      $item += 1;
-      break;
-    case 'ENTRY':
-      $element = 'ITEM';
-      $item += 1;
-      break;
-  }
-
-  $tag = $name;
-}
-
-/**
- * Call-back function used by the XML parser.
- */
-function aggregator_element_end($parser, $name) {
-  global $element;
-
-  switch ($name) {
-    case 'IMAGE':
-    case 'TEXTINPUT':
-    case 'ITEM':
-    case 'ENTRY':
-    case 'CONTENT':
-    case 'INFO':
-      $element = '';
-      break;
-    case 'ID':
-      if ($element == 'ID') {
-        $element = '';
-      }
-  }
-}
-
-/**
- * Callback function used by the XML parser.
- */
-function aggregator_element_data($parser, $data) {
-  global $channel, $element, $items, $item, $image, $tag;
-  $items += array($item => array());
-  switch ($element) {
-    case 'ITEM':
-      $items[$item] += array($tag => '');
-      $items[$item][$tag] .= $data;
-      break;
-    case 'IMAGE':
-    case 'LOGO':
-      $image += array($tag => '');
-      $image[$tag] .= $data;
-      break;
-    case 'LINK':
-      if ($data) {
-        $items[$item] += array($tag => '');
-        $items[$item][$tag] .= $data;
-      }
-      break;
-    case 'CONTENT':
-      $items[$item] += array('CONTENT' => '');
-      $items[$item]['CONTENT'] .= $data;
-      break;
-    case 'SUMMARY':
-      $items[$item] += array('SUMMARY' => '');
-      $items[$item]['SUMMARY'] .= $data;
-      break;
-    case 'TAGLINE':
-    case 'SUBTITLE':
-      $channel += array('DESCRIPTION' => '');
-      $channel['DESCRIPTION'] .= $data;
-      break;
-    case 'INFO':
-    case 'ID':
-    case 'TEXTINPUT':
-      // The sub-element is not supported. However, we must recognize
-      // it or its contents will end up in the item array.
-      break;
-    default:
-      $channel += array($tag => '');
-      $channel[$tag] .= $data;
-  }
-}
-
-/**
  * Checks a news feed for new items.
  *
  * @param $feed
@@ -636,7 +523,7 @@ function aggregator_refresh($feed) {
 
         $etag = empty($result->headers['ETag']) ? '' : $result->headers['ETag'];
         // Update the feed data.
-        db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', hash = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel['LINK'], $channel['DESCRIPTION'], $image, $md5, $etag, $modified, $feed['fid']);
+        db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', hash = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel['link'], $channel['description'], $image, $md5, $etag, $modified, $feed['fid']);
 
         // Clear the cache.
         cache_clear_all();
@@ -707,108 +594,39 @@ function aggregator_parse_w3cdtf($date_s
  *   FALSE on error, TRUE otherwise.
  */
 function aggregator_parse_feed(&$data, $feed) {
-  global $items, $image, $channel;
-
-  // Unset the global variables before we use them.
-  unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
-  $items = array();
-  $image = array();
-  $channel = array();
-
-  // Parse the data.
-  $xml_parser = drupal_xml_parser_create($data);
-  xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
-  xml_set_character_data_handler($xml_parser, 'aggregator_element_data');
-
-  if (!xml_parse($xml_parser, $data, 1)) {
-    watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser)), WATCHDOG_WARNING);
-    drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser))), 'error');
-    return FALSE;
-  }
-  xml_parser_free($xml_parser);
-
-  // We reverse the array such that we store the first item last, and the last
-  // item first. In the database, the newest item should be at the top.
-  $items = array_reverse($items);
-
-  // Initialize variables.
-  $title = $link = $author = $description = $guid = NULL;
-  foreach ($items as $item) {
-    unset($title, $link, $author, $description, $guid);
-
-    // Prepare the item:
-    foreach ($item as $key => $value) {
-      $item[$key] = trim($value);
-    }
-
-    // Resolve the item's title. If no title is found, we use up to 40
-    // characters of the description ending at a word boundary, but not
-    // splitting potential entities.
-    if (!empty($item['TITLE'])) {
-      $title = $item['TITLE'];
-    }
-    elseif (!empty($item['DESCRIPTION'])) {
-      $title = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['DESCRIPTION'], 40));
-    }
-    else {
-      $title = '';
-    }
-
-    // Resolve the items link.
-    if (!empty($item['LINK'])) {
-      $link = $item['LINK'];
-    }
-    else {
-      $link = $feed['link'];
-    }
-    $guid = isset($item['GUID']) ? $item['GUID'] : '';
-
-    // Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag.
-    if (!empty($item['CONTENT:ENCODED'])) {
-      $item['DESCRIPTION'] = $item['CONTENT:ENCODED'];
-    }
-    elseif (!empty($item['SUMMARY'])) {
-      $item['DESCRIPTION'] = $item['SUMMARY'];
-    }
-    elseif (!empty($item['CONTENT'])) {
-      $item['DESCRIPTION'] = $item['CONTENT'];
-    }
-
-    // Try to resolve and parse the item's publication date. If no date is
-    // found, use the current date instead.
-    $date = 'now';
-    foreach (array('PUBDATE', 'DC:DATE', 'DCTERMS:ISSUED', 'DCTERMS:CREATED', 'DCTERMS:MODIFIED', 'ISSUED', 'CREATED', 'MODIFIED', 'PUBLISHED', 'UPDATED') as $key) {
-      if (!empty($item[$key])) {
-        $date = $item[$key];
-        break;
-      }
-    }
-
-    $timestamp = strtotime($date); // As of PHP 5.1.0, strtotime returns FALSE on failure instead of -1.
-
-    if ($timestamp <= 0) {
-      $timestamp = aggregator_parse_w3cdtf($date); // Aggregator_parse_w3cdtf() returns FALSE on failure.
-      if (!$timestamp) {
-        // Better than nothing.
-        $timestamp = time();
-      }
+  global $image, $channel;
+  
+  @$data = simplexml_load_string($data);
+  if (drupal_function_exists('aggregator_parser_format_detect')) {
+    $format = aggregator_parser_format_detect($data);
+    if ($format == FALSE) {
+      watchdog('aggregator', 'The feed from %site seems to be broken.', array('%site' => $feed['title']), WATCHDOG_WARNING);
+      drupal_set_message(t('The feed from %site seems to be broken.', array('%site' => $feed['title'])), 'error');
+      return FALSE;
+    }
+    $feed_handler = 'aggregator_parser_' . $format;
+    if (drupal_function_exists($feed_handler)) {
+      $parser_out = $feed_handler($data);
     }
+  }
 
-    // Save this item. Try to avoid duplicate entries as much as possible. If
-    // we find a duplicate entry, we resolve it and pass along its ID is such
-    // that we can update it if needed.
-    if (!empty($guid)) {
-      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND guid = '%s'", $feed['fid'], $guid));
+  foreach ($parser_out['items'] as $item) {
+    if (!empty($item['guid'])) {
+      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND guid = '%s'", $feed['fid'], $item['guid']));
     }
-    else if ($link && $link != $feed['link'] && $link != $feed['url']) {
-      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $link));
+    else if ($item['link'] && $item['link'] != $feed['link'] && $item['link'] != $feed['url']) {
+      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $item['link']));
     }
     else {
-      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $title));
+      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $item['title']));
     }
-    $item += array('AUTHOR' => '', 'DESCRIPTION' => '');
-    aggregator_save_item(array('iid' => (isset($entry->iid) ? $entry->iid:  ''), 'fid' => $feed['fid'], 'timestamp' => $timestamp, 'title' => $title, 'link' => $link, 'author' => $item['AUTHOR'], 'description' => $item['DESCRIPTION'], 'guid' => $guid));
+    $item += array('author' => '', 'description' => '');
+    aggregator_save_item(array('iid' => (isset($entry->iid) ? $entry->iid:  ''), 'fid' => $feed['fid'], 'timestamp' => $item['timestamp'], 'title' => $item['title'], 'link' => $item['link'], 'author' => $item['author'], 'description' => $item['description'], 'guid' => $item['guid']));
   }
+  
+  unset($parser_out['items']);
+  $channel = $parser_out;
+  $image = array();
 
   // Remove all items that are older than flush item timer.
   $age = time() - variable_get('aggregator_clear', 9676800);
Index: modules/aggregator/aggregator.parser.inc
===================================================================
RCS file: modules/aggregator/aggregator.parser.inc
diff -N modules/aggregator/aggregator.parser.inc
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ modules/aggregator/aggregator.parser.inc	20 Aug 2008 20:29:10 -0000
@@ -0,0 +1,271 @@
+<?php
+// $Id$
+
+/**
+ * @file
+ *   Various helper functions for feed parsing
+ */
+
+/**
+ * Guesses the feed format ("atom", "rdf" or "rss") from the parsed document.
+ *
+ * @param $data
+ *   The parsed feed; anything that is not an object is rejected.
+ * @return
+ *   The format name as a string, or FALSE when no format is recognized.
+ */
+function aggregator_parser_format_detect($data) {
+  if (!is_object($data)) {
+    return FALSE;
+  }
+  $root = strtolower($data->getName());
+  $root_attributes = $data->attributes();
+  $is_atom = isset($data->entry) || $root == "feed";
+  $is_rdf = $root == "rdf" && isset($data->channel);
+  $is_rss = $root == "rss" && in_array($root_attributes["version"], array('0.91', "0.92", "2.0"));
+  return $is_atom ? "atom" : ($is_rdf ? "rdf" : ($is_rss ? "rss" : FALSE));
+}
+
+/**
+ * Parses RSS 2.0, 0.91, 0.92 feeds.
+ *
+ * @param $data
+ *   The parsed feed document; the channel is read from $data->channel.
+ * @return
+ *   An array with 'title', 'description', 'link' and 'image' of the channel
+ *   and 'items': one array per <item> holding 'guid', 'title', 'description',
+ *   'link', 'timestamp', 'categories', 'namespaces' and 'enclosures'.
+ */
+function aggregator_parser_rss(SimpleXMLElement $data) {
+  $feed = array();
+  $dc = $data->channel->children('http://purl.org/dc/elements/1.1/');
+  $feed['title'] = _aggregator_parser_choose("{$data->channel->title}", "{$dc->title}");
+  $feed['description'] = _aggregator_parser_choose("{$data->channel->description}", "{$dc->subject}");
+  $feed['link'] = isset($data->channel->link) ? "{$data->channel->link}" : "";
+  $feed['image'] = isset($data->channel->image->url) ? "{$data->channel->image->url}" : '';
+  $feed['items'] = array();
+  $namespaces = $data->getNamespaces(TRUE);
+  foreach ($data->xpath('//item') as $news) {
+    // Get important namespaces.
+    $content = $news->children('http://purl.org/rss/1.0/modules/content/');
+    $dc = $news->children('http://purl.org/dc/elements/1.1/');
+    $item = array();
+    $item['guid'] = isset($news->guid) ? "{$news->guid}" : NULL;
+    $item['title'] = _aggregator_parser_choose("{$news->title}", "{$dc->title}");
+    $item['description'] = _aggregator_parser_choose("{$news->description}", "{$news->encoded}", "{$content->encoded}", "{$dc->description}");
+    $item['link'] = _aggregator_parser_choose("{$news->link}");
+    // Fall back to <dc:date> when the item has no <pubDate>.
+    $item['timestamp'] = _aggregator_parse_date(_aggregator_parser_choose("{$news->pubDate}", "{$dc->date}"));
+    $item['categories'] = array();
+    // Iterating the children always yields elements, one per <category>.
+    foreach ($news->category as $cat) {
+      $item['categories'][] = trim(strip_tags("$cat"));
+    }
+    $item['categories'] = array_unique($item['categories']);
+    $item['namespaces'] = aggregator_parser_extract_namespaces($news, $namespaces);
+    $item['enclosures'] = aggregator_parser_extract_enclosures($news);
+    $feed['items'][] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Parses Atom 1.0 feeds.
+ *
+ * Links are read from the href attribute of <link> elements, and an item's
+ * date falls back from <published> to <updated>. Returns an array with the
+ * feed's 'title', 'description', 'link' and 'items'.
+ */
+function aggregator_parser_atom(SimpleXMLElement $data) {
+  $feed = array();
+  $feed['title'] = isset($data->title) ? "{$data->title}" : "";
+  $feed['description'] = isset($data->subtitle) ? "{$data->subtitle}" : "";
+  $feed['link'] = _aggregator_parser_choose(_aggregator_parser_atom_link($data), count($data->link) > 0 ? "{$data->link['href']}" : '');
+  $feed['items'] = array();
+  foreach ($data->entry as $news) {
+    $item = array('guid' => !empty($news->id) ? "{$news->id}" : NULL);
+    $item['link'] = _aggregator_parser_choose(_aggregator_parser_atom_link($news), "{$news->link}", valid_url($item['guid']) ? $item['guid'] : '');
+    $item['title'] = "{$news->title}";
+    $text = !empty($news->content) ? $news->content : (!empty($news->summary) ? $news->summary : NULL);
+    $item['description'] = '';
+    if (isset($text)) {
+      foreach ($text->children() as $child) {
+        $item['description'] .= $child->asXML();
+      }
+      $item['description'] .= "{$text}";
+    }
+    $item['timestamp'] = _aggregator_parse_date(_aggregator_parser_choose("{$news->published}", "{$news->updated}"));
+    $item['categories'] = array();
+    foreach ($news->category as $category) {
+      $item['categories'][] = trim(strip_tags("{$category['term']}"));
+    }
+    $item['categories'] = array_unique($item['categories']);
+    $item['namespaces'] = aggregator_parser_extract_namespaces($news, $data->getNamespaces(TRUE));
+    $item['enclosures'] = aggregator_parser_extract_enclosures($news);
+    $feed['items'][] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Returns the href of the first <link> whose rel is absent or "alternate".
+ */
+function _aggregator_parser_atom_link(SimpleXMLElement $element) {
+  foreach ($element->link as $link) {
+    if (in_array("{$link['rel']}", array('', 'alternate'), TRUE)) {
+      return "{$link['href']}";
+    }
+  }
+  return '';
+}
+
+/**
+ * Parses RDF feeds.
+ *
+ * @param $data
+ *   The parsed feed document; items are read from $data->item.
+ * @return
+ *   An array with the feed's 'title', 'description', 'link' and 'items'.
+ */
+function aggregator_parser_rdf(SimpleXMLElement $data) {
+  $feed = array();
+  $feed['title'] = isset($data->channel->title) ? "{$data->channel->title}" : "";
+  $feed['description'] = isset($data->channel->description) ? "{$data->channel->description}" : "";
+  $feed['link'] = isset($data->channel->link) ? "{$data->channel->link}" : "";
+  $namespaces = $data->getNamespaces(TRUE);
+  // Set category splitter (space is for del.icio.us feed).
+  $category_splitter = ' ';
+  $feed['items'] = array();
+  foreach ($data->item as $news) {
+    // Reset every per-item value so nothing leaks in from the previous item.
+    $id = NULL;
+    $title = $body = $link = $time_in = '';
+    $categories = array();
+    foreach ($namespaces as $ns_link) {
+      // Get about attribute as guid.
+      foreach ($news->attributes($ns_link) as $name => $value) {
+        if ($name == 'about') {
+          $id = "{$value}";
+        }
+      }
+
+      // Get children for current namespace.
+      if (version_compare(phpversion(), '5.1.2', '<')) {
+        $ns = (array) $news;
+      }
+      else {
+        $ns = (array) $news->children($ns_link);
+      }
+
+      // Title
+      if (!empty($ns['title'])) {
+        $title = "{$ns['title']}";
+      }
+      // Description or dc:description
+      if (!empty($ns['description']) && $body == '') {
+        $body = "{$ns['description']}";
+      }
+      // Link
+      if (!empty($ns['link'])) {
+        $link = "{$ns['link']}";
+      }
+      // content:encoded
+      if (!empty($ns['encoded'])) {
+        $body = "{$ns['encoded']}";
+      }
+      // pubDate or dc:date. Keep the first date found; namespaces that carry
+      // no date must not overwrite it.
+      if ($time_in == '') {
+        $time_in = (empty($ns['pubDate']) ? (empty($ns['date']) ? '' : "{$ns['date']}")  : "{$ns['pubDate']}");
+      }
+      // dc:subject
+      if (!empty($ns['subject'])) {
+        // there can be multiple category tags
+        if (is_array($ns['subject'])) {
+          foreach ($ns['subject'] as $cat) {
+            if (is_object($cat)) {
+              $categories[] = trim(strip_tags($cat->asXML()));
+            }
+            else {
+              $categories[] = $cat;
+            }
+          }
+        }
+        else { //or single tag
+          $categories = explode($category_splitter, "{$ns['subject']}");
+        }
+      }
+    }
+    $item = array();
+    $item['title'] = $title;
+    $item['description'] = $body;
+    $item['timestamp'] = _aggregator_parse_date($time_in);
+    $item['link'] = $link;
+    $item['guid'] = $id;
+    $item['categories'] = $categories;
+    $item['namespaces'] = aggregator_parser_extract_namespaces($news, $namespaces);
+    $item['enclosures'] = aggregator_parser_extract_enclosures($news);
+    $feed['items'][] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Collects an item's children per namespace prefix; empty ones are left out.
+ */
+function aggregator_parser_extract_namespaces(SimpleXMLElement $item, $namespaces) {
+  $extracted = array();
+  foreach ($namespaces as $prefix => $uri) {
+    $children = (array) $item->children($uri);
+    if (!empty($prefix) && !empty($children)) {
+      $extracted[$prefix] = $children;
+    }
+  }
+  return $extracted;
+}
+
+/**
+ * Extracts the attributes of every enclosure found inside an item.
+ */
+function aggregator_parser_extract_enclosures(SimpleXMLElement $item) {
+  $result = array();
+  @$item = simplexml_load_string($item->asXML());
+  if ($item === FALSE) {
+    return $result;
+  }
+  // array_merge(), not "+": a union drops matches whose numeric keys collide.
+  $found = array_merge((array) $item->xpath("//enclosure"), (array) $item->xpath("//link[@rel='enclosure']"));
+  foreach (array_filter($found, 'is_object') as $enc) {
+    $result[] = array_map('strval', iterator_to_array($enc->attributes()));
+  }
+  return $result;
+}
+
+/**
+ * Returns the first argument that is not an empty string, or '' if none is.
+ * The length is tested so that one-character values such as "0" count too.
+ */
+function _aggregator_parser_choose() {
+  foreach (func_get_args() as $arg) {
+    if (strlen($arg) > 0) {
+      return $arg;
+    }
+  }
+  return '';
+}
+
+/**
+ * Converts a date string taken from a feed into a timestamp.
+ *
+ * @param $date_str
+ *   The date string in various formats.
+ * @return
+ *   The timestamp of the string, or the current time if it cannot be parsed.
+ */
+function _aggregator_parse_date($date_str) {
+  $timestamp = strtotime($date_str);
+  if (in_array($timestamp, array(FALSE, -1), TRUE)) {
+    $timestamp = aggregator_parse_w3cdtf($date_str);
+  }
+  return ($timestamp !== FALSE) ? $timestamp : time();
+}