Index: aggregator.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/aggregator.module,v
retrieving revision 1.263
diff -u -w -b -r1.263 aggregator.module
--- aggregator.module	2 Dec 2005 15:21:01 -0000	1.263
+++ aggregator.module	9 Dec 2005 01:27:48 -0000
@@ -265,64 +265,114 @@
  * Call-back function used by the XML parser.
  */
 function aggregator_element_start($parser, $name, $attributes) {
-  global $item, $element, $tag, $items, $channel;
-
+  global $feed_info, $current_item, $current_tag, $current_item_categories;
   switch ($name) {
-    case 'IMAGE':
-    case 'TEXTINPUT':
-    case 'CONTENT':
-    case 'SUMMARY':
-    case 'TAGLINE':
-    case 'SUBTITLE':
-    case 'LOGO':
-    case 'INFO':
-      $element = $name;
-      break;
-    case 'ID':
-      if ($element != 'ITEM') {
-        $element = $name;
-      }
+    // record the feed flavour (RSS or Atom FEED); aggregator_element_data() checks it when collecting the image URL
+    case 'RSS':
+    case 'FEED':
+      $feed_info['type'] = $name;
+      break;
+    // Atom links carry the URL in attributes; an RSS <link> has none, so its text is collected by aggregator_element_data()
     case 'LINK':
       if ($attributes['REL'] == 'alternate') {
-        if ($element == 'ITEM') {
-          $items[$item]['LINK'] = $attributes['HREF'];
+        if ($feed_info['state']) {
+          $current_item['LINK'] = $attributes['HREF'];
         }
         else {
-          $channel['LINK'] = $attributes['HREF'];
+        $feed_info['LINK'] = $attributes['HREF'];
         }
       }
       break;
-    case 'ITEM':
-      $element = $name;
-      $item += 1;
+    case 'ENCLOSURE':
+      if ($feed_info['state']) {
+        $current_item['ENCLOSURE'] = serialize($attributes);
+      }
       break;
-    case 'ENTRY':
-      $element = 'ITEM';
-      $item += 1;
+    case 'CATEGORY':
+    case 'DC:SUBJECT':
+      $current_item_categories[] = array();
+      $cat_count = count($current_item_categories);
+      $cat_count = $cat_count ? $cat_count - 1 : 0;
+      $current_item_categories[$cat_count] = array_merge($current_item_categories[$cat_count], $attributes);
+      break;
+    case 'ITEM': //rss
+    case 'ENTRY': // atom
+      $feed_info['state'] = 'in entry';
+      $current_item = array();
+      $current_item_categories = array(); // unset() would only drop this function's alias, not the global
+      break;
+    // next two are for atom...rss uses description tags, which are caught with the default case
+    case 'SUMMARY':
+      $feed_info['state'] = 'in summary';
+      break;
+    case 'CONTENT':
+      $feed_info['state'] = 'in content';
+      break;
+    case 'IMAGE': //rss
+      $feed_info['state'] = 'in image';
       break;
+    case 'TEXTINPUT': //rss
+      $feed_info['state'] = 'in textinput';
+      break;
+    default:
+      $tag = strtolower($name);
+      // Rebuild tags that the parser consumed as XML (content of type
+      // application/xhtml+xml) so they can be appended to the field being collected.
+      $attrs = '';
+      foreach ($attributes as $key => $val) {
+        $attrs .= ' '. strtolower($key) ."=\"$val\"";
+      }
+      // BR, HR and IMG are written self-closed; default to '' so the interpolation below is always defined.
+      $close_slash = '';
+      if (($name == 'BR') OR ($name == 'HR') OR ($name == 'IMG')) {
+        $close_slash = ' /';
+      }
+      if ($feed_info['state'] == 'in content') {
+        $current_item['CONTENT'] .= "<$tag$attrs$close_slash>";
+      } else if ($feed_info['state'] == 'in summary') {
+        $current_item['SUMMARY'] .= "<$tag$attrs$close_slash>";
   }
-
-  $tag = $name;
+      break;
+  }
+  $current_tag = $name;
 }
 
 /**
  * Call-back function used by the XML parser.
  */
 function aggregator_element_end($parser, $name) {
-  global $element;
-
+  global $feed_info, $current_item, $current_item_categories;
   switch ($name) {
-    case 'IMAGE':
-    case 'TEXTINPUT':
     case 'ITEM':
     case 'ENTRY':
+      $feed_info['ITEM'][] = $current_item;
+      unset($GLOBALS['current_item_categories']);
+      unset($GLOBALS['current_item']);
+      $feed_info['state'] = '';
+      break;
+    case 'CATEGORY':
+    case 'DC:SUBJECT':
+      $current_item['CATEGORIES'] = serialize($current_item_categories);
+      break;
     case 'CONTENT':
-    case 'INFO':
-      $element = '';
+    case 'SUMMARY':
+      $feed_info['state'] = 'in entry';
       break;
-    case 'ID':
-      if ($element == 'ID') {
-      $element = '';
+    case 'IMAGE':
+    case 'TEXTINPUT':
+      $feed_info['state'] = '';
+      break;
+    case 'IMG':
+    case 'BR':
+    case 'HR':
+      break;
+    default:
+      if ($feed_info['state'] == 'in content') {
+        $current_item['CONTENT'] .= '</'.strtolower($name).'>';
+      } else if ($feed_info['state'] == 'in summary') {
+        $current_item['SUMMARY'] .= '</'.strtolower($name).'>';
+        // No DESCRIPTION branch here: $current_tag is not declared global in this function, and
+        // every start tag overwrites it, so it could only ever match </description> itself.
   }
 }
 }
@@ -331,39 +381,27 @@
  * Call-back function used by the XML parser.
  */
 function aggregator_element_data($parser, $data) {
-  global $channel, $element, $items, $item, $image, $tag;
-
-  switch ($element) {
-    case 'ITEM':
-      $items[$item][$tag] .= $data;
-      break;
-    case 'IMAGE':
-    case 'LOGO':
-      $image[$tag] .= $data;
-      break;
-    case 'LINK':
-      if ($data) {
-        $items[$item][$tag] .= $data;
+  global $feed_info, $current_item, $current_tag, $current_item_categories;
+  if (trim($data) !== '') { // skip whitespace-only chunks, but keep a literal "0" (falsy in PHP)
+    if (($current_tag == 'CATEGORY') OR ($current_tag == 'DC:SUBJECT')) {
+      $cat_index = count($current_item_categories);
+      $cat_index = $cat_index ? $cat_index - 1 : 0;
+      $current_item_categories[$cat_index]['CATEGORYNAME'] .= $data;
+    } else if ($feed_info['state'] == 'in content') {
+      $current_item['CONTENT'] .= $data;
+    } else if ($feed_info['state'] == 'in summary') {
+      $current_item['SUMMARY'] .= $data;
+    } else if ($feed_info['state'] == 'in entry') {
+      $current_item[$current_tag] .= $data;
+    } else if ($feed_info['state'] == 'in image') {
+      if ((($current_tag == 'URL') AND ($feed_info['type'] == 'RSS')) OR ($feed_info['type'] == 'FEED')){
+        $feed_info['IMAGE'] .= $data;
+      }
+    } else if ($feed_info['state'] == 'in textinput') {
+      // ignore it
+    } else {
+      $feed_info[$current_tag] .= $data;
       }
-      break;
-    case 'CONTENT':
-      $items[$item]['CONTENT'] .= $data;
-      break;
-    case 'SUMMARY':
-      $items[$item]['SUMMARY'] .= $data;
-      break;
-    case 'TAGLINE':
-    case 'SUBTITLE':
-      $channel['DESCRIPTION'] .= $data;
-      break;
-    case 'INFO':
-    case 'ID':
-    case 'TEXTINPUT':
-      // The sub-element is not supported. However, we must recognize
-      // it or its contents will end up in the item array.
-      break;
-    default:
-      $channel[$tag] .= $data;
   }
 }
 
@@ -371,7 +409,7 @@
  * Checks a news feed for new items.
  */
 function aggregator_refresh($feed) {
-  global $channel, $image;
+  global $feed_info, $image;
 
   // Generate conditional GET headers.
   $headers = array();
@@ -401,48 +439,42 @@
     case 307:
       // Filter the input data:
      if (aggregator_parse_feed($result->data, $feed)) {
-
         if ($result->headers['Last-Modified']) {
           $modified = strtotime($result->headers['Last-Modified']);
         }
-
-        /*
-        ** Prepare the channel data:
-        */
-
-        foreach ($channel as $key => $value) {
-          $channel[$key] = trim(strip_tags($value));
+        // Prepare the channel data:
+        foreach ($feed_info as $key => $value) {
+          if (!is_array($feed_info[$key])) $feed_info[$key] = trim(strip_tags($value));
         }
-
-        /*
-        ** Prepare the image data (if any):
-        */
-
-        foreach ($image as $key => $value) {
-          $image[$key] = trim($value);
+        if ($feed_info['SUBTITLE']) {
+          $feed_info['DESCRIPTION'] = $feed_info['SUBTITLE'];
         }
-
-        if ($image['LINK'] && $image['URL'] && $image['TITLE']) {
-          $image = '<a href="'. $image['LINK'] .'"><img src="'. $image['URL'] .'" alt="'. $image['TITLE'] .'" /></a>';
+        else if ($feed_info['TAGLINE']) {
+          $feed_info['DESCRIPTION'] = $feed_info['TAGLINE'];
+        }
+        if ($feed_info['IMAGE']) { // aggregator_element_data() fills the upper-case key; 'image' is never set
+          $image = '<a href="'. $feed_info['LINK'] .'"><img src="'. $feed_info['IMAGE'] .'" alt="'. $feed_info['TITLE'] .'" /></a>';
         }
         else {
           $image = NULL;
         }
+        if ($feed_info['COPYRIGHT']) {
+          $copyright = $feed_info['COPYRIGHT'];
+        }
+        else if ($feed_info['DC:RIGHTS']) {
+          $copyright = $feed_info['DC:RIGHTS'];
+        }
 
-        /*
-        ** Update the feed data:
-        */
-
-        db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel['LINK'], $channel['DESCRIPTION'], $image, $result->headers['ETag'], $modified, $feed['fid']);
-
-        /*
-        ** Clear the cache:
-        */
-
+        // Update the feed data:
+        db_query("UPDATE {aggregator_feed} SET
+          url = '%s', checked = %d, link = '%s', description = '%s',
+          copyright = '%s', image = '%s', etag = '%s', modified = %d WHERE fid = %d",
+          $feed['url'], time(), $feed_info['LINK'], $feed_info['DESCRIPTION'],
+          $copyright, $image, $result->headers['ETag'], $modified, $feed['fid']);
+        // Clear the cache:
         cache_clear_all();
-
-        watchdog('aggregator', t('There is new syndicated content from %site.', array('%site' => theme('placeholder', $feed['#title']))));
-        drupal_set_message(t('There is new syndicated content from %site.', array('%site' => theme('placeholder', $feed['#title']))));
+        watchdog('aggregator', t('There is new syndicated content from %site.', array('%site' => theme('placeholder', $feed['title']))));
+        drupal_set_message(t('There is new syndicated content from %site.', array('%site' => theme('placeholder', $feed['title']))));
       }
       break;
     default:
@@ -489,14 +521,10 @@
 }
 
 function aggregator_parse_feed(&$data, $feed) {
-  global $items, $image, $channel;
-
-  // Unset the global variables before we use them:
-  unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
-  $items = array();
-  $image = array();
-  $channel = array();
-
+  global $feed_info, $current_item, $current_tag, $current_item_categories;
+  $feed_info = array();
+  $current_item = $current_item_categories = array(); // start clean: no leftovers from a previously parsed feed
+  $current_tag = '';
   // parse the data:
   $xml_parser = drupal_xml_parser_create($data);
   xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
@@ -515,16 +543,38 @@
   ** should be at the top.
   */
 
+  $items = $feed_info['ITEM'];
   $items = array_reverse($items);
 
   foreach ($items as $item) {
-    unset($title, $link, $author, $description);
-
+    unset($title, $link, $author);
+    /**
+     * Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag,
+     * Feedburner feeds may have ALL of them
+     */
+      if ($item['CONTENT']) {
+        $item['DESCRIPTION'] = $item['CONTENT'];
+      }
+      else if ($item['SUMMARY']) {
+        $item['DESCRIPTION'] = $item['SUMMARY'];
+      }
     // Prepare the item:
     foreach ($item as $key => $value) {
       $value = decode_entities(trim($value));
-      $value = strip_tags($value, variable_get('aggregator_allowed_html_tags', '<a> <b> <br> <dd> <dl> <dt> <em> <i> <li> <ol> <p> <strong> <u> <ul>'));
+      $value = preg_replace('/\Wstyle\s*=[^>]+?>/i', '>', $value);
+      $value = preg_replace('/\Wclass\s*=[^>]+?>/i', '>', $value);
+      $value = preg_replace('/\Won[a-z]+\s*=[^>]+?>/i', '>', $value);
+      if (stristr($key, 'CONTENT')) { // feed body markup is untrusted: filter it the same way as DESCRIPTION
+        $item['DESCRIPTION'] = $value = check_markup($value, variable_get('aggregator_default_filter', FILTER_FORMAT_DEFAULT));
+      }
+      else if ($key != 'DESCRIPTION') {
       $value = filter_xss($value);
+      }
+      else {
+        $output_format = variable_get('aggregator_default_filter', FILTER_FORMAT_DEFAULT);
+        $value = check_markup($value, $output_format);
+      }
+      // Write the sanitized value back onto the item.
       $item[$key] = $value;
     }
 
@@ -533,7 +583,6 @@
     ** up to 40 characters of the description ending at a word
     ** boundary but not splitting potential entities.
     */
-
     if ($item['TITLE']) {
       $title = $item['TITLE'];
     }
@@ -544,7 +593,6 @@
     /*
     ** Resolve the items link.
     */
-
     if ($item['LINK']) {
       $link = $item['LINK'];
     }
@@ -556,28 +604,22 @@
     }
 
     /**
-     * Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag
+     * Get author information if it's there
      */
-    if ($item['CONTENT']) {
-      $item['DESCRIPTION'] = $item['CONTENT'];
-    }
-    else if ($item['SUMMARY']) {
-      $item['DESCRIPTION'] = $item['SUMMARY'];
-    }
+    if ($item['DC:CREATOR']) $item['AUTHOR'] = $item['DC:CREATOR'];
+    else if($item['NAME']) $item['AUTHOR'] = $item['NAME'];
 
     /*
     ** Try to resolve and parse the item's publication date.  If no
     ** date is found, we use the current date instead.
     */
-
     if ($item['PUBDATE']) $date = $item['PUBDATE'];                        // RSS 2.0
     else if ($item['DC:DATE']) $date = $item['DC:DATE'];                   // Dublin core
     else if ($item['DCTERMS:ISSUED']) $date = $item['DCTERMS:ISSUED'];     // Dublin core
     else if ($item['DCTERMS:CREATED']) $date = $item['DCTERMS:CREATED'];   // Dublin core
     else if ($item['DCTERMS:MODIFIED']) $date = $item['DCTERMS:MODIFIED']; // Dublin core
-    else if ($item['ISSUED']) $date = $item['ISSUED'];                     // Atom XML
-    else if ($item['CREATED']) $date = $item['CREATED'];                   // Atom XML
-    else if ($item['MODIFIED']) $date = $item['MODIFIED'];                 // Atom XML
+    else if ($item['PUBLISHED']) $date = $item['PUBLISHED'];               // Atom
+    else if ($item['CREATED']) $date = $item['CREATED'];                   // Atom
     else $date = 'now';
 
     $timestamp = strtotime($date); // strtotime() returns -1 on failure
@@ -593,15 +635,18 @@
     ** possible.  If we find a duplicate entry, we resolve it and
     ** pass along it's ID such that we can update it if needed.
     */
-
     if ($link && $link != $feed['link'] && $link != $feed['url']) {
-      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $link));
+      $entry = db_fetch_object(db_query(
+      "SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $link));
     }
     else {
-      $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $title));
+      $entry = db_fetch_object(db_query(
+      "SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $title));
     }
 
-    aggregator_save_item(array('iid' => $entry->iid, 'fid' => $feed['fid'], 'timestamp' => $timestamp, 'title' => $title, 'link' => $link, 'author' => $item['AUTHOR'], 'description' => $item['DESCRIPTION']));
+    aggregator_save_item(array('iid' => $entry->iid, 'fid' => $feed['fid'], 'timestamp' => $timestamp,
+    'title' => $title, 'link' => $link, 'author' => $item['AUTHOR'], 'description' => $item['DESCRIPTION'],
+    'categories' => $item['CATEGORIES'], 'enclosure' => $item['ENCLOSURE']));
   }
 
   /*
@@ -616,7 +661,6 @@
     while ($item = db_fetch_object($result)) {
       $items[] = $item->iid;
     }
-    db_query('DELETE FROM {aggregator_category_item} WHERE iid IN ('. implode(', ', $items) .')');
     db_query('DELETE FROM {aggregator_item} WHERE fid = %d AND timestamp < %d', $feed['fid'], $age);
   }
 
@@ -625,7 +669,7 @@
 
 function aggregator_save_item($edit) {
   if ($edit['iid'] && $edit['title']) {
-    db_query('UPDATE {aggregator_item} SET title = \'%s\', link = \'%s\', author = \'%s\', description = \'%s\' WHERE iid = %d', $edit['title'], $edit['link'], $edit['author'], $edit['description'], $edit['iid']);
+    db_query('UPDATE {aggregator_item} SET title = \'%s\', link = \'%s\', author = \'%s\', description = \'%s\', categories = \'%s\', enclosure = \'%s\' WHERE iid = %d', $edit['title'], $edit['link'], $edit['author'], $edit['description'], $edit['categories'], $edit['enclosure'], $edit['iid']);
   }
   else if ($edit['iid']) {
     db_query('DELETE FROM {aggregator_item} WHERE iid = %d', $edit['iid']);
@@ -633,7 +677,10 @@
   }
   else if ($edit['title'] && $edit['link']) {
     $edit['iid'] = db_next_id('{aggregator_item}_iid');
-    db_query('INSERT INTO {aggregator_item} (iid, fid, title, link, author, description, timestamp) VALUES (%d, %d, \'%s\', \'%s\', \'%s\', \'%s\', %d)', $edit['iid'], $edit['fid'], $edit['title'], $edit['link'], $edit['author'], $edit['description'], $edit['timestamp']);
+    db_query('INSERT INTO {aggregator_item} (iid, fid, title, link, author, description, timestamp, categories, enclosure)
+              VALUES (%d, %d, \'%s\', \'%s\', \'%s\', \'%s\', %d, \'%s\', \'%s\')',
+    $edit['iid'], $edit['fid'], $edit['title'], $edit['link'], $edit['author'], $edit['description'], $edit['timestamp']
+    , $edit['categories'], $edit['enclosure']);
     // file the items in the categories indicated by the feed
     $categories = db_query('SELECT cid FROM {aggregator_category_feed} WHERE fid = %d', $edit['fid']);
     while ($category = db_fetch_object($categories)) {
@@ -811,7 +858,6 @@
   return $output;
 }
 
-
 /**
  * Menu callback; displays the category edit form, or saves changes and
  * redirects to the overview page.
@@ -1015,8 +1061,6 @@
   drupal_goto($_GET['q']);
 }
 
-
-
 /**
  * Menu callback; displays all the feeds used by the aggregator.
  */
@@ -1216,7 +1260,6 @@
  */
 function theme_aggregator_page_item($item) {
   static $last;
-
   $date = format_date($item->timestamp, 'custom', 'Ymd');
   if ($date != $last) {
     $last = $date;
@@ -1226,30 +1269,41 @@
   $output .= "<div class=\"news-item\">\n";
   $output .= ' <div class="date">'. format_date($item->timestamp, 'custom', 'H:i') ."</div>\n";
   $output .= " <div class=\"body\">\n";
-
-  $source = '';
+  $output .= '  <div class="title"><a href="'. check_url($item->link) .'">'. check_plain($item->title) ."</a></div>\n";
+  if ($item->description) {
+    $output .= '  <div class="description">'.
+                $item->description
+                ."</div>\n";
+  }
   if ($item->ftitle && $item->fid) {
-    $source = '<span class="source">'. l($item->ftitle, "aggregator/sources/$item->fid") .":</span>\n";
+    $output .= '  <div class="source">'. t('Source') .': '. l($item->ftitle, "aggregator/sources/$item->fid") ."</div>\n";
   }
 
-  $output .= '  <div class="title">'. $source .'<a href="'. check_url($item->link) .'">'. check_plain($item->title) ."</a></div>\n";
-
-  if ($item->description) {
-    $output .= '  <div class="description">'. $item->description ."</div>\n";
+  if ($item->enclosure) {
+    $enclosure = unserialize($item->enclosure);
+    // Enclosure attributes come straight from the feed, so escape them for output.
+    $enclosure_name = check_plain(basename($enclosure['URL']));
+    $output .=  '<div class="enclosure">Download <a href="'
+                . check_url($enclosure['URL'])
+                . '" title="'. $enclosure_name .'">'
+                . $enclosure_name ."</a></div>";
+  }
+
+  if ($item->categories) {
+    $remote_categories = unserialize($item->categories);
+    if (count($remote_categories)) {
+      foreach ($remote_categories as $remote_category) {
+        if ($remote_category['DOMAIN']) { // names and domains are feed-supplied: escape before output
+          $remote_category_list[] = '<a href="'. check_url($remote_category['DOMAIN']) .'">'. check_plain($remote_category['CATEGORYNAME']) .'</a>';
+        } else {
+          $remote_category_list[] = check_plain($remote_category['CATEGORYNAME']);
   }
-
-  $result = db_query('SELECT c.title, c.cid FROM {aggregator_category_item} ci LEFT JOIN {aggregator_category} c ON ci.cid = c.cid WHERE ci.iid = %d ORDER BY c.title', $item->iid);
-  $categories = array();
-  while ($category = db_fetch_object($result)) {
-    $categories[] = l($category->title, 'aggregator/categories/'. $category->cid);
   }
-  if ($categories) {
-    $output .= '  <div class="categories">'. t('Categories') .': '. implode(', ', $categories) ."</div>\n";
+      $output .= '  <div class="remotecategories">'. check_plain($item->ftitle) .' categories:'. theme('links', $remote_category_list) ."</div>\n"; // escape the feed title, as l() does for the source link
+    }
   }
-
   $output .= " </div>\n";
   $output .= "</div>\n";
-
   return $output;
 }