Index: aggregator.module =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator.module,v retrieving revision 1.263 diff -u -w -b -r1.263 aggregator.module --- aggregator.module 2 Dec 2005 15:21:01 -0000 1.263 +++ aggregator.module 9 Dec 2005 01:27:48 -0000 @@ -265,64 +265,114 @@ * Call-back function used by the XML parser. */ function aggregator_element_start($parser, $name, $attributes) { - global $item, $element, $tag, $items, $channel; - + global $feed_info, $current_item, $current_tag, $current_item_categories; switch ($name) { - case 'IMAGE': - case 'TEXTINPUT': - case 'CONTENT': - case 'SUMMARY': - case 'TAGLINE': - case 'SUBTITLE': - case 'LOGO': - case 'INFO': - $element = $name; - break; - case 'ID': - if ($element != 'ITEM') { - $element = $name; - } + // these two are unnecessary, but leave open the possibility of type-specific hooks + case 'RSS': + case 'FEED': + $feed_info['type'] = $name; + break; + // atom...in rss link tags are caught with the default case case 'LINK': if ($attributes['REL'] == 'alternate') { - if ($element == 'ITEM') { - $items[$item]['LINK'] = $attributes['HREF']; + if ($feed_info['state']) { + $current_item['LINK'] = $attributes['HREF']; } else { - $channel['LINK'] = $attributes['HREF']; + $feed_info['LINK'] = $attributes['HREF']; } } break; - case 'ITEM': - $element = $name; - $item += 1; + case 'ENCLOSURE': + if ($feed_info['state']) { + $current_item['ENCLOSURE'] = serialize($attributes); + } break; - case 'ENTRY': - $element = 'ITEM'; - $item += 1; + case 'CATEGORY': + case 'DC:SUBJECT': + $current_item_categories[] = array(); + $cat_count = count($current_item_categories); + $cat_count = $cat_count ? $cat_count - 1 : 0; + $current_item_categories[$cat_count] = array_merge($current_item_categories[$cat_count], $attributes); + break; + case 'ITEM': //rss + case 'ENTRY': // atom + $feed_info['state'] = 'in entry'; + $current_item = array(); + unset($current_item_categories); + break; + // next two are for atom...rss uses description tags, which are caught with the default case + case 'SUMMARY': + $feed_info['state'] = 'in summary'; + break; + case 'CONTENT': + $feed_info['state'] = 'in content'; + break; + case 'IMAGE': //rss + $feed_info['state'] = 'in image'; break; + case 'TEXTINPUT': //rss + $feed_info['state'] = 'in textinput'; + break; + default: + $tag = strtolower($name); + // reconstruct any tags that would be treated as XML when the content type is + // application/xhtml+xml and appends it to the field holding the content + foreach($attributes as $key => $val) $attr[] = strtolower($key) . "=\"$val\""; + if (count($attr)) { + $attrs = ' ' . implode(' ', $attr); + } else { + $attrs = ''; + } + if (($name == 'BR') OR ($name == 'HR') OR ($name == 'IMG')) { + $close_slash = ' /'; + } + if ($feed_info['state'] == 'in content') { + $current_item['CONTENT'] .= "<$tag$attrs$close_slash>"; + } else if ($feed_info['state'] == 'in summary') { + $current_item['SUMMARY'] .= "<$tag$attrs$close_slash>"; } - - $tag = $name; + break; + } + $current_tag = $name; } /** * Call-back function used by the XML parser. */ function aggregator_element_end($parser, $name) { - global $element; - + global $feed_info, $current_item, $current_item_categories; switch ($name) { - case 'IMAGE': - case 'TEXTINPUT': case 'ITEM': case 'ENTRY': + $feed_info['ITEM'][] = $current_item; + unset($GLOBALS['current_item_categories']); + unset($GLOBALS['current_item']); + $feed_info['state'] = ''; + break; + case 'CATEGORY': + case 'DC:SUBJECT': + $current_item['CATEGORIES'] = serialize($current_item_categories); + break; case 'CONTENT': - case 'INFO': - $element = ''; + case 'SUMMARY': + $feed_info['state'] = 'in entry'; break; - case 'ID': - if ($element == 'ID') { - $element = ''; + case 'IMAGE': + case 'TEXTINPUT': + $feed_info['state'] = ''; + break; + case 'IMG': + case 'BR': + case 'HR': + break; + default: + if ($feed_info['state'] == 'in content') { + $current_item['CONTENT'] .= ''; + } else if ($feed_info['state'] == 'in summary') { + $current_item['SUMMARY'] .= ''; + } else if ($current_tag == 'DESCRIPTION') { + $current_item['DESCRIPTION'] .= ''; } } } @@ -331,39 +381,27 @@ * Call-back function used by the XML parser. */ function aggregator_element_data($parser, $data) { - global $channel, $element, $items, $item, $image, $tag; - - switch ($element) { - case 'ITEM': - $items[$item][$tag] .= $data; - break; - case 'IMAGE': - case 'LOGO': - $image[$tag] .= $data; - break; - case 'LINK': - if ($data) { - $items[$item][$tag] .= $data; + global $feed_info, $current_item, $current_tag, $current_item_categories; + if(trim($data)) { + if (($current_tag == 'CATEGORY') OR ($current_tag == 'DC:SUBJECT')) { + $cat_index = count($current_item_categories); + $cat_index = $cat_index ? $cat_index - 1 : 0; + $current_item_categories[$cat_index]['CATEGORYNAME'] .= $data; + } else if ($feed_info['state'] == 'in content') { + $current_item['CONTENT'] .= $data; + } else if ($feed_info['state'] == 'in summary') { + $current_item['SUMMARY'] .= $data; + } else if ($feed_info['state'] == 'in entry') { + $current_item[$current_tag] .= $data; + } else if ($feed_info['state'] == 'in image') { + if ((($current_tag == 'URL') AND ($feed_info['type'] == 'RSS')) OR ($feed_info['type'] == 'FEED')){ + $feed_info['IMAGE'] .= $data; + } + } else if ($feed_info['state'] == 'in textinput') { + // ignore it + } else { + $feed_info[$current_tag] .= $data; } - break; - case 'CONTENT': - $items[$item]['CONTENT'] .= $data; - break; - case 'SUMMARY': - $items[$item]['SUMMARY'] .= $data; - break; - case 'TAGLINE': - case 'SUBTITLE': - $channel['DESCRIPTION'] .= $data; - break; - case 'INFO': - case 'ID': - case 'TEXTINPUT': - // The sub-element is not supported. However, we must recognize - // it or its contents will end up in the item array. - break; - default: - $channel[$tag] .= $data; } } @@ -371,7 +409,7 @@ * Checks a news feed for new items. */ function aggregator_refresh($feed) { - global $channel, $image; + global $feed_info, $image; // Generate conditional GET headers. $headers = array(); @@ -401,48 +439,42 @@ case 307: // Filter the input data: if (aggregator_parse_feed($result->data, $feed)) { - if ($result->headers['Last-Modified']) { $modified = strtotime($result->headers['Last-Modified']); } - - /* - ** Prepare the channel data: - */ - - foreach ($channel as $key => $value) { - $channel[$key] = trim(strip_tags($value)); + // Prepare the channel data: + foreach ($feed_info as $key => $value) { + if (!is_array($feed_info[$key])) $feed_info[$key] = trim(strip_tags($value)); } - - /* - ** Prepare the image data (if any): - */ - - foreach ($image as $key => $value) { - $image[$key] = trim($value); + if ($feed_info['SUBTITLE']) { + $feed_info['DESCRIPTION'] = $feed_info['SUBTITLE']; } - - if ($image['LINK'] && $image['URL'] && $image['TITLE']) { - $image = ''. $image['TITLE'] .''; + else if ($feed_info['TAGLINE']) { + $feed_info['DESCRIPTION'] = $feed_info['TAGLINE']; + } + if ($feed_info['image']) { + $image = ''. $feed_info['TITLE'] .''; } else { $image = NULL; } + if ($feed_info['COPYRIGHT']) { + $copyright = $feed_info['COPYRIGHT']; + } + else if ($feed_info['DC:RIGHTS']) { + $copyright = $feed_info['DC:RIGHTS']; + } - /* - ** Update the feed data: - */ - - db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel['LINK'], $channel['DESCRIPTION'], $image, $result->headers['ETag'], $modified, $feed['fid']); - - /* - ** Clear the cache: - */ - + // Update the feed data: + db_query("UPDATE {aggregator_feed} SET + url = '%s', checked = %d, link = '%s', description = '%s', + copyright = '%s', image = '%s', etag = '%s', modified = %d WHERE fid = %d", + $feed['url'], time(), $feed_info['LINK'], $feed_info['DESCRIPTION'], + $copyright, $image, $result->headers['ETag'], $modified, $feed['fid']); + // Clear the cache: cache_clear_all(); - - watchdog('aggregator', t('There is new syndicated content from %site.', array('%site' => theme('placeholder', $feed['#title'])))); - drupal_set_message(t('There is new syndicated content from %site.', array('%site' => theme('placeholder', $feed['#title'])))); + watchdog('aggregator', t('There is new syndicated content from %site.', array('%site' => theme('placeholder', $feed['title'])))); + drupal_set_message(t('There is new syndicated content from %site.', array('%site' => theme('placeholder', $feed['title'])))); } break; default: @@ -489,14 +521,10 @@ } function aggregator_parse_feed(&$data, $feed) { - global $items, $image, $channel; - - // Unset the global variables before we use them: - unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']); - $items = array(); - $image = array(); - $channel = array(); - + global $feed_info, $current_item, $current_tag, $current_categories; + $feed_info = array(); + $current_item = array(); + $current_tag = ''; // parse the data: $xml_parser = drupal_xml_parser_create($data); xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end'); @@ -515,16 +543,38 @@ ** should be at the top. */ + $items = $feed_info['ITEM']; $items = array_reverse($items); foreach ($items as $item) { - unset($title, $link, $author, $description); - + unset($title, $link, $author); + /** + * Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag, + * Feedburner feeds may have ALL of them + */ + if ($item['CONTENT']) { + $item['DESCRIPTION'] = $item['CONTENT']; + } + else if ($item['SUMMARY']) { + $item['DESCRIPTION'] = $item['SUMMARY']; + } // Prepare the item: foreach ($item as $key => $value) { $value = decode_entities(trim($value)); - $value = strip_tags($value, variable_get('aggregator_allowed_html_tags', '