Index: modules/aggregator/aggregator.admin.inc =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.admin.inc,v retrieving revision 1.21 diff -u -p -r1.21 aggregator.admin.inc --- modules/aggregator/aggregator.admin.inc 8 Nov 2008 07:12:18 -0000 1.21 +++ modules/aggregator/aggregator.admin.inc 15 Nov 2008 16:27:07 -0000 @@ -54,7 +54,11 @@ function aggregator_view() { */ function aggregator_form_feed(&$form_state, $edit = array('refresh' => 900, 'block' => 5, 'title' => '', 'url' => '', 'fid' => NULL)) { $period = drupal_map_assoc(array(900, 1800, 3600, 7200, 10800, 21600, 32400, 43200, 64800, 86400, 172800, 259200, 604800, 1209600, 2419200), 'format_interval'); - + + if (is_object($edit)) { + $edit = (array)$edit; + } + if ($edit['refresh'] == '') { $edit['refresh'] = 3600; } @@ -200,7 +204,7 @@ function aggregator_admin_remove_feed($f '#value' => $feed, ), ), - t('Are you sure you want to remove all items from the feed %feed?', array('%feed' => $feed['title'])), + t('Are you sure you want to remove all items from the feed %feed?', array('%feed' => $feed->title)), 'admin/content/aggregator', t('This action cannot be undone.'), t('Remove items'), @@ -389,32 +393,51 @@ function aggregator_admin_refresh_feed($ * @see system_settings_form() */ function aggregator_admin_settings() { - $items = array(0 => t('none')) + drupal_map_assoc(array(3, 5, 10, 15, 20, 25), '_aggregator_items'); - $period = drupal_map_assoc(array(3600, 10800, 21600, 32400, 43200, 86400, 172800, 259200, 604800, 1209600, 2419200, 4838400, 9676800), 'format_interval'); - - $form['aggregator_allowed_html_tags'] = array( - '#type' => 'textfield', '#title' => t('Allowed HTML tags'), '#size' => 80, '#maxlength' => 255, - '#default_value' => variable_get('aggregator_allowed_html_tags', '
<a> <b> <br> <dd> <dl> <dt> <em> <i> <li> <ol> <p> <strong> <u> <ul>'), - '#description' => t('A space-separated list of HTML tags allowed in the content of feed items. (Tags in this list are not removed by Drupal.)'), - ); + + // Get all available parsers. + $parsers = module_implements('aggregator_parse'); + foreach ($parsers as $k => $module) { + if ($info = module_invoke($module, 'aggregator_parse_info')) { + $label = $info['title'] . ' <span class="description">' . $info['description'] . '</span>'; + } + else { + $label = $module; + } + unset($parsers[$k]); + $parsers[$module] = $label; + } + + // Get all available processors. + $processors = module_implements('aggregator_process'); + foreach ($processors as $k => $module) { + if ($info = module_invoke($module, 'aggregator_process_info')) { + $label = $info['title'] . ' <span class="description">' . $info['description'] . '</span>'; + } + else { + $label = $module; + } + unset($processors[$k]); + $processors[$module] = $label; + } - $form['aggregator_summary_items'] = array( - '#type' => 'select', '#title' => t('Items shown in sources and categories pages') , - '#default_value' => variable_get('aggregator_summary_items', 3), '#options' => $items, - '#description' => t('Number of feed items displayed in feed and category summary pages.'), + $form['aggregator_parser'] = array( + '#type' => 'radios', + '#title' => t('Parser'), + '#description' => t('Parsers retrieve and parse feed data. Choose one suitable for the type of feeds you would like to aggregate.'), + '#options' => $parsers, + '#default_value' => variable_get('aggregator_parser', ''), ); - $form['aggregator_clear'] = array( - '#type' => 'select', '#title' => t('Discard items older than'), - '#default_value' => variable_get('aggregator_clear', 9676800), '#options' => $period, - '#description' => t('The length of time to retain feed items before discarding. 
(Requires a correctly configured <a href="@cron">cron maintenance task</a>.)', array('@cron' => url('admin/reports/status'))), + $form['aggregator_processors'] = array( + '#type' => 'checkboxes', + '#title' => t('Processors'), + '#description' => t('Processors act on parsed feed data, for example they store feed items. Pick the processors suitable for your task.'), + '#options' => $processors, + '#default_value' => variable_get('aggregator_processors', array()), ); - $form['aggregator_category_selector'] = array( - '#type' => 'radios', '#title' => t('Category selection type'), '#default_value' => variable_get('aggregator_category_selector', 'checkboxes'), - '#options' => array('checkboxes' => t('checkboxes'), 'select' => t('multiple selector')), - '#description' => t('The type of category selection widget displayed on categorization pages. (For a small number of categories, checkboxes are easier to use, while a multiple selector works well with large numbers of categories.)'), - ); + // Implementing modules will expect an array at $form['modules']. + $form['modules'] = array(); return system_settings_form($form); } @@ -507,3 +530,42 @@ function aggregator_form_category_submit drupal_set_message(t('The category %category has been added.', array('%category' => $form_state['values']['title']))); } } + +/** + * Implementation of hook_form_alter(). 
+ */ +function aggregator_form_alter(&$form, $form_state, $form_id) { + if ($form_id == 'aggregator_admin_settings') { + if (in_array('aggregator', aggregator_get_enabled_processors())) { + $info = module_invoke('aggregator', 'aggregator_process', 'info'); + $items = array(0 => t('none')) + drupal_map_assoc(array(3, 5, 10, 15, 20, 25), '_aggregator_items'); + $period = drupal_map_assoc(array(3600, 10800, 21600, 32400, 43200, 86400, 172800, 259200, 604800, 1209600, 2419200, 4838400, 9676800), 'format_interval'); + + $form['modules']['aggregator'] = array( + '#type' => 'fieldset', + '#title' => t('Default processor settings'), + '#description' => $info['description'], + '#collapsible' => TRUE, + '#collapsed' => !in_array('aggregator', aggregator_get_enabled_processors()), + ); + + $form['modules']['aggregator']['aggregator_summary_items'] = array( + '#type' => 'select', '#title' => t('Items shown in sources and categories pages') , + '#default_value' => variable_get('aggregator_summary_items', 3), '#options' => $items, + '#description' => t('Number of feed items displayed in feed and category summary pages.'), + ); + + $form['modules']['aggregator']['aggregator_clear'] = array( + '#type' => 'select', '#title' => t('Discard items older than'), + '#default_value' => variable_get('aggregator_clear', 9676800), '#options' => $period, + '#description' => t('The length of time to retain feed items before discarding. (Requires a correctly configured <a href="@cron">cron maintenance task</a>.)', array('@cron' => url('admin/reports/status'))), + ); + + $form['modules']['aggregator']['aggregator_category_selector'] = array( + '#type' => 'radios', '#title' => t('Category selection type'), '#default_value' => variable_get('aggregator_category_selector', 'checkboxes'), + '#options' => array('checkboxes' => t('checkboxes'), 'select' => t('multiple selector')), + '#description' => t('The type of category selection widget displayed on categorization pages. 
(For a small number of categories, checkboxes are easier to use, while a multiple selector works well with large numbers of categories.)'), + ); + } + } +} \ No newline at end of file Index: modules/aggregator/aggregator.info =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.info,v retrieving revision 1.9 diff -u -p -r1.9 aggregator.info --- modules/aggregator/aggregator.info 11 Oct 2008 02:32:33 -0000 1.9 +++ modules/aggregator/aggregator.info 15 Nov 2008 16:27:07 -0000 @@ -8,4 +8,6 @@ core = 7.x files[] = aggregator.module files[] = aggregator.admin.inc files[] = aggregator.pages.inc +files[] = aggregator.parser.inc +files[] = aggregator.processor.inc files[] = aggregator.install Index: modules/aggregator/aggregator.install =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.install,v retrieving revision 1.19 diff -u -p -r1.19 aggregator.install --- modules/aggregator/aggregator.install 15 Nov 2008 13:01:04 -0000 1.19 +++ modules/aggregator/aggregator.install 15 Nov 2008 16:27:08 -0000 @@ -7,6 +7,9 @@ function aggregator_install() { // Create tables. drupal_install_schema('aggregator'); + // Enable default parser and processors. 
+ variable_set('aggregator_parser', 'aggregator'); + variable_set('aggregator_processors', array('aggregator')); } /** @@ -20,6 +23,8 @@ function aggregator_uninstall() { variable_del('aggregator_summary_items'); variable_del('aggregator_clear'); variable_del('aggregator_category_selector'); + variable_del('aggregator_parser'); + variable_del('aggregator_processors'); } /** Index: modules/aggregator/aggregator.module =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.module,v retrieving revision 1.400 diff -u -p -r1.400 aggregator.module --- modules/aggregator/aggregator.module 15 Nov 2008 08:23:06 -0000 1.400 +++ modules/aggregator/aggregator.module 15 Nov 2008 16:27:08 -0000 @@ -20,6 +20,10 @@ function aggregator_help($path, $arg) { $output = '

<p>' . t('Thousands of sites (particularly news sites and blogs) publish their latest headlines and posts in feeds, using a number of standardized XML-based formats. Formats supported by the aggregator include <a href="@rss">RSS</a>, <a href="@rdf">RDF</a>, and <a href="@atom">Atom</a>.', array('@rss' => 'http://cyber.law.harvard.edu/rss/', '@rdf' => 'http://www.w3.org/RDF/', '@atom' => 'http://www.atomenabled.org')) . '</p>'; $output .= '<p>' . t('Current feeds are listed below, and <a href="@addfeed">new feeds may be added</a>. For each feed or feed category, the <em>latest items</em> block may be enabled at the <a href="@block">blocks administration page</a>.', array('@addfeed' => url('admin/content/aggregator/add/feed'), '@block' => url('admin/build/block'))) . '</p>'; return $output; + case 'admin/content/aggregator/aggregator/parser': + return ; + case 'admin/content/aggregator/aggregator/processor': + return ; case 'admin/content/aggregator/add/feed': return '<p>' . t('Add a feed in RSS, RDF or Atom format. A feed may only have one entry.') . '</p>

        '; case 'admin/content/aggregator/add/category': @@ -289,7 +293,7 @@ function aggregator_perm() { function aggregator_cron() { $result = db_query('SELECT * FROM {aggregator_feed} WHERE checked + refresh < :time', array(':time' => REQUEST_TIME)); foreach ($result as $feed) { - aggregator_refresh((array)$feed); + aggregator_refresh($feed); } } @@ -326,8 +330,8 @@ function aggregator_block($op = 'list', elseif ($op == 'save') { list($type, $id) = explode('-', $delta); if ($type == 'category') { - db_merge('aggregator_category') - ->key(array('cid' => $id)) + db_update('aggregator_category') + ->condition('cid', $id) ->fields(array('block' => $edit['block'])) ->execute(); } @@ -406,6 +410,7 @@ function aggregator_save_category($edit) ->fields(array( 'title' => $edit['title'], 'description' => $edit['description'], + 'block' => 5, )) ->execute(); $op = 'insert'; @@ -495,7 +500,7 @@ function aggregator_save_feed($edit) { * An associative array describing the feed to be cleared. */ function aggregator_remove($feed) { - $iids = db_query('SELECT iid FROM {aggregator_item} WHERE fid = :fid', array(':fid' => $feed['fid']))->fetchCol(); + $iids = db_query('SELECT iid FROM {aggregator_item} WHERE fid = :fid', array(':fid' => $feed->fid))->fetchCol(); if ($iids) { db_delete('aggregator_category_item') ->condition('iid', $iids, 'IN') @@ -503,132 +508,19 @@ function aggregator_remove($feed) { } db_delete('aggregator_item') - ->condition('fid', $feed['fid']) + ->condition('fid', $feed->fid) ->execute(); db_merge('aggregator_feed') - ->key(array('fid' => $feed['fid'])) + ->key(array('fid' => $feed->fid)) ->fields(array( 'checked' => 0, 'hash' => '', 'modified' => 0, - 'description' => $feed['description'], - 'image' => $feed['image'], + 'description' => $feed->description, + 'image' => $feed->image, )) ->execute(); - drupal_set_message(t('The news items from %site have been removed.', array('%site' => $feed['title']))); -} - -/** - * Callback function used by the XML parser. 
- */ -function aggregator_element_start($parser, $name, $attributes) { - global $item, $element, $tag, $items, $channel; - - switch ($name) { - case 'IMAGE': - case 'TEXTINPUT': - case 'CONTENT': - case 'SUMMARY': - case 'TAGLINE': - case 'SUBTITLE': - case 'LOGO': - case 'INFO': - $element = $name; - break; - case 'ID': - if ($element != 'ITEM') { - $element = $name; - } - case 'LINK': - if (!empty($attributes['REL']) && $attributes['REL'] == 'alternate') { - if ($element == 'ITEM') { - $items[$item]['LINK'] = $attributes['HREF']; - } - else { - $channel['LINK'] = $attributes['HREF']; - } - } - break; - case 'ITEM': - $element = $name; - $item += 1; - break; - case 'ENTRY': - $element = 'ITEM'; - $item += 1; - break; - } - - $tag = $name; -} - -/** - * Call-back function used by the XML parser. - */ -function aggregator_element_end($parser, $name) { - global $element; - - switch ($name) { - case 'IMAGE': - case 'TEXTINPUT': - case 'ITEM': - case 'ENTRY': - case 'CONTENT': - case 'INFO': - $element = ''; - break; - case 'ID': - if ($element == 'ID') { - $element = ''; - } - } -} - -/** - * Callback function used by the XML parser. 
- */ -function aggregator_element_data($parser, $data) { - global $channel, $element, $items, $item, $image, $tag; - $items += array($item => array()); - switch ($element) { - case 'ITEM': - $items[$item] += array($tag => ''); - $items[$item][$tag] .= $data; - break; - case 'IMAGE': - case 'LOGO': - $image += array($tag => ''); - $image[$tag] .= $data; - break; - case 'LINK': - if ($data) { - $items[$item] += array($tag => ''); - $items[$item][$tag] .= $data; - } - break; - case 'CONTENT': - $items[$item] += array('CONTENT' => ''); - $items[$item]['CONTENT'] .= $data; - break; - case 'SUMMARY': - $items[$item] += array('SUMMARY' => ''); - $items[$item]['SUMMARY'] .= $data; - break; - case 'TAGLINE': - case 'SUBTITLE': - $channel += array('DESCRIPTION' => ''); - $channel['DESCRIPTION'] .= $data; - break; - case 'INFO': - case 'ID': - case 'TEXTINPUT': - // The sub-element is not supported. However, we must recognize - // it or its contents will end up in the item array. - break; - default: - $channel += array($tag => ''); - $channel[$tag] .= $data; - } + drupal_set_message(t('The news items from %site have been removed.', array('%site' => $feed->title))); } /** @@ -638,311 +530,19 @@ function aggregator_element_data($parser * An associative array describing the feed to be refreshed. */ function aggregator_refresh($feed) { - global $channel, $image; - - // Generate conditional GET headers. - $headers = array(); - if ($feed['etag']) { - $headers['If-None-Match'] = $feed['etag']; - } - if ($feed['modified']) { - $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed['modified']) . ' GMT'; - } - - // Request feed. - $result = drupal_http_request($feed['url'], $headers); - - // Process HTTP response code. 
- switch ($result->code) { - case 304: - db_update('aggregator_feed') - ->fields(array('checked' => REQUEST_TIME)) - ->condition('fid', $feed['fid']) - ->execute(); - drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title']))); - break; - case 301: - $feed['url'] = $result->redirect_url; - // Do not break here. - case 200: - case 302: - case 307: - // We store the md5 hash of feed data in the database. When refreshing a - // feed we compare stored hash and new hash calculated from downloaded - // data. If both are equal we say that feed is not updated. - $md5 = md5($result->data); - if ($feed['hash'] == $md5) { - db_update('aggregator_feed') - ->condition('fid', $feed['fid']) - ->fields(array('checked' => REQUEST_TIME)) - ->execute(); - drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title']))); - break; - } - - // Filter the input data. - if (aggregator_parse_feed($result->data, $feed)) { - $modified = empty($result->headers['Last-Modified']) ? 0 : strtotime($result->headers['Last-Modified']); - - // Prepare the channel data. - foreach ($channel as $key => $value) { - $channel[$key] = trim($value); - } - - // Prepare the image data (if any). - foreach ($image as $key => $value) { - $image[$key] = trim($value); - } - - if (!empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE'])) { - $image = l(theme('image', $image['URL'], $image['TITLE']), $image['LINK'], array('html' => TRUE)); - } - else { - $image = ''; - } - - $etag = empty($result->headers['ETag']) ? '' : $result->headers['ETag']; - // Update the feed data. - db_merge('aggregator_feed') - ->key(array('fid' => $feed['fid'])) - ->fields(array( - 'url' => $feed['url'], - 'checked' => REQUEST_TIME, - 'link' => $channel['LINK'], - 'description' => $channel['DESCRIPTION'], - 'image' => $image, - 'hash' => $md5, - 'etag' => $etag, - 'modified' => $modified, - )) - ->execute(); - - // Clear the cache. 
- cache_clear_all(); - - if (isset($result->redirect_url)) { - watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url'])); - } - - watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed['title'])); - drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title']))); - } - break; - default: - watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error), WATCHDOG_WARNING); - drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error))); - module_invoke('system', 'check_http_request'); - } -} - -/** - * Parse the W3C date/time format, a subset of ISO 8601. - * - * PHP date parsing functions do not handle this format. - * See http://www.w3.org/TR/NOTE-datetime for more information. - * Originally from MagpieRSS (http://magpierss.sourceforge.net/). - * - * @param $date_str - * A string with a potentially W3C DTF date. - * @return - * A timestamp if parsed successfully or FALSE if not. - */ -function aggregator_parse_w3cdtf($date_str) { - if (preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', $date_str, $match)) { - list($year, $month, $day, $hours, $minutes, $seconds) = array($match[1], $match[2], $match[3], $match[4], $match[5], $match[6]); - // Calculate the epoch for current date assuming GMT. - $epoch = gmmktime($hours, $minutes, $seconds, $month, $day, $year); - if ($match[10] != 'Z') { // Z is zulu time, aka GMT - list($tz_mod, $tz_hour, $tz_min) = array($match[8], $match[9], $match[10]); - // Zero out the variables. 
- if (!$tz_hour) { - $tz_hour = 0; - } - if (!$tz_min) { - $tz_min = 0; - } - $offset_secs = (($tz_hour * 60) + $tz_min) * 60; - // Is timezone ahead of GMT? If yes, subtract offset. - if ($tz_mod == '+') { - $offset_secs *= -1; - } - $epoch += $offset_secs; - } - return $epoch; - } - else { - return FALSE; - } -} - -/** - * Parse a feed and store its items. - * - * @param $data - * The feed data. - * @param $feed - * An associative array describing the feed to be parsed. - * @return - * FALSE on error, TRUE otherwise. - */ -function aggregator_parse_feed(&$data, $feed) { - global $items, $image, $channel; - - // Unset the global variables before we use them. - unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']); - $items = array(); - $image = array(); - $channel = array(); - - // Parse the data. - $xml_parser = drupal_xml_parser_create($data); - xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end'); - xml_set_character_data_handler($xml_parser, 'aggregator_element_data'); - - if (!xml_parse($xml_parser, $data, 1)) { - watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser)), WATCHDOG_WARNING); - drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser))), 'error'); - return FALSE; - } - xml_parser_free($xml_parser); - - // We reverse the array such that we store the first item last, and the last - // item first. In the database, the newest item should be at the top. - $items = array_reverse($items); - - // Initialize variables. 
- $title = $link = $author = $description = $guid = NULL; - foreach ($items as $item) { - unset($title, $link, $author, $description, $guid); - - // Prepare the item: - foreach ($item as $key => $value) { - $item[$key] = trim($value); - } - - // Resolve the item's title. If no title is found, we use up to 40 - // characters of the description ending at a word boundary, but not - // splitting potential entities. - if (!empty($item['TITLE'])) { - $title = $item['TITLE']; - } - elseif (!empty($item['DESCRIPTION'])) { - $title = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['DESCRIPTION'], 40)); - } - else { - $title = ''; - } - - // Resolve the items link. - if (!empty($item['LINK'])) { - $link = $item['LINK']; - } - else { - $link = $feed['link']; - } - $guid = isset($item['GUID']) ? $item['GUID'] : ''; - - // Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag. - if (!empty($item['CONTENT:ENCODED'])) { - $item['DESCRIPTION'] = $item['CONTENT:ENCODED']; - } - elseif (!empty($item['SUMMARY'])) { - $item['DESCRIPTION'] = $item['SUMMARY']; - } - elseif (!empty($item['CONTENT'])) { - $item['DESCRIPTION'] = $item['CONTENT']; - } - - // Try to resolve and parse the item's publication date. - $date = ''; - foreach (array('PUBDATE', 'DC:DATE', 'DCTERMS:ISSUED', 'DCTERMS:CREATED', 'DCTERMS:MODIFIED', 'ISSUED', 'CREATED', 'MODIFIED', 'PUBLISHED', 'UPDATED') as $key) { - if (!empty($item[$key])) { - $date = $item[$key]; - break; - } - } - - $timestamp = strtotime($date); - - if ($timestamp === FALSE) { - $timestamp = aggregator_parse_w3cdtf($date); // Aggregator_parse_w3cdtf() returns FALSE on failure. - } - - // Save this item. Try to avoid duplicate entries as much as possible. If - // we find a duplicate entry, we resolve it and pass along its ID is such - // that we can update it if needed. 
- if (!empty($guid)) { - $entry = db_query("SELECT iid, timestamp FROM {aggregator_item} WHERE fid = :fid AND guid = :guid", array(':fid' => $feed['fid'], ':guid' => $guid))->fetchObject(); - } - elseif ($link && $link != $feed['link'] && $link != $feed['url']) { - $entry = db_query("SELECT iid, timestamp FROM {aggregator_item} WHERE fid = :fid AND link = :link", array(':fid' => $feed['fid'], ':link' => $link))->fetchObject(); - } - else { - $entry = db_query("SELECT iid, timestamp FROM {aggregator_item} WHERE fid = :fid AND title = :title", array(':fid' => $feed['fid'], ':title' => $title))->fetchObject(); - } - - if (!$timestamp) { - $timestamp = isset($entry->timestamp) ? $entry->timestamp : REQUEST_TIME; - } - $item += array('AUTHOR' => '', 'DESCRIPTION' => ''); - aggregator_save_item(array('iid' => (isset($entry->iid) ? $entry->iid : ''), 'fid' => $feed['fid'], 'timestamp' => $timestamp, 'title' => $title, 'link' => $link, 'author' => $item['AUTHOR'], 'description' => $item['DESCRIPTION'], 'guid' => $guid)); - } - - // Remove all items that are older than flush item timer. - $age = REQUEST_TIME - variable_get('aggregator_clear', 9676800); - $iids = db_query('SELECT iid FROM {aggregator_item} WHERE fid = :fid AND timestamp < :timestamp', array(':fid' => $feed['fid'], ':timestamp' => $age))->fetchCol(); - if ($iids) { - db_delete('aggregator_category_item') - ->condition('iid', $iids, 'IN') - ->execute(); - db_delete('aggregator_item') - ->condition('iid', $iids, 'IN') - ->execute(); - } - - return TRUE; -} - -/** - * Add/edit/delete an aggregator item. - * - * @param $edit - * An associative array describing the item to be added/edited/deleted. 
- */ -function aggregator_save_item($edit) { - if ($edit['title'] && empty($edit['iid'])) { - $edit['iid'] = db_insert('aggregator_item') - ->fields(array( - 'title' => $edit['title'], - 'link' => $edit['link'], - 'author' => $edit['author'], - 'description' => $edit['description'], - 'guid' => $edit['guid'], - 'timestamp' => $edit['timestamp'], - 'fid' => $edit['fid'], - )) - ->execute(); - } - if ($edit['iid'] && !$edit['title']) { - db_delete('aggregator_item') - ->condition('iid', $edit['iid']) - ->execute(); - db_delete('aggregator_category_item') - ->condition('iid', $edit['iid']) - ->execute(); - } - elseif ($edit['title'] && $edit['link']) { - // file the items in the categories indicated by the feed - $result = db_query('SELECT cid FROM {aggregator_category_feed} WHERE fid = :fid', array(':fid' => $edit['fid'])); - foreach ($result as $category) { - db_merge('aggregator_category_item') - ->fields(array( - 'cid' => $category->cid, - 'iid' => $edit['iid'], - )) - ->execute(); + // Parse the feed. + $parser = aggregator_get_enabled_parser(); + module_invoke($parser, 'aggregator_parse', $feed); + + // If there are items on the feed, let all enabled processors do their work on it. + if (@count($feed->items)) { + $processors = aggregator_get_enabled_processors(); + foreach ($processors as $processor) { + module_invoke($processor, 'aggregator_process', $feed); } } + // Expire old feed items. + aggregator_expire($feed); } /** @@ -956,7 +556,7 @@ function aggregator_save_item($edit) { function aggregator_feed_load($fid) { static $feeds; if (!isset($feeds[$fid])) { - $feeds[$fid] = db_query('SELECT * FROM {aggregator_feed} WHERE fid = :fid', array(':fid' => $fid))->fetchAssoc(); + $feeds[$fid] = db_query('SELECT * FROM {aggregator_feed} WHERE fid = :fid', array(':fid' => $fid))->fetchObject(); } return $feeds[$fid]; @@ -980,6 +580,54 @@ function aggregator_category_load($cid) } /** + * Expire feed items on $feed that are older than aggregator_clear. 
+ * + * @param $feed + * Array describing feed. + */ +function aggregator_expire($feed) { + // Remove all items that are older than flush item timer. + $age = REQUEST_TIME - variable_get('aggregator_clear', 9676800); + $iids = db_query('SELECT iid FROM {aggregator_item} WHERE fid = :fid AND timestamp < :timestamp', array(':fid' => $feed->fid, ':timestamp' => $age))->fetchCol(); + if ($iids) { + db_delete('aggregator_category_item') + ->condition('iid', $iids, 'IN') + ->execute(); + db_delete('aggregator_item') + ->condition('iid', $iids, 'IN') + ->execute(); + } +} + +/** + * Returns the enabled parser. + * + * @return + * A string that is the name of the module that implements the currently + * enabled parser. + */ +function aggregator_get_enabled_parser() { + return variable_get('aggregator_parser', ''); +} + +/** + * Returns enabled processors. + * + * @return + * An array of strings that are the names of the modules that implement the + * currently enabled processors. + */ +function aggregator_get_enabled_processors() { + $processors = variable_get('aggregator_processors', array()); + foreach ($processors as $k => $v) { + if ($v === 0) { + unset($processors[$k]); + } + } + return $processors; +} + +/** * Format an individual feed item for display in the block. * * @param $item @@ -993,9 +641,8 @@ function aggregator_category_load($cid) function theme_aggregator_block_item($item, $feed = 0) { // Display the external link to the item. - $output .= '' . check_plain($item->title) . "\n"; - - return $output; + return '' . check_plain($item->title) . 
"\n"; + } /** Index: modules/aggregator/aggregator.pages.inc =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.pages.inc,v retrieving revision 1.22 diff -u -p -r1.22 aggregator.pages.inc --- modules/aggregator/aggregator.pages.inc 20 Oct 2008 12:57:35 -0000 1.22 +++ modules/aggregator/aggregator.pages.inc 15 Nov 2008 16:27:08 -0000 @@ -35,8 +35,7 @@ function aggregator_page_last() { function aggregator_page_source($arg1, $arg2 = NULL) { // If there are two arguments then this function is the categorize form, and // $arg1 is $form_state and $arg2 is $feed. Otherwise, $arg1 is $feed. - $feed = is_array($arg2) ? $arg2 : $arg1; - $feed = (object)$feed; + $feed = is_object($arg2) ? $arg2 : $arg1; drupal_set_title($feed->title); $feed_source = theme('aggregator_feed_source', $feed); Index: modules/aggregator/aggregator.parser.inc =================================================================== RCS file: modules/aggregator/aggregator.parser.inc diff -N modules/aggregator/aggregator.parser.inc --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ modules/aggregator/aggregator.parser.inc 15 Nov 2008 16:27:08 -0000 @@ -0,0 +1,372 @@ + t('Default Parser'), + 'description' => t('Default parser for RSS, Atom and RDF feeds.'), + ); +} + +/** + * Implementation of hook_aggregator_parse(). + */ +function aggregator_aggregator_parse($feed) { + global $channel, $image; + + // Generate conditional GET headers. + $headers = array(); + if ($feed->etag) { + $headers['If-None-Match'] = $feed->etag; + } + if ($feed->modified) { + $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed->modified) . ' GMT'; + } + + // Request feed. + $result = drupal_http_request($feed->url, $headers); + + // Process HTTP response code. 
+ switch ($result->code) { + case 304: + db_update('aggregator_feed') + ->fields(array('checked' => REQUEST_TIME)) + ->condition('fid', $feed->fid) + ->execute(); + drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed->title))); + break; + case 301: + $feed->url = $result->redirect_url; + // Do not break here. + case 200: + case 302: + case 307: + // We store the md5 hash of feed data in the database. When refreshing a + // feed we compare stored hash and new hash calculated from downloaded + // data. If both are equal we say that feed is not updated. + $md5 = md5($result->data); + if ($feed->hash == $md5) { + db_update('aggregator_feed') + ->condition('fid', $feed->fid) + ->fields(array('checked' => REQUEST_TIME)) + ->execute(); + drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed->title))); + break; + } + + // Filter the input data. + if (aggregator_parse_feed($result->data, $feed)) { + $modified = empty($result->headers['Last-Modified']) ? 0 : strtotime($result->headers['Last-Modified']); + + // Prepare the channel data. + foreach ($channel as $key => $value) { + $channel[$key] = trim($value); + } + + // Prepare the image data (if any). + foreach ($image as $key => $value) { + $image[$key] = trim($value); + } + + if (!empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE'])) { + $image = l(theme('image', $image['URL'], $image['TITLE']), $image['LINK'], array('html' => TRUE)); + } + else { + $image = ''; + } + + $etag = empty($result->headers['ETag']) ? '' : $result->headers['ETag']; + // Update the feed data. + db_merge('aggregator_feed') + ->key(array('fid' => $feed->fid)) + ->fields(array( + 'url' => $feed->url, + 'checked' => REQUEST_TIME, + 'link' => !empty($channel['LINK']) ? $channel['LINK'] : '', + 'description' => !empty($channel['DESCRIPTION']) ? 
$channel['DESCRIPTION'] : '',
+            'image' => $image,
+            'hash' => $md5,
+            'etag' => $etag,
+            'modified' => $modified,
+          ))
+          ->execute();
+
+        // Clear the cache.
+        cache_clear_all();
+
+        if (isset($result->redirect_url)) {
+          watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed->title, '%url' => $feed->url));
+        }
+
+        watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed->title));
+        drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed->title)));
+      }
+      break;
+    default:
+      watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed->title, '%error' => $result->code . ' ' . $result->error), WATCHDOG_WARNING);
+      drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed->title, '%error' => $result->code . ' ' . $result->error)));
+      module_invoke('system', 'check_http_request');
+  }
+}
+
+/**
+ * Parse a feed and store its items.
+ *
+ * Runs the XML parser over $data with the aggregator_element_*() callbacks,
+ * which collect their results in the globals $items, $image and $channel.
+ * Each collected item is then normalized and appended to $feed->items.
+ *
+ * @param $data
+ *   The feed data.
+ * @param $feed
+ *   An object describing the feed to be parsed. Its title is used in error
+ *   messages and its link as the fallback link for items without one. The
+ *   parsed items are stored on it as $feed->items.
+ * @return
+ *   FALSE on error, TRUE otherwise.
+ */
+function aggregator_parse_feed(&$data, $feed) {
+  global $items, $image, $channel;
+
+  // Unset the global variables before we use them.
+  unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
+  $items = array();
+  $image = array();
+  $channel = array();
+
+  // Parse the data.
+  $xml_parser = drupal_xml_parser_create($data);
+  xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
+  xml_set_character_data_handler($xml_parser, 'aggregator_element_data');
+
+  if (!xml_parse($xml_parser, $data, 1)) {
+    // Read the error details first, then release the parser so that it is
+    // not leaked on the failure path.
+    $error_args = array(
+      '%site' => $feed->title,
+      '%error' => xml_error_string(xml_get_error_code($xml_parser)),
+      '%line' => xml_get_current_line_number($xml_parser),
+    );
+    xml_parser_free($xml_parser);
+    watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', $error_args, WATCHDOG_WARNING);
+    drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', $error_args), 'error');
+    return FALSE;
+  }
+  xml_parser_free($xml_parser);
+
+  // We reverse the array such that we store the first item last, and the last
+  // item first. In the database, the newest item should be at the top.
+  $items = array_reverse($items);
+
+  // Initialize items array.
+  $feed->items = array();
+  foreach ($items as $item) {
+
+    // Prepare the item: strip surrounding whitespace from every value.
+    $item = array_map('trim', $item);
+
+    // Resolve the item's title. If no title is found, we use up to 40
+    // characters of the description ending at a word boundary, but not
+    // splitting potential entities.
+    if (empty($item['TITLE'])) {
+      if (!empty($item['DESCRIPTION'])) {
+        $item['TITLE'] = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['DESCRIPTION'], 40));
+      }
+      else {
+        $item['TITLE'] = '';
+      }
+    }
+
+    // Resolve the item's link; fall back to the link of the feed itself.
+    if (empty($item['LINK'])) {
+      $item['LINK'] = $feed->link;
+    }
+    $item['GUID'] = isset($item['GUID']) ? $item['GUID'] : '';
+
+    // Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag.
+    if (!empty($item['CONTENT:ENCODED'])) {
+      $item['DESCRIPTION'] = $item['CONTENT:ENCODED'];
+    }
+    elseif (!empty($item['SUMMARY'])) {
+      $item['DESCRIPTION'] = $item['SUMMARY'];
+    }
+    elseif (!empty($item['CONTENT'])) {
+      $item['DESCRIPTION'] = $item['CONTENT'];
+    }
+
+    // Try to resolve and parse the item's publication date. The first
+    // non-empty candidate field wins.
+    $date = '';
+    foreach (array('PUBDATE', 'DC:DATE', 'DCTERMS:ISSUED', 'DCTERMS:CREATED', 'DCTERMS:MODIFIED', 'ISSUED', 'CREATED', 'MODIFIED', 'PUBLISHED', 'UPDATED') as $key) {
+      if (!empty($item[$key])) {
+        $date = $item[$key];
+        break;
+      }
+    }
+
+    $item['TIMESTAMP'] = strtotime($date);
+
+    if ($item['TIMESTAMP'] === FALSE) {
+      // aggregator_parse_w3cdtf() returns FALSE on failure.
+      $item['TIMESTAMP'] = aggregator_parse_w3cdtf($date);
+    }
+
+    // Make sure the optional fields always exist.
+    $item += array('AUTHOR' => '', 'DESCRIPTION' => '');
+
+    // Store on $feed object. This is where processors will look for parsed items.
+    $feed->items[] = $item;
+  }
+
+  return TRUE;
+}
+
+/**
+ * Callback function used by the XML parser.
+ *
+ * Called for every opening tag. Records in the global $element which
+ * container the parser is currently inside, advances the global item counter
+ * $item on ITEM/ENTRY, stores "alternate" links, and remembers the tag name
+ * in the global $tag for aggregator_element_data().
+ */
+function aggregator_element_start($parser, $name, $attributes) {
+  global $item, $element, $tag, $items, $channel;
+
+  switch ($name) {
+    case 'IMAGE':
+    case 'TEXTINPUT':
+    case 'CONTENT':
+    case 'SUMMARY':
+    case 'TAGLINE':
+    case 'SUBTITLE':
+    case 'LOGO':
+    case 'INFO':
+      $element = $name;
+      break;
+    case 'ID':
+      if ($element != 'ITEM') {
+        $element = $name;
+      }
+      // No break here: ID continues into the LINK handling below.
+      // NOTE(review): confirm this fall-through is intended.
+    case 'LINK':
+      if (!empty($attributes['REL']) && $attributes['REL'] == 'alternate') {
+        if ($element == 'ITEM') {
+          $items[$item]['LINK'] = $attributes['HREF'];
+        }
+        else {
+          $channel['LINK'] = $attributes['HREF'];
+        }
+      }
+      break;
+    case 'ITEM':
+      $element = $name;
+      $item += 1;
+      break;
+    case 'ENTRY':
+      // ENTRY is handled as an ITEM from here on.
+      $element = 'ITEM';
+      $item += 1;
+      break;
+  }
+
+  $tag = $name;
+}
+
+/**
+ * Callback function used by the XML parser.
+ */
+function aggregator_element_end($parser, $name) {
+  global $element;
+
+  // Closing any of these tags always leaves the current container.
+  $containers = array('IMAGE', 'TEXTINPUT', 'ITEM', 'ENTRY', 'CONTENT', 'INFO');
+
+  if (in_array($name, $containers)) {
+    $element = '';
+  }
+  elseif ($name == 'ID' && $element == 'ID') {
+    // A closing ID tag only resets the state when ID is the active container.
+    $element = '';
+  }
+}
+
+/**
+ * Callback function used by the XML parser.
+ *
+ * Receives character data and appends it to the global structure selected by
+ * the active container ($element): the current entry of $items, $image, or
+ * $channel. Data is appended rather than assigned, so text delivered in
+ * several chunks accumulates under the same key.
+ */
+function aggregator_element_data($parser, $data) {
+  global $channel, $element, $items, $item, $image, $tag;
+
+  // Make sure there is an entry for the current item index.
+  $items += array($item => array());
+
+  if ($element == 'ITEM') {
+    $items[$item] += array($tag => '');
+    $items[$item][$tag] .= $data;
+  }
+  elseif ($element == 'IMAGE' || $element == 'LOGO') {
+    $image += array($tag => '');
+    $image[$tag] .= $data;
+  }
+  elseif ($element == 'LINK') {
+    if ($data) {
+      $items[$item] += array($tag => '');
+      $items[$item][$tag] .= $data;
+    }
+  }
+  elseif ($element == 'CONTENT') {
+    $items[$item] += array('CONTENT' => '');
+    $items[$item]['CONTENT'] .= $data;
+  }
+  elseif ($element == 'SUMMARY') {
+    $items[$item] += array('SUMMARY' => '');
+    $items[$item]['SUMMARY'] .= $data;
+  }
+  elseif ($element == 'TAGLINE' || $element == 'SUBTITLE') {
+    $channel += array('DESCRIPTION' => '');
+    $channel['DESCRIPTION'] .= $data;
+  }
+  elseif ($element == 'INFO' || $element == 'ID' || $element == 'TEXTINPUT') {
+    // The sub-element is not supported. However, we must recognize
+    // it or its contents will end up in the item array.
+  }
+  else {
+    // Anything else is stored on the channel under the current tag name.
+    $channel += array($tag => '');
+    $channel[$tag] .= $data;
+  }
+}
+
+/**
+ * Parse the W3C date/time format, a subset of ISO 8601.
+ *
+ * PHP date parsing functions do not handle this format.
+ * See http://www.w3.org/TR/NOTE-datetime for more information.
+ * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
+ *
+ * @param $date_str
+ *   A string with a potentially W3C DTF date.
+ * @return
+ *   A timestamp if parsed successfully or FALSE if not.
+ */
+function aggregator_parse_w3cdtf($date_str) {
+  if (!preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', $date_str, $match)) {
+    return FALSE;
+  }
+
+  // Capture groups of the pattern above:
+  //   1-5: year, month, day, hours, minutes
+  //   6:   ":SS" including the colon, 7: the bare seconds
+  //   8:   offset sign, 9: offset hours, 10: offset minutes
+  //   11:  "Z"
+  // preg_match() leaves out trailing groups that did not take part in the
+  // match, so pad the result to make every index safe to read.
+  $match += array_fill(0, 12, '');
+
+  // Seconds are optional; use group 7, not group 6 which carries the colon.
+  $seconds = ($match[7] !== '') ? (int) $match[7] : 0;
+
+  // Calculate the epoch for current date assuming GMT.
+  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);
+
+  // Z is zulu time, aka GMT, and needs no correction. Without a numeric
+  // offset there is nothing to apply either.
+  if ($match[11] != 'Z' && $match[8] !== '') {
+    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
+    // Is timezone ahead of GMT? If yes, subtract offset.
+    if ($match[8] == '+') {
+      $offset_secs *= -1;
+    }
+    $epoch += $offset_secs;
+  }
+
+  return $epoch;
+}
Index: modules/aggregator/aggregator.processor.inc
===================================================================
RCS file: modules/aggregator/aggregator.processor.inc
diff -N modules/aggregator/aggregator.processor.inc
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ modules/aggregator/aggregator.processor.inc 15 Nov 2008 16:27:08 -0000
@@ -0,0 +1,87 @@
+ t('Default processor'),
+    'description' => t('Creates lightweight records of feed items.'),
+  );
+}
+
+/**
+ * Implementation of hook_aggregator_process().
+ *
+ * Stores every item found in $feed->items through aggregator_save_item().
+ * Nothing is done when $feed is not an object or $feed->items is not an
+ * array.
+ */
+function aggregator_aggregator_process($feed) {
+  if (!is_object($feed) || !is_array($feed->items)) {
+    return;
+  }
+
+  foreach ($feed->items as $item) {
+    // Save this item. Try to avoid duplicate entries as much as possible. If
+    // we find a duplicate entry, we resolve it and pass along its ID so
+    // that we can update it if needed. An existing entry is looked up by
+    // GUID, else by a link that differs from the feed's own link and URL,
+    // else by title.
+    if (!empty($item['GUID'])) {
+      $entry = db_query("SELECT iid, timestamp FROM {aggregator_item} WHERE fid = :fid AND guid = :guid", array(':fid' => $feed->fid, ':guid' => $item['GUID']))->fetchObject();
+    }
+    elseif ($item['LINK'] && $item['LINK'] != $feed->link && $item['LINK'] != $feed->url) {
+      $entry = db_query("SELECT iid, timestamp FROM {aggregator_item} WHERE fid = :fid AND link = :link", array(':fid' => $feed->fid, ':link' => $item['LINK']))->fetchObject();
+    }
+    else {
+      $entry = db_query("SELECT iid, timestamp FROM {aggregator_item} WHERE fid = :fid AND title = :title", array(':fid' => $feed->fid, ':title' => $item['TITLE']))->fetchObject();
+    }
+
+    // Items without a usable date keep the stored timestamp of the existing
+    // entry, or get the time of the current request.
+    if (!$item['TIMESTAMP']) {
+      $item['TIMESTAMP'] = isset($entry->timestamp) ? $entry->timestamp : REQUEST_TIME;
+    }
+
+    aggregator_save_item(array(
+      'iid' => (isset($entry->iid) ? $entry->iid : ''),
+      'fid' => $feed->fid,
+      'timestamp' => $item['TIMESTAMP'],
+      'title' => $item['TITLE'],
+      'link' => $item['LINK'],
+      'author' => $item['AUTHOR'],
+      'description' => $item['DESCRIPTION'],
+      'guid' => $item['GUID'],
+    ));
+  }
+}
+
+/**
+ * Add/edit/delete an aggregator item.
+ *
+ * @param $edit
+ *   An associative array describing the item to be added/edited/deleted.
+ */
+function aggregator_save_item($edit) {
+  $has_title = (bool) $edit['title'];
+
+  // A titled item without an ID is new: insert it and remember its ID.
+  if ($has_title && empty($edit['iid'])) {
+    $record = array(
+      'title' => $edit['title'],
+      'link' => $edit['link'],
+      'author' => $edit['author'],
+      'description' => $edit['description'],
+      'guid' => $edit['guid'],
+      'timestamp' => $edit['timestamp'],
+      'fid' => $edit['fid'],
+    );
+    $edit['iid'] = db_insert('aggregator_item')
+      ->fields($record)
+      ->execute();
+  }
+
+  // An existing item without a title is deleted, together with its
+  // category assignments.
+  if ($edit['iid'] && !$has_title) {
+    foreach (array('aggregator_item', 'aggregator_category_item') as $table) {
+      db_delete($table)
+        ->condition('iid', $edit['iid'])
+        ->execute();
+    }
+    return;
+  }
+
+  // Only items that have both a title and a link are categorized.
+  if (!$has_title || !$edit['link']) {
+    return;
+  }
+
+  // File the item in the categories indicated by the feed.
+  $categories = db_query('SELECT cid FROM {aggregator_category_feed} WHERE fid = :fid', array(':fid' => $edit['fid']));
+  foreach ($categories as $category) {
+    $assignment = array(
+      'cid' => $category->cid,
+      'iid' => $edit['iid'],
+    );
+    db_merge('aggregator_category_item')
+      ->fields($assignment)
+      ->execute();
+  }
+}