cdn.admin.inc | 29 +++++++++++++++++++ cdn.constants.inc | 8 +++++ cdn.module | 74 +++++++++++++++++++++++++++++++++++++++++++++++ help/admin-other-seo.html | 10 +++++++ help/cdn.help.ini | 11 +++++-- tests/cdn.test | 65 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 194 insertions(+), 3 deletions(-) diff --git a/cdn.admin.inc b/cdn.admin.inc index 0731267..84a1c05 100644 --- a/cdn.admin.inc +++ b/cdn.admin.inc @@ -241,6 +241,35 @@ function cdn_admin_other_settings_form($form, &$form_state) { ); } + $form['cdn_seo'] = array( + '#type' => 'fieldset', + '#title' => t('SEO: duplicate content prevention'), + '#description' => t( + "By default most CDNs will cache full HTML pages if accessed. This means + that a copy of your site may appear in search engines. This is confusing, unprofessional and potentially bad for SEO. 
+ Duplicate content prevention is enabled by default and ensures the CDN will redirect users to your actual ('canonical') site." + ) + ); + $form['cdn_seo'][CDN_SEO_REDIRECT_VARIABLE] = array( + '#type' => 'checkbox', + '#title' => t('Enable duplicate content prevention'), + '#default_value' => variable_get(CDN_SEO_REDIRECT_VARIABLE, CDN_SEO_REDIRECT_DEFAULT), + ); + $form['cdn_seo'][CDN_SEO_USER_AGENTS_VARIABLE] = array( + '#type' => 'textarea', + '#title' => t('CDN user agents'), + '#description' => t( + 'A case-insensitive list of CDN user agents. These will be substring-matched against the detected user agent of a request. One per line.' + ), + '#default_value' => variable_get(CDN_SEO_USER_AGENTS_VARIABLE, CDN_SEO_USER_AGENTS_DEFAULT), + '#states' => array( + 'visible' => array( + ':input[name="' . CDN_SEO_REDIRECT_VARIABLE . '"]' => array('checked' => TRUE), + ) + ), + ); + $form['https'] = array( '#type' => 'fieldset', '#title' => t('HTTPS'), diff --git a/cdn.constants.inc b/cdn.constants.inc index e8d70b4..bd48999 100644 --- a/cdn.constants.inc +++ b/cdn.constants.inc @@ -35,6 +35,14 @@ define('CDN_EXCEPTION_DRUPAL_PATH_BLACKLIST_DEFAULT', ''); define('CDN_EXCEPTION_AUTH_USERS_BLACKLIST_VARIABLE', 'cdn_exception_auth_users_blacklist'); define('CDN_EXCEPTION_AUTH_USERS_BLACKLIST_DEFAULT', 'admin*'); +// Variables for SEO duplicate content prevention. The user agent and forbidden extension lists are newline-separated strings. +define('CDN_SEO_REDIRECT_VARIABLE', 'cdn_seo_redirect'); +define('CDN_SEO_REDIRECT_DEFAULT', TRUE); +define('CDN_SEO_USER_AGENTS_VARIABLE', 'cdn_seo_user_agents'); +define('CDN_SEO_USER_AGENTS_DEFAULT', "Amazon Cloudfront\nAkamai"); +define('CDN_SEO_FORBIDDEN_EXTENSIONS_VARIABLE', "cdn_seo_forbidden_extensions"); +define('CDN_SEO_FORBIDDEN_EXTENSIONS_DEFAULT', "html\nhtm\nphp"); + // Variables for basic mode. 
define('CDN_BASIC_MAPPING_VARIABLE', 'cdn_basic_mapping'); define('CDN_BASIC_MAPPING_HTTPS_VARIABLE', 'cdn_basic_mapping_https'); diff --git a/cdn.module b/cdn.module index 49087aa..b391df2 100644 --- a/cdn.module +++ b/cdn.module @@ -466,6 +466,38 @@ function cdn_html_head_alter(&$head_elements) { } /** + * Implements hook_boot(). + */ +function cdn_boot() { + // Bail if the status is "disabled" (i.e. continue only when "enabled" or "testing"). + if (variable_get(CDN_STATUS_VARIABLE, CDN_DISABLED) === CDN_DISABLED) { + return; + } + + // Inspired by common.inc/_drupal_bootstrap_full(). These files are loaded manually here; presumably they provide current_path(), url() and drupal_strtolower(), which the redirect check below relies on (TODO confirm). + require_once DRUPAL_ROOT . '/includes/common.inc'; + require_once DRUPAL_ROOT . '/' . variable_get('path_inc', 'includes/path.inc'); + require_once DRUPAL_ROOT . '/includes/unicode.inc'; + + // Prevent the CDN from returning content pages. We only want the CDN to + // return static files like images, CSS files, JavaScript files, etc. By + // default it will return anything, including HTML pages generated by Drupal, + // so a CDN user agent requesting such a page is redirected to the canonical URL. + $redirect_url = _cdn_seo_should_redirect(current_path()); + if ($redirect_url !== FALSE) { + // A 301 is SEO friendly, as it tells the search engine what the canonical + // URL is for this content. + header('HTTP/1.0 301 Moved Permanently'); + // @see http://googlewebmastercentral.blogspot.com/2011/06/supporting-relcanonical-http-headers.html + header('Link: <' . $redirect_url . '>; rel="canonical"'); + header('Location: ' . $redirect_url); + + // To ensure this redirect occurs immediately we don't use drupal_exit(). + exit(); + } +} + +/** + * Implements hook_init(). 
*/ function cdn_init() { @@ -850,3 +882,45 @@ function cdn_post_render_html_alter($html, $elements = array()) { function _cdn_ufi_deployment_id($path) { return CDN_DEPLOYMENT_ID; } + +/** + * Determines whether a redirect should be performed for the given path for SEO + * considerations (prevent duplicate HTML content on the CDN), and if so, the + * URL to which the requesting User Agent should be redirected. + * + * @param $path + * The path for which to determine the redirect URL. Only its extension is inspected; the path itself is passed to url() unchanged. + * @return + * FALSE if no redirect should occur (feature disabled, path has a non-forbidden extension, or the User-Agent matches no CDN user agent), or the absolute URL to redirect to. + */ +function _cdn_seo_should_redirect($path) { + if (variable_get(CDN_SEO_REDIRECT_VARIABLE, CDN_SEO_REDIRECT_DEFAULT)) { + // If the path ends in an extension that is not in the list of forbidden + // extensions, then return FALSE to indicate that no redirect should occur. + // The rationale is: menu_get_item() doesn't allow us to detect whether a + // page callback will generate a file; hence we (ab)use this heuristic. + // An added benefit is that we don't need the menu system to be loaded, + // meaning that we can do all of this during hook_boot(), meaning we can use + // this same code for cached pages, which we need to support anyway. + // @todo: improve Drupal core so that contrib modules can know whether + // certain menu callbacks generate files or not. The list is trimmed and lowercased so that CRLF line endings or uppercase entries still match. + $forbidden_extensions = variable_get(CDN_SEO_FORBIDDEN_EXTENSIONS_VARIABLE, CDN_SEO_FORBIDDEN_EXTENSIONS_DEFAULT); + $extension = drupal_strtolower(pathinfo($path, PATHINFO_EXTENSION)); + if (!empty($extension) && !in_array($extension, array_map('trim', explode("\n", drupal_strtolower($forbidden_extensions))), TRUE)) { + return FALSE; + } + + // Use case-insensitive substring matching to match the current User-Agent + // to the list of CDN user agents. Entries are trimmed and blank lines are dropped: an empty needle would otherwise match every request. 
+ if (isset($_SERVER['HTTP_USER_AGENT'])) { + $ua = drupal_strtolower($_SERVER['HTTP_USER_AGENT']); + $cdn_user_agents = array_filter(array_map('trim', explode("\n", drupal_strtolower(variable_get(CDN_SEO_USER_AGENTS_VARIABLE, CDN_SEO_USER_AGENTS_DEFAULT)))), 'strlen'); + foreach ($cdn_user_agents as $cdn_ua) { + if (strpos($ua, $cdn_ua) !== FALSE) { + return url($path, array('absolute' => TRUE)); + } + } + } + } + return FALSE; +} diff --git a/help/admin-other-seo.html b/help/admin-other-seo.html new file mode 100644 index 0000000..132ae7a --- /dev/null +++ b/help/admin-other-seo.html @@ -0,0 +1,10 @@ +

By default most CDNs will cache full HTML pages if accessed. This means that a copy of your site will start to appear and be indexed by search engines (in SEO terminology: "duplicate content"). Duplicate content can cause search engines to penalize your site in search results. A duplicate site is also confusing to the end-user and unprofessional.

+

If you enable CDN module's duplicate content prevention feature, the CDN module will ensure that all requests for (Drupal-served) HTML content to the CDN will redirect your end-users to the Drupal site itself.

+

Duplicate content prevention also works for cached pages.

+ +

Detailed information for the experts

+

Requests from the CDN are detected by the User-Agent request header. The default list of CDN user agents covers the Akamai and Amazon CloudFront CDNs (at the time of writing), and can be modified through CDN's admin UI. Case-insensitive, substring-based matching is used.

+

An edge case here is that of "Drupal-generated files", such as image styles. These are files that should be served from the CDN, but first have to be generated by Drupal. Unfortunately, there is not yet an API in Drupal to detect this programmatically. Hence, the only feasible solution was to use a heuristic: if the URL ends in .htm, .html or .php (some people prefer such URLs over extensionless "clean URLs"; they are easy to configure with the path or pathauto modules), or when the URL does not have a file extension at all, it is assumed to be HTML content, and the duplicate content prevention feature will be applied. Otherwise, it is assumed that the content is a file that should be served from the CDN.
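+Currently, no UI is provided to override this, but you can override it programmatically by setting the variable named by CDN_SEO_FORBIDDEN_EXTENSIONS_VARIABLE to a newline-separated list of extensions; see CDN_SEO_FORBIDDEN_EXTENSIONS_DEFAULT for the default list.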

+

The CDN will receive an HTTP/1.0 301 Moved Permanently response, with a Location header to perform the redirect and a Link: &lt;canonical URL&gt;; rel="canonical" header to indicate to search engines what the location of the canonical representation of the current web page is. The CDN user agent will not (should not) follow the redirect, and should instead simply cache this response and serve this response to any end-user trying to access it.

+

You can verify that it is working as intended by accessing a development version of your site with two different browsers, e.g. Google's Chrome and Mozilla's Firefox. Add e.g. "Firefox" to the list of CDN user agents, and then try to load any page (e.g. the front page) in Firefox: you should get an endless redirect loop and the browser should bail; while in Chrome everything should continue to work just fine.

diff --git a/help/cdn.help.ini b/help/cdn.help.ini index 74c39b8..a5aaf6a 100644 --- a/help/cdn.help.ini +++ b/help/cdn.help.ini @@ -56,17 +56,22 @@ weight = 2 title = "Admin: Other" weight = 2 +[admin-other-seo] +title = "SEO" +parent = admin-other +weight = 1 + [admin-other-https] title = "HTTPS" parent = admin-other -weight = 1 +weight = 2 [admin-other-exceptions] title = "Exceptions" parent = admin-other -weight = 2 +weight = 3 [admin-other-cdn-pick-server] title = "cdn_pick_server()" parent = admin-other -weight = 3 +weight = 4 diff --git a/tests/cdn.test b/tests/cdn.test index 2cc9ef6..6ebd73a 100644 --- a/tests/cdn.test +++ b/tests/cdn.test @@ -14,6 +14,7 @@ class CDNUnitTestCase extends DrupalUnitTestCase { 'HTTP_ACCEPT_ENCODING', 'HTTPS' => 'off', 'HTTP_X_FORWARDED_PROTO' => 'http', + 'HTTP_USER_AGENT' => $this->randomName(), ); $alt_server = array_merge($alt_server, $_SERVER); $_SERVER = $alt_server; @@ -72,6 +73,7 @@ class CDNUnitTestCase extends DrupalUnitTestCase { CDN_BASIC_MAPPING_VARIABLE => '', CDN_BASIC_MAPPING_HTTPS_VARIABLE => '', CDN_BASIC_FARFUTURE_VARIABLE => FALSE, + CDN_SEO_REDIRECT_VARIABLE => TRUE, ); $conf = array_merge($conf, $this->defaultConfig); } @@ -94,6 +96,16 @@ class CDNUnitTestCase extends DrupalUnitTestCase { } /** + * Set the User-Agent of the current "request". + * + * @param $ua + * A User-Agent, which can be almost any string. + */ + function setUserAgent($ua) { + $_SERVER['HTTP_USER_AGENT'] = $ua; + } + + /** * Configure HTTPS-related settings. * * @param $supported @@ -593,3 +605,56 @@ class CDNCssUrlTestCase extends CDNWebTestCase { } } } + +class CDNSEOTestCase extends CDNUnitTestCase { + public static function getInfo() { + return array( + 'name' => 'SEO', + 'description' => 'Verify SEO duplicate content prevention.', + 'group' => 'CDN', + ); + } + + function testSEO() { + // Test with and without SEO duplicate content prevention. 
Each case is tested + // with three sorts of paths: + // - page path (without any extension and with each of the three default + // "forbidden" extensions, meaning that they are treated as pages) + // - file path + // - generated file path + + // SEO duplicate content prevention disabled. + $this->variableSet(CDN_SEO_REDIRECT_VARIABLE, FALSE); + $this->setUserAgent('Amazon CloudFront'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1'), FALSE, 'Disabled SEO duplicate content prevention is respected for page paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.html'), FALSE, 'Disabled SEO duplicate content prevention is respected for page paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.htm'), FALSE, 'Disabled SEO duplicate content prevention is respected for page paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.php'), FALSE, 'Disabled SEO duplicate content prevention is respected for page paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('misc/jquery.js'), FALSE, 'Disabled SEO duplicate content prevention is respected for file paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('sites/default/files/styles/thumbnail/foobar.png'), FALSE, 'Disabled SEO duplicate content prevention is respected for generated file paths.'); + $this->setUserAgent($this->randomName()); + $this->assertIdentical(_cdn_seo_should_redirect('node/1'), FALSE, 'Disabled SEO duplicate content prevention is respected for page paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.html'), FALSE, 'Disabled SEO duplicate content prevention is respected for page paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.htm'), FALSE, 'Disabled SEO duplicate content prevention is respected for page paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.php'), FALSE, 'Disabled SEO duplicate content prevention is respected for page paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('misc/jquery.js'), 
FALSE, 'Disabled SEO duplicate content prevention is respected for file paths.'); + $this->assertIdentical(_cdn_seo_should_redirect('sites/default/files/styles/thumbnail/foobar.png'), FALSE, 'Disabled SEO duplicate content prevention is respected for generated file paths.'); + + // SEO page request duplicate content prevention enabled. + $this->variableSet(CDN_SEO_REDIRECT_VARIABLE, TRUE); + $this->setUserAgent('The Amazon CloudFront User Agent!'); // Note that this is a superstring of the provided CDN user agent "Amazon CloudFront"! + $this->assertIdentical(_cdn_seo_should_redirect('node/1'), url('node/1', array('absolute' => TRUE)), 'Enabled SEO duplicate content prevention works correctly when a CDN UA requests a page path: the response is a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.html'), url('node/1.html', array('absolute' => TRUE)), 'Enabled SEO duplicate content prevention works correctly when a CDN UA requests a page path: the response is a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.htm'), url('node/1.htm', array('absolute' => TRUE)), 'Enabled SEO duplicate content prevention works correctly when a CDN UA requests a page path: the response is a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.php'), url('node/1.php', array('absolute' => TRUE)), 'Enabled SEO duplicate content prevention works correctly when a CDN UA requests a page path: the response is a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('misc/jquery.js'), FALSE, 'Enabled SEO duplicate content prevention works correctly when a CDN UA requests a file path: the response is not a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('sites/default/files/styles/thumbnail/foobar.png'), FALSE, 'Enabled SEO duplicate content prevention works correctly when a CDN UA requests a generated file path: the response is not a redirect.'); + $this->setUserAgent($this->randomName()); + 
$this->assertIdentical(_cdn_seo_should_redirect('node/1'), FALSE, 'Enabled SEO duplicate content prevention works correctly when a non-CDN UA requests a page path: the response is not a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.htm'), FALSE, 'Enabled SEO duplicate content prevention works correctly when a non-CDN UA requests a page path: the response is not a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.html'), FALSE, 'Enabled SEO duplicate content prevention works correctly when a non-CDN UA requests a page path: the response is not a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('node/1.php'), FALSE, 'Enabled SEO duplicate content prevention works correctly when a non-CDN UA requests a page path: the response is not a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('misc/jquery.js'), FALSE, 'Enabled SEO duplicate content prevention works correctly when a non-CDN UA requests a file path: the response is not a redirect.'); + $this->assertIdentical(_cdn_seo_should_redirect('sites/default/files/styles/thumbnail/foobar.png'), FALSE, 'Enabled SEO duplicate content prevention works correctly when a non-CDN UA requests a generated file path: the response is not a redirect.'); + } +}