diff --git a/includes/locale.inc b/includes/locale.inc
index 9714319..f0d2a7c 100644
--- a/includes/locale.inc
+++ b/includes/locale.inc
@@ -93,39 +93,81 @@ function locale_language_from_interface() {
  * otherwise we would cache a user-specific preference.
  *
  * @param $languages
- *   An array of valid language objects.
+ *   An array of language objects for enabled languages ordered by weight.
  *
  * @return
  *   A valid language code on success, FALSE otherwise.
  */
 function locale_language_from_browser($languages) {
-  // Specified by the user via the browser's Accept Language setting
+  if (empty($_SERVER['HTTP_ACCEPT_LANGUAGE'])) {
+    return FALSE;
+  }
+
+  // The Accept-Language header contains information about the language
+  // preferences configured in the user's browser / operating system.
+  // RFC 2616 (section 14.4) defines the Accept-Language header as follows:
+  //   Accept-Language = "Accept-Language" ":"
+  //                  1#( language-range [ ";" "q" "=" qvalue ] )
+  //   language-range  = ( ( 1*8ALPHA *( "-" 1*8ALPHA ) ) | "*" )
   // Samples: "hu, en-us;q=0.66, en;q=0.33", "hu,en-us;q=0.5"
-  $browser_langs = array();
-
-  if (isset($_SERVER['HTTP_ACCEPT_LANGUAGE'])) {
-    $browser_accept = explode(",", $_SERVER['HTTP_ACCEPT_LANGUAGE']);
-    foreach ($browser_accept as $langpart) {
-      // The language part is either a code or a code with a quality.
-      // We cannot do anything with a * code, so it is skipped.
-      // If the quality is missing, it is assumed to be 1 according to the RFC.
-      if (preg_match("!([a-z-]+)(;q=([0-9\\.]+))?!", trim($langpart), $found)) {
-        $browser_langs[$found[1]] = (isset($found[3]) ? (float) $found[3] : 1.0);
-      }
+  $browser_langcodes = array();
+  // A tag must start the header or follow a separator, so the tail of a junk
+  // token is never parsed. Whitespace around ";" and an upper-case "Q" are OK.
+  if (preg_match_all('@(?<=[,\s]|^)([a-zA-Z-]+|\*)(?:\s*;\s*[qQ]=([0-9.]+))?(?:$|\s*,\s*)@', trim($_SERVER['HTTP_ACCEPT_LANGUAGE']), $matches, PREG_SET_ORDER)) {
+    foreach ($matches as $match) {
+      // We can safely use strtolower() here, tags are ASCII.
+      // RFC2616 mandates that the decimal part is no more than three digits,
+      // so we multiply the qvalue by 1000 to avoid floating point comparisons.
+      $langcode = strtolower($match[1]);
+      $qvalue = isset($match[2]) ? (float) $match[2] : 1;
+      $browser_langcodes[$langcode] = (int) ($qvalue * 1000);
     }
   }
 
-  // Order the codes by quality
-  arsort($browser_langs);
+  // Some browsers (especially some versions of Internet Explorer) sometimes
+  // send a specific language tag (fr-CA) without the corresponding generic
+  // tag (fr). In that case, we assume that the highest value of the specific
+  // tags is the value of the generic language.
+  arsort($browser_langcodes);
+  foreach ($browser_langcodes as $langcode => $qvalue) {
+    $generic_tag = strtok($langcode, '-');
+    if (!isset($browser_langcodes[$generic_tag])) {
+      $browser_langcodes[$generic_tag] = $qvalue;
+    }
+  }
 
-  // Try to find the first preferred language we have
-  foreach ($browser_langs as $langcode => $q) {
-    if (isset($languages[$langcode])) {
-      return $langcode;
+  // Find the enabled language with the greatest qvalue, following the rules
+  // of RFC 2616 (section 14.4). If several languages have the same qvalue,
+  // prefer the one with the greatest weight.
+  $best_match_langcode = FALSE;
+  $max_qvalue = 0;
+  foreach ($languages as $langcode => $language) {
+    // Language tags are case insensitive (RFC2616, sec 3.10).
+    $langcode = strtolower($langcode);
+
+    // If nothing matches below, the default qvalue is the one of the wildcard
+    // language, if set, or is 0 (which will never match).
+    $qvalue = isset($browser_langcodes['*']) ? $browser_langcodes['*'] : 0;
+
+    // Find the longest possible prefix of the browser-supplied language
+    // ('the language-range') that matches this site language ('the language tag').
+    $prefix = $langcode;
+    do {
+      if (isset($browser_langcodes[$prefix])) {
+        $qvalue = $browser_langcodes[$prefix];
+        break;
+      }
+    }
+    while ($prefix = substr($prefix, 0, strrpos($prefix, '-')));
+
+    // Find the best match.
+    if ($qvalue > $max_qvalue) {
+      $best_match_langcode = $language->language;
+      $max_qvalue = $qvalue;
     }
   }
 
-  return FALSE;
+  return $best_match_langcode;
 }
 
 /**
diff --git a/modules/locale/locale.test b/modules/locale/locale.test
index 31556cb..bb1c47f 100644
--- a/modules/locale/locale.test
+++ b/modules/locale/locale.test
@@ -1297,6 +1297,127 @@ class LocaleLanguageSwitchingFunctionalTest extends DrupalWebTestCase {
 }
 
 /**
+ * Test browser language detection.
+ */
+class LocaleBrowserDetectionTest extends DrupalUnitTestCase {
+
+  public static function getInfo() {
+    return array(
+      'name' => 'Browser language detection',
+      'description' => 'Tests for the browser language detection.',
+      'group' => 'Locale',
+    );
+  }
+
+  /**
+   * Unit tests for the locale_language_from_browser() function.
+   */
+  function testLanguageFromBrowser() {
+    // Load the required functions.
+    require_once DRUPAL_ROOT . '/includes/locale.inc';
+
+    $languages = array(
+      // In our test case, 'en' has priority over 'en-US'.
+      'en' => (object) array(
+        'language' => 'en',
+      ),
+      'en-US' => (object) array(
+        'language' => 'en-US',
+      ),
+      // But 'fr-CA' has priority over 'fr'.
+      'fr-CA' => (object) array(
+        'language' => 'fr-CA',
+      ),
+      'fr' => (object) array(
+        'language' => 'fr',
+      ),
+      // 'es-MX' is alone.
+      'es-MX' => (object) array(
+        'language' => 'es-MX',
+      ),
+      // 'pt' is alone.
+      'pt' => (object) array(
+        'language' => 'pt',
+      ),
+      // Language codes with more than one dash are actually valid.
+      // eh-oh-laa-laa is the official language code of the Teletubbies.
+      'eh-oh-laa-laa' => (object) array(
+        'language' => 'eh-oh-laa-laa',
+      ),
+    );
+
+    $test_cases = array(
+      // Equal qvalue for each language, choose the site preferred one.
+      'en,en-US,fr-CA,fr,es-MX' => 'en',
+      'en-US,en,fr-CA,fr,es-MX' => 'en',
+      'fr,en' => 'en',
+      'en,fr' => 'en',
+      'en-US,fr' => 'en',
+      'fr,en-US' => 'en',
+      'fr,fr-CA' => 'fr-CA',
+      'fr-CA,fr' => 'fr-CA',
+      'fr' => 'fr-CA',
+      'fr;q=1' => 'fr-CA',
+      'fr,es-MX' => 'fr-CA',
+      'fr,es' => 'fr-CA',
+      'es,fr' => 'fr-CA',
+      'es-MX,de' => 'es-MX',
+      'de,es-MX' => 'es-MX',
+
+      // Different cases and whitespace.
+      'en' => 'en',
+      'En' => 'en',
+      'EN' => 'en',
+      ' en' => 'en',
+      'en ' => 'en',
+      'en-US; q=0.5, fr; q=0.8' => 'fr-CA',
+      'fr ;q=0.2, pt;Q=0.9' => 'pt',
+
+      // A less specific language from the browser matches a more specific one
+      // from the website, and the other way around for compatibility with
+      // some versions of Internet Explorer.
+      'es' => 'es-MX',
+      'es-MX' => 'es-MX',
+      'pt' => 'pt',
+      'pt-PT' => 'pt',
+      'pt-PT;q=0.5,pt-BR;q=1,en;q=0.7' => 'pt',
+      'pt-PT;q=1,pt-BR;q=0.5,en;q=0.7' => 'pt',
+      'pt-PT;q=0.4,pt-BR;q=0.1,en;q=0.7' => 'en',
+      'pt-PT;q=0.1,pt-BR;q=0.4,en;q=0.7' => 'en',
+
+      // Language codes with several dashes are valid. The less specific language
+      // from the browser matches the more specific one from the website.
+      'eh-oh-laa-laa' => 'eh-oh-laa-laa',
+      'eh-oh-laa' => 'eh-oh-laa-laa',
+      'eh-oh' => 'eh-oh-laa-laa',
+      'eh' => 'eh-oh-laa-laa',
+
+      // Different qvalues.
+      'en-US,en;q=0.5,fr;q=0.25' => 'en-US',
+      'fr,en;q=0.5' => 'fr-CA',
+      'fr,en;q=0.5,fr-CA;q=0.25' => 'fr',
+
+      // Silly wildcards are also valid.
+      '*,fr-CA;q=0.5' => 'en',
+      '*,en;q=0.25' => 'fr-CA',
+      'en,en-US;q=0.5,fr;q=0.25' => 'en',
+      'en-US,en;q=0.5,fr;q=0.25' => 'en-US',
+
+      // Unresolvable cases.
+      '' => FALSE,
+      'de,pl' => FALSE,
+      $this->randomName(10) => FALSE,
+    );
+
+    foreach ($test_cases as $accept_language => $expected_result) {
+      $_SERVER['HTTP_ACCEPT_LANGUAGE'] = $accept_language;
+      $result = locale_language_from_browser($languages);
+      $this->assertIdentical($result, $expected_result, t("Language selection '@accept-language' selects '@result', result = '@actual'", array('@accept-language' => $accept_language, '@result' => $expected_result !== FALSE ? $expected_result : 'none', '@actual' => $result !== FALSE ? $result : 'none')));
+    }
+  }
+}
+
+/**
  * Functional tests for a user's ability to change their default language.
  */
 class LocaleUserLanguageFunctionalTest extends DrupalWebTestCase {