#221712 by Damien Tournoud: fix browser language detection.

From: Damien Tournoud

locale_language_from_browser() now parses the whole Accept-Language header
in one pass, honours the "*" wildcard, compares language tags case
insensitively and lets a browser language-range match any more specific
site language (RFC 2616, sections 3.10 and 14.4). As documented, it still
returns FALSE when none of the site languages is acceptable.
---

 locale.inc         |   62 +++++++++++++++++++++--------
 locale/locale.test |  111 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 153 insertions(+), 20 deletions(-)

diff --git includes/locale.inc includes/locale.inc
index ca031c9..71755ff 100644
--- includes/locale.inc
+++ includes/locale.inc
@@ -64,33 +64,59 @@ function locale_language_from_interface() {
  *   A valid language code on success, FALSE otherwise.
  */
 function locale_language_from_browser($languages) {
-  // Specified by the user via the browser's Accept Language setting
+  if (!isset($_SERVER['HTTP_ACCEPT_LANGUAGE'])) {
+    return FALSE;
+  }
+
+  // RFC 2616 (section 14.4) defines the Accept-Language header as follows:
+  //   Accept-Language = "Accept-Language" ":"
+  //                     1#( language-range [ ";" "q" "=" qvalue ] )
+  //   language-range  = ( ( 1*8ALPHA *( "-" 1*8ALPHA ) ) | "*" )
   // Samples: "hu, en-us;q=0.66, en;q=0.33", "hu,en-us;q=0.5"
   $browser_langs = array();
-
-  if (isset($_SERVER['HTTP_ACCEPT_LANGUAGE'])) {
-    $browser_accept = explode(",", $_SERVER['HTTP_ACCEPT_LANGUAGE']);
-    foreach ($browser_accept as $langpart) {
-      // The language part is either a code or a code with a quality.
-      // We cannot do anything with a * code, so it is skipped.
-      // If the quality is missing, it is assumed to be 1 according to the RFC.
-      if (preg_match("!([a-z-]+)(;q=([0-9\\.]+))?!", trim($langpart), $found)) {
-        $browser_langs[$found[1]] = (isset($found[3]) ? (float) $found[3] : 1.0);
-      }
+  if (preg_match_all('@([a-zA-Z-]+|\*)(?:;q=([0-9.]+))?(?:$|\s*,\s*)@', $_SERVER['HTTP_ACCEPT_LANGUAGE'], $matches, PREG_SET_ORDER)) {
+    foreach ($matches as $match) {
+      // We can safely use strtolower() here, tags are ASCII.
+      // RFC2616 mandates that the decimal part is no more than three digits,
+      // so we store round(qvalue * 1000) as an integer to avoid float compares.
+      $langcode = strtolower($match[1]);
+      $qvalue = isset($match[2]) ? (float) $match[2] : 1;
+      $browser_langs[$langcode] = (int) round($qvalue * 1000);
     }
   }
 
-  // Order the codes by quality
-  arsort($browser_langs);
+  // Find the enabled language with the greatest qvalue, following the rules
+  // of RFC 2616 (section 14.4). If several languages have the same qvalue,
+  // the one that comes first in $languages wins (strict comparison below).
+  $best_match = FALSE;
+  $max_qvalue = 0;
+  foreach ($languages as $langcode => $language) {
+    // Language tags are case insensitive (RFC2616, sec 3.10).
+    $langcode = strtolower($langcode);
+
+    // If nothing matches below, the default qvalue is the one of the wildcard
+    // language, if set, or is 0 (which will never match).
+    $qvalue = isset($browser_langs['*']) ? $browser_langs['*'] : 0;
 
-  // Try to find the first preferred language we have
-  foreach ($browser_langs as $langcode => $q) {
-    if (isset($languages[$langcode])) {
-      return $langcode;
+    // Find the longest possible prefix of the browser-supplied language
+    // ('the language-range') that matches this site language ('the language tag').
+    $prefix = $langcode;
+    do {
+      if (isset($browser_langs[$prefix])) {
+        $qvalue = $browser_langs[$prefix];
+        break;
+      }
+    }
+    while ($prefix = substr($prefix, 0, strrpos($prefix, '-')));
+
+    // Find the best match.
+    if ($qvalue > $max_qvalue) {
+      $best_match = $language->language;
+      $max_qvalue = $qvalue;
     }
   }
 
-  return FALSE;
+  return $best_match;
 }
 
 /**
diff --git modules/locale/locale.test modules/locale/locale.test
index 44966c3..f3c30d9 100644
--- modules/locale/locale.test
+++ modules/locale/locale.test
@@ -1104,9 +1104,9 @@ class LocaleUninstallFrenchFunctionalTest extends LocaleUninstallFunctionalTest
 
 
 /**
- * Functional tests for the language switching feature.
+ * Tests for the language switching feature.
  */
-class LanguageSwitchingFunctionalTest extends DrupalWebTestCase {
+class LanguageSwitchingTest extends DrupalWebTestCase {
 
   public static function getInfo() {
     return array(
@@ -1180,6 +1180,113 @@ class LanguageSwitchingFunctionalTest extends DrupalWebTestCase {
     $this->assertIdentical($links, array('active' => array('en'), 'inactive' => array('fr')), t('Only the current language list item is marked as active on the language switcher block.'));
     $this->assertIdentical($anchors, array('active' => array('en'), 'inactive' => array('fr')), t('Only the current language anchor is marked as active on the language switcher block.'));
   }
+
+  /**
+   * Unit tests for the locale_language_from_browser() function.
+   */
+  function testLanguageFromBrowser() {
+    // Load the required functions.
+    require_once DRUPAL_ROOT . '/includes/locale.inc';
+
+    $languages = array(
+      // In our test case, 'en' has priority over 'en-US'.
+      'en' => (object) array(
+        'language' => 'en',
+        'enabled' => 1,
+        'weight' => 10,
+      ),
+      'en-US' => (object) array(
+        'language' => 'en-US',
+        'enabled' => 1,
+        'weight' => 6,
+      ),
+      // But 'fr-CA' has priority over 'fr'.
+      'fr-CA' => (object) array(
+        'language' => 'fr-CA',
+        'enabled' => 1,
+        'weight' => 5,
+      ),
+      'fr' => (object) array(
+        'language' => 'fr',
+        'enabled' => 1,
+        'weight' => 4,
+      ),
+      // 'es-MX' is alone.
+      'es-MX' => (object) array(
+        'language' => 'es-MX',
+        'enabled' => 1,
+        'weight' => 3,
+      ),
+      // 'pt' is alone.
+      'pt' => (object) array(
+        'language' => 'pt',
+        'enabled' => 1,
+        'weight' => 2,
+      ),
+      // Language codes with more than one dash are actually valid.
+      // eh-oh-laa-laa is the official language code of the Teletubbies.
+      'eh-oh-laa-laa' => (object) array(
+        'language' => 'eh-oh-laa-laa',
+        'enabled' => 1,
+        'weight' => 1,
+      ),
+    );
+
+    $test_cases = array(
+      // Equal qvalue for each language, choose the site preferred one.
+      'en,en-US,fr-CA,fr,es-MX' => 'en',
+      'en-US,en,fr-CA,fr,es-MX' => 'en',
+      'fr,en' => 'en',
+      'en,fr' => 'en',
+      'en-US,fr' => 'en-US',
+      'fr,en-US' => 'en-US',
+      'fr,fr-CA' => 'fr-CA',
+      'fr-CA,fr' => 'fr-CA',
+      'fr' => 'fr-CA',
+      'fr;q=1' => 'fr-CA',
+      'fr,es-MX' => 'fr-CA',
+      'fr,es' => 'fr-CA',
+      'es,fr' => 'fr-CA',
+      'es-MX,de' => 'es-MX',
+      'de,es-MX' => 'es-MX',
+
+      // A less specific language from the browser matches a more specific one
+      // from the website, but not the other way around.
+      'es' => 'es-MX',
+      'es-MX' => 'es-MX',
+      'pt' => 'pt',
+      'pt-PT' => FALSE,
+
+      // Language codes with several dashes are valid. The less specific language
+      // from the browser matches the more specific one from the website.
+      'eh-oh-laa-laa' => 'eh-oh-laa-laa',
+      'eh-oh-laa' => 'eh-oh-laa-laa',
+      'eh-oh' => 'eh-oh-laa-laa',
+      'eh' => 'eh-oh-laa-laa',
+
+      // Different qvalues.
+      'en-US,en;q=0.5,fr;q=0.25' => 'en-US',
+      'fr,en;q=0.5' => 'fr-CA',
+      'fr,en;q=0.5,fr-CA;q=0.25' => 'fr',
+
+      // Silly wildcards are also valid.
+      '*,fr-CA;q=0.5' => 'en',
+      '*,en;q=0.25' => 'fr-CA',
+      'en,en-US;q=0.5,fr;q=0.25' => 'en',
+      'en-US,en;q=0.5,fr;q=0.25' => 'en-US',
+
+      // Unresolvable cases: the function reports them with FALSE.
+      '' => FALSE,
+      'de,pl' => FALSE,
+      $this->randomName(10) => FALSE,
+    );
+
+    foreach ($test_cases as $accept_language => $expected_result) {
+      $_SERVER['HTTP_ACCEPT_LANGUAGE'] = $accept_language;
+      $result = locale_language_from_browser($languages);
+      $this->assertIdentical($result, $expected_result, t("Language selection '@accept-language' selects '@result', result = '@actual'", array('@accept-language' => $accept_language, '@result' => $expected_result, '@actual' => $result !== FALSE ? $result : 'none')));
+    }
+  }
 }
 
 /**