Make whitespace normalization in SearchApiHtmlFilter Unicode-aware.

trim() only strips ASCII whitespace, so Unicode whitespace produced by
html_entity_decode() (for example a decoded &nbsp;) was left at the edges of
values and tokens. Leading/trailing whitespace is now removed, and runs of
whitespace collapsed, with patterns that use the "u" modifier.

The "u" modifier requires the subject to be valid UTF-8; preg_replace()
returns NULL otherwise. The plain-text branch therefore now decodes entities
with ENT_QUOTES and 'UTF-8', exactly like the two tokenizing branches, instead
of relying on the PHP-version-dependent default flags and charset.

Note: the blob hashes on the "index" line are carried over from the original
revision of this patch.

diff --git a/includes/processor_html_filter.inc b/includes/processor_html_filter.inc
index 0cc4800d..7e824998 100644
--- a/includes/processor_html_filter.inc
+++ b/includes/processor_html_filter.inc
@@ -102,8 +102,11 @@ class SearchApiHtmlFilter extends SearchApiAbstractProcessor {
     }
     else {
-      $value = html_entity_decode(strip_tags($text));
-      // Remove any multiple or leading/trailing spaces we might have introduced.
-      $value = preg_replace('/\s\s+/', ' ', trim($value));
+      $value = html_entity_decode(strip_tags($text), ENT_QUOTES, 'UTF-8');
+      // Remove all leading/trailing Unicode whitespace; trim() only strips
+      // ASCII whitespace and would leave e.g. a decoded &nbsp; behind.
+      $value = preg_replace('/^\s+|\s+$/u', '', $value);
+      // Collapse each run of multiple whitespace characters into one space.
+      $value = preg_replace('/\s{2,}/u', ' ', $value);
     }
   }
 
@@ -112,8 +115,11 @@ class SearchApiHtmlFilter extends SearchApiAbstractProcessor {
     while (($pos = strpos($text, '<')) !== FALSE) {
       if ($boost && $pos > 0) {
         $token = html_entity_decode(substr($text, 0, $pos), ENT_QUOTES, 'UTF-8');
-        // Remove any multiple or leading/trailing spaces we might have introduced.
-        $token = preg_replace('/\s\s+/', ' ', trim($token));
+        // Remove all leading/trailing Unicode whitespace; trim() only strips
+        // ASCII whitespace and would leave e.g. a decoded &nbsp; behind.
+        $token = preg_replace('/^\s+|\s+$/u', '', $token);
+        // Collapse each run of multiple whitespace characters into one space.
+        $token = preg_replace('/\s{2,}/u', ' ', $token);
         $ret[] = array(
           'value' => $token,
           'score' => $boost,
@@ -138,8 +144,11 @@ class SearchApiHtmlFilter extends SearchApiAbstractProcessor {
     }
     if ($text) {
       $token = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
-      // Remove any multiple or leading/trailing spaces we might have introduced.
-      $token = preg_replace('/\s\s+/', ' ', trim($token));
+      // Remove all leading/trailing Unicode whitespace; trim() only strips
+      // ASCII whitespace and would leave e.g. a decoded &nbsp; behind.
+      $token = preg_replace('/^\s+|\s+$/u', '', $token);
+      // Collapse each run of multiple whitespace characters into one space.
+      $token = preg_replace('/\s{2,}/u', ' ', $token);
       $ret[] = array(
         'value' => $token,
         'score' => $boost,