diff --git a/INSTALL.txt b/INSTALL.txt index 9007ddd..6389864 100644 --- a/INSTALL.txt +++ b/INSTALL.txt @@ -1,16 +1,15 @@ - For general information about this module, check the README.txt file. Installation ------------ 1. Unzip the files, and upload them as a subdirectory of the -sites/all/modules directory of your Drupal installation (or the +modules/ directory of your Drupal installation (or the location you normally use for contributed modules). -2. Go to 'Administer > Modules', and enable the Porter Stemmer module. +2. Go to 'Manage > Extend', and enable the Porter Stemmer module. -3. Go to 'Administer > Configuration > Search and metadata > Search settings' +3. Go to 'Manage > Configuration > Search and metadata > Search pages' and click 'Re-index site'. You should do this step whenever you upgrade to a new version of the Porter Stemmer module, so that the search index is rebuilt with any changes to the stemming algorithm. @@ -32,4 +31,4 @@ file to enable the module: If the PECL "stem" library is not available, the module uses a PHP implementation of the stemming algorithm. The output is identical. More information about the PECL "stem" library: - http://pecl.php.net/package/stem + http://pecl.php.net/package/stem \ No newline at end of file diff --git a/README.txt b/README.txt index 8cdc2f1..a8f1505 100644 --- a/README.txt +++ b/README.txt @@ -34,15 +34,17 @@ TESTING The Porter Stemmer module includes tests for the stemming algorithm and functionality. If you would like to run the tests, enable the core Testing -module, and then navigate to Administer > Configuration / Development / Testing. +module, and then run the tests following instructions on +https://www.drupal.org/docs/8/phpunit/running-phpunit-tests. Commands below: -Each "Stemming output" test for the Porter Stemmer module includes approximately -2000 individual word stemming tests (which test the module against a standard -word list downloaded from the site above). 
Due to the way output is displayed -in SimpleTest, you may run into browser timeout or memory issues if you try to -run all 16 of the "Stemming output" tests during the same test run. +cd core +../vendor/bin/phpunit --group porterstemmer + + +Each test for the Porter Stemmer module includes approximately +5000 individual word stemming tests (which test the module against a standard +word list). Tests are provided both for the internal algorithm and the PECL library. -There are also functional tests and tests for some of the internal steps of the -stemming algorithm. +There is also a functional test for integration with Drupal search. diff --git a/porterstemmer.info.yml b/porterstemmer.info.yml new file mode 100644 index 0000000..3737196 --- /dev/null +++ b/porterstemmer.info.yml @@ -0,0 +1,7 @@ +name: Porter-Stemmer +type: module +description: Improves American English language searching by simplifying related words to their root (conjugations, plurals, ...). +core: 8.x +package: Search +dependencies: + - search diff --git a/porterstemmer.module b/porterstemmer.module index 77e2c73..0a886af 100644 --- a/porterstemmer.module +++ b/porterstemmer.module @@ -2,123 +2,85 @@ /** * @file - * This is an implementation of the Porter 2 Stemming algorithm from - * http://snowball.tartarus.org/algorithms/english/stemmer.html - * by Jennifer Hodgdon of Poplar ProductivityWare, www.poplarware.com + * Contains porterstemmer.module. */ +use Drupal\Core\Routing\RouteMatchInterface; +use Drupal\porterstemmer\Porter2; + /** - * Implements hook_search_preprocess(). + * Regular expression defining a word boundary for Porter Stemmer. * - * Stems the words in $text, using the Porter Stemmer 2 algorithm. + * A word boundary is anything not a letter or an apostrophe. */ -function porterstemmer_search_preprocess($text) { - - // Convert text to lower case, and replace special apostrophes with regular - // apostrophes. 
- $text = drupal_strtolower(str_replace('’', "'", $text)); - - // Split into words - $words = preg_split('/(' . PORTERSTEMMER_BOUNDARY . '+)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE); - - if (!count( $words )) { - return $text; - } - - $has_pecl_stem = _porterstemmer_pecl_loaded(); - - // Process each word, skipping delimiters. - $isword = !preg_match('/' . PORTERSTEMMER_BOUNDARY . '/', $words[0] ); - foreach ($words as $k => $word) { - if ($isword) { - if ( $has_pecl_stem ) { - $words[$k] = stem_english($word); - } - else { - $words[$k] = porterstemmer_stem($word); - } - } - $isword = !$isword; - } - - // Put it all back together (note that delimiters are in $words). - return implode('', $words); -} +define('PORTERSTEMMER_BOUNDARY', "[^a-zA-Z']+"); /** * Implements hook_help(). */ -function porterstemmer_help($path, $arg) { - switch ($path) { - case 'admin/help#porterstemmer': +function porterstemmer_help($route_name, RouteMatchInterface $route_match) { + switch ($route_name) { + // Main module help for the porterstemmer module. + case 'help.page.porterstemmer': $output = ''; $output .= '

' . t('About') . '

'; - $output .= '

' . t('The Porter Stemmer module implements version 2 of the Porter Stemmer algorithm, to improve American English-language searching with the core Search module. Stemming reduces a word to its basic root or stem (e.g. "blogging" to "blog") so that variations on a word ("blogs", "blogged", "blogging", "blog") are considered equivalent when searching. This generally results in more relevant results.', array('@search-help' => url('admin/help/search'), '@algorithm' => 'http://snowball.tartarus.org/algorithms/english/stemmer.html')) . '

'; + $output .= '

' . t('Improves American English language searching by simplifying related words to their root (conjugations, plurals, ...).') . '

'; return $output; + + default: } } /** - * Implements hook_sbp_excerpt_match(). + * Implements hook_search_preprocess(). * - * Allows Porter Stemmer to display better search excerpts with the - * Search by page module. + * Stems the words in $text, using the Porter 2 (English) stemming algorithm. */ -function porterstemmer_sbp_excerpt_match($key, $text, $offset, $boundary) { - // Stem the keyword down to its root form. - $key = porterstemmer_stem($key); +function porterstemmer_search_preprocess($text, $langcode = NULL) { + // If the language is not set, get it from the language manager. + if (!isset($langcode)) { + $langcode = \Drupal::languageManager()->getCurrentLanguage()->getId(); + } - // In many cases, the root word is a substring of the full word, but not - // all. The cases where it is not, the root ends in e, i, or y, and if this - // last letter is removed, the root is a substring of the full word. - // So remove these letters at the end of the root. - $didit = FALSE; - porterstemmer_suffix($key, 'i', '', $didit, NULL, 2) OR - porterstemmer_suffix($key, 'e', '', $didit, NULL, 2) OR - porterstemmer_suffix($key, 'y', '', $didit, NULL, 2); + if ($langcode == 'en') { + // Convert text to lower case, and replace special apostrophes with regular + // apostrophes. + $text = strtolower(str_replace('’', "'", $text)); - // Look for this modified key at the start of a word. - $match = array(); - if (!preg_match('/' . $boundary . '(' . $key . ')/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) { - // Didn't match our modified key. - return FALSE; - } + // Split into words. + $words = preg_split('/(' . PORTERSTEMMER_BOUNDARY . '+)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE); - // If we get here, we have a potential match. Find the end of the word we - // actually matched, so it can be highlighted (making sure it's a real match - // for our key). - $newmatch = array(); - $pos = $match[1][1]; - // Note: Do not use drupal_strlen/drupal_substr here! 
Need the real PHP - // string lengths/pos. - if (preg_match('/' . $boundary . '/iu', $text, $newmatch, - PREG_OFFSET_CAPTURE, $pos + strlen($key))) { - $keyfound = substr($text, $pos, $newmatch[0][1] - $pos); - } - else { - // Assume we're going to the end of the string. - $keyfound = substr($text, $pos); - } + if (!count($words)) { + return $text; + } - $foundstem = porterstemmer_stem($keyfound); - porterstemmer_suffix($foundstem, 'i', '', $didit, NULL, 2) OR - porterstemmer_suffix($foundstem, 'e', '', $didit, NULL, 2) OR - porterstemmer_suffix($foundstem, 'y', '', $didit, NULL, 2); + $has_pecl_stem = _porterstemmer_pecl_loaded(); + + // Process each word, skipping delimiters. + $isword = !preg_match('/' . PORTERSTEMMER_BOUNDARY . '/', $words[0]); + foreach ($words as $k => $word) { + if ($isword) { + if ($has_pecl_stem) { + $words[$k] = stem_english($word); + } + else { + $words[$k] = Porter2::stem($word); + } + } + $isword = !$isword; + } - // Both $foundstem and $key may contain upper case. - if (drupal_strtolower($foundstem) == drupal_strtolower($key)) { - return array('where' => $pos, 'keyword' => $keyfound); + // Put it all back together (note that delimiters are in $words). + return implode('', $words); } - // If we get here, then it was a false match, and we should probably - // search again later in the string. - return porterstemmer_sbp_excerpt_match($key, $text, $pos + strlen($keyfound), $boundary); + return $text; } /** * Checks to see if the PECL stem extension has been loaded. * - * @return + * @return bool * TRUE if the stem_english() function from the PECL stem library can be * used, FALSE if not. 
*/ @@ -126,7 +88,7 @@ function _porterstemmer_pecl_loaded() { static $has_pecl_stem = FALSE; static $already_checked = FALSE; - if ( $already_checked ) { + if ($already_checked) { return $has_pecl_stem; } @@ -134,623 +96,3 @@ function _porterstemmer_pecl_loaded() { $already_checked = TRUE; return $has_pecl_stem; } - -/** - * Regular expression defining a vowel for Porter Stemmer. - */ -define('PORTERSTEMMER_VOWEL', '[aeiouy]'); - -/** - * Regular expression defining not-a-vowel for Porter Stemmer. - */ -define('PORTERSTEMMER_NOT_VOWEL', '[^aeiouy]'); - -/** - * Regular expression defining not a vowel, w, x, or Y for Porter Stemmer. - */ -define('PORTERSTEMMER_NOT_VOWEL_WXY', '[^aeiouywxY]'); - -/** - * Regular expression defining a double consonant for Porter Stemmer. - */ -define('PORTERSTEMMER_DOUBLE', '(bb|dd|ff|gg|mm|nn|pp|rr|tt)'); - -/** - * Regular expression defining an li-ending for Porter Stemmer purposes. - */ -define('PORTERSTEMMER_LI_END', '[cdeghkmnrt]'); - -/** - * Regular expression defining a word boundary for Porter Stemmer. - * - * A word boundary is anything not a letter or an apostrophe. - */ -define('PORTERSTEMMER_BOUNDARY', "[^a-zA-Z']+"); - -/** - * Stems a word, using the Porter Stemmer 2 algorithm. - * - * @param $word - * Word to stem. - * @return - * Stemmed word - */ -function porterstemmer_stem($word) { - // Each of these helper functions returns TRUE if it is time to stop - // stemming and return. If everything is fine, they modify params by - // reference, as necessary, for the next function. 
- - $r1 = 0; // position of R1 region in original word - $r2 = 0; // position of R1 region in original word - porterstemmer_prestemming($word, $r1, $r2) OR - porterstemmer_exception1($word) OR - porterstemmer_step0($word) OR - porterstemmer_step1a($word) OR - porterstemmer_exception2($word) OR - porterstemmer_step1b($word, $r1) OR - porterstemmer_step1c($word) OR - porterstemmer_step2($word, $r1) OR - porterstemmer_step3($word, $r1, $r2) OR - porterstemmer_step4($word, $r2) OR - porterstemmer_step5($word, $r1, $r2); - - porterstemmer_poststemming( $word ); - return $word; -} - -/** - * Returns TRUE if word is too short to continue stemming. - */ -function porterstemmer_too_short($word, $reset = FALSE) { - static $min_chars = 0; - if ( !$min_chars || $reset ) { - // Get Search module's idea of minimum characters - $min_chars = intval( variable_get('minimum_word_size', 3)); - // Porter algorithm cannot handle less than 2 characters - if ( $min_chars < 2 ) { - $min_chars = 2; - } - } - - if ( drupal_strlen($word) < $min_chars ) { - return TRUE; - } - - return FALSE; -} - -/** - * Replaces word and calculates return value for steps. - * - * If $tmp is long enough, replaces $word with $tmp and returns FALSE - * to continue stemming process. If $tmp is too short, no replacement - * and returns TRUE to end stemming process. - */ -function porterstemmer_step_ending(&$word, $tmp) { - if ( porterstemmer_too_short( $tmp)) { - return TRUE; - } - - $word = $tmp; - return FALSE; -} - -/** - * Replaces one word ending with another, if tests pass. - * - * The return value is TRUE of the ending is present at the end - * of the word, and FALSE if the ending is not present. The found - * word ending is also replaced with the given replacement, only if - * the additional regular expression (if present) matches and if the - * word is at least the given length. - * - * @param $word - * Word to performm search/replace on. - * @param $oldend - * Ending to check for. 
- * @param $newend - * Replacement ending. - * @param $didit - * Set to TRUE in the case that a replacement is done; left alone - * otherwise. - * @param $other - * Extra regular expression; must match to allow ending replacement. - * @param $minlen - * Minimum word length required to allow ending replacement. For - * instance, to see if a particular ending is in the R1 region, - * pass in $r1 + length of ending as the minimum word length. - * @return - * TRUE if ending was at the end of the word, FALSE if not. - */ -function porterstemmer_suffix(&$word, $oldend, $newend, &$didit, $other = NULL, $minlen = 1) { - // Check to see if the ending is there - $end_regexp = '/' . $oldend . '$/'; - if (!preg_match( $end_regexp, $word )) { - // ending isn't even there - return FALSE; - } - - // Does word match other regular expression? - if ($other && !preg_match($other, $word)) { - // no match, so just return without replacing - return TRUE; - } - - // Is word long enough? - if (drupal_strlen($word) < $minlen) { - // too short, so just return without replacing - return TRUE; - } - - // Replace word ending - $word = preg_replace($end_regexp, $newend, $word); - $didit = TRUE; - return TRUE; -} - -/** - * Checks to see if a word is considered "short" in Porter Stemmer 2. - * - * A word is "short" if region R1 doesn't exist, and if it ends in a - * short syllable. A short syllable is consonant, followed by vowel, - * followed by consonant not w, x, Y; or else vowel starting a word, - * followed by a non-vowel. - * - * @param $word - * Word to check. - * @param $r1 - * Start position of R1 region in word. - * @return - * TRUE if the word is short, false if not. - */ -function porterstemmer_short_word( $word, $r1 ) { - - if (drupal_strlen($word) > $r1) { - // R1 region exists, so this is not a short word - return FALSE; - } - - // Does it end in one type of short syllable? - if (preg_match('/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL . 
'$/', - $word)) { - return TRUE; - } - - // Does it end in the other type of short syllable? - if (preg_match('/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL . - PORTERSTEMMER_NOT_VOWEL_WXY . '$/', $word)) { - return TRUE; - } - - return FALSE; -} - -/** - * Pre-processes a word for the Porter Stemmer 2 algorithm. - * - * Checks for too-short words, removes initial apostrophes, sets y to - * Y (so as not to be considered a vowel) if y is at start of word or - * after a vowel. Then calculates the position of the R1 and R2 - * regions in the word. - * - * @param $word - * Word to stem, modified in place if successful. - * @param $r1 - * Returns the start position of the "R1" region in the word. - * @param $r2 - * Returns the start position of the "R2" region in the word. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. - */ -function porterstemmer_prestemming(&$word, &$r1, &$r2) { - if (porterstemmer_too_short($word)) { - return TRUE; - } - - $tmp = $word; - - // Remove initial apostrophe - $tmp = preg_replace("/^'/", '', $tmp); - if (porterstemmer_too_short($tmp)) { - return TRUE; - } - - // y -> Y if we should treat it as consonant - $tmp = preg_replace('/^y/', 'Y', $tmp); - $before = 'not going to match'; - while ( $before != $tmp ) { - // Do this replacement one by one, to avoid unlikely yyyy issues - $before = $tmp; - // Note: do not use count param to preg_replace - added in 5.10!! - $tmp = preg_replace('/(' . PORTERSTEMMER_VOWEL . ')y/', '$1Y', - $tmp, 1); - } - - // This y/Y step should not have changed the word length - $word = $tmp; - - // Find R1 and R2. R1 is the region after the first non-vowel - // following a vowel. R2 is the region after the first non-vowel - // following a vowel in R1. - $max = drupal_strlen($word); - $r1 = $max; - $r2 = $max; - $matches = array(); - $rdef = '/^' . PORTERSTEMMER_NOT_VOWEL . '*' . - PORTERSTEMMER_VOWEL . '+(' . PORTERSTEMMER_NOT_VOWEL . 
')/'; - - // Exceptions to R1: If word begins with 'gener', 'commun', or 'arsen', - // R1 is the remainder of the word. - if ( preg_match( '/^(gener|commun|arsen)/', $word, $matches )) { - $r1 = drupal_strlen( $matches[1] ); - } - elseif (preg_match( $rdef, $word, $matches, PREG_OFFSET_CAPTURE)) { - $r1 = $matches[1][1] + 1; - }; - $R1 = drupal_substr($word, $r1); - if ($R1 && preg_match( $rdef, $R1, $matches, PREG_OFFSET_CAPTURE)) { - $r2 = $r1 + $matches[1][1] + 1; - }; - - return FALSE; -} - -/** - * Turn Y back into y to undo pre-processing. - */ -function porterstemmer_poststemming(&$word) { - $word = str_replace('Y', 'y', $word); -} - -/** - * Step 0 of the algorithm: remove possessive endings. - * - * @param $word - * Word to stem, modified in place if successful. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. - */ -function porterstemmer_step0(&$word) { - $tmp = $word; - $didit = FALSE; - porterstemmer_suffix($tmp, "'s'", '', $didit) OR - porterstemmer_suffix($tmp, "'s", '', $didit) OR - porterstemmer_suffix($tmp, "'", '', $didit); - - return porterstemmer_step_ending( $word, $tmp ); -} - - -/** - * Step 1a of algorithm: plurals, etc. - * - * @param $word - * Word to stem, modified in place if successful. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. - */ -function porterstemmer_step1a(&$word) { - $tmp = $word; - $didit = FALSE; - - $done = porterstemmer_suffix($tmp, 'sses', 'ss', $didit); - - // ies/ied endings -- have different replacements depending on - // if there is more than one letter preceeding. So make sure to - // test/replace for both conditions. 
- - if ( !$done && porterstemmer_suffix($tmp, 'ies', 'ie', $didit, '/^.?ies$/')) { - if ( !$didit ) { - porterstemmer_suffix($tmp, 'ies', 'i', $didit); - } - $done = TRUE; - } - - if ( !$done && porterstemmer_suffix($tmp, 'ied', 'ie', $didit, '/^.?ied$/')) { - if ( !$didit ) { - porterstemmer_suffix($tmp, 'ied', 'i', $didit); - } - $done = TRUE; - } - - if ( !$done ) { - porterstemmer_suffix($tmp, 'ss', 'ss', $didit) OR - porterstemmer_suffix($tmp, 'us', 'us', $didit) OR - // only delete s at end of word if there is at least one vowel that - // is not immediately before the s - porterstemmer_suffix($tmp, 's', '', $didit, - '/' . PORTERSTEMMER_VOWEL . '.+s$/'); - } - - return porterstemmer_step_ending( $word, $tmp ); -} - - -/** - * Step 1b of algorithm: eed, eedly, ed, edly, ing, ingly - * - * @param $word - * Word to stem, modified in place if successful. - * @param $r1 - * Position of start of R1 region in word. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. - */ -function porterstemmer_step1b(&$word, $r1) { - $tmp = $word; - $didit = FALSE; - - // Replace these endings if in R1 region - $done = ( porterstemmer_suffix($tmp, 'eedly', 'ee', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'eed', 'ee', $didit, NULL, $r1 + 3)); - - // Delete these endings if there's a vowel before the ending - $didit = FALSE; - if ( !$done ) { - porterstemmer_suffix($tmp, 'edly', '', $didit, - '/' . PORTERSTEMMER_VOWEL . '.*edly$/' ) OR - porterstemmer_suffix($tmp, 'ed', '', $didit, - '/' . PORTERSTEMMER_VOWEL . '.*ed$/' ) OR - porterstemmer_suffix($tmp, 'ingly', '', $didit, - '/' . PORTERSTEMMER_VOWEL . '.*ingly$/' ) OR - porterstemmer_suffix($tmp, 'ing', '', $didit, - '/' . PORTERSTEMMER_VOWEL . '.*ing$/' ); - } - - // If we did one of these replacements, post-process... 
- if ( $didit ) { - $done = porterstemmer_suffix($tmp, 'at', 'ate', $didit) OR - porterstemmer_suffix($tmp, 'bl', 'ble', $didit) OR - porterstemmer_suffix($tmp, 'iz', 'ize', $didit); - if (!$done && - preg_match('/' . PORTERSTEMMER_DOUBLE . '$/', $tmp)) { - // drop last letter if it's a double-letter ending - $tmp = drupal_substr($tmp, 0, -1); - $done = TRUE; - } - if ( !$done && porterstemmer_short_word($tmp, $r1)) { - $tmp = $tmp . 'e'; - } - } - - return porterstemmer_step_ending($word, $tmp); -} - -/** - * Step 1c of algorithm: y suffixes - * - * @param $word - * Word to stem, modified in place if successful. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. - */ -function porterstemmer_step1c(&$word) { - $tmp = $word; - $didit = FALSE; - - // Replace y or Y by i if the letter before is not a vowel, - // and that non-vowel is not the beginning of the word. - - $ytest = '/.' . PORTERSTEMMER_NOT_VOWEL . '[Yy]$/'; - porterstemmer_suffix($tmp, 'Y', 'i', $didit, $ytest ) OR - porterstemmer_suffix($tmp, 'y', 'i', $didit, $ytest ); - - return porterstemmer_step_ending($word, $tmp); -} - -/** - * Step 2 of algorithm: misc endings in region R1. - * - * @param $word - * Word to stem, modified in place if successful. - * @param $r1 - * Position of start of R1 region in word. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. 
- */ -function porterstemmer_step2(&$word, $r1) { - $tmp = $word; - $didit = FALSE; - - // Search for the longest of these suffixes, and if found in R1, replace - porterstemmer_suffix($tmp, 'ational', 'ate', $didit, NULL, $r1 + 7) OR - porterstemmer_suffix($tmp, 'fulness', 'ful', $didit, NULL, $r1 + 7) OR - porterstemmer_suffix($tmp, 'iveness', 'ive', $didit, NULL, $r1 + 7) OR - porterstemmer_suffix($tmp, 'ization', 'ize', $didit, NULL, $r1 + 7) OR - porterstemmer_suffix($tmp, 'ousness', 'ous', $didit, NULL, $r1 + 7) OR - porterstemmer_suffix($tmp, 'biliti', 'ble', $didit, NULL, $r1 + 6) OR - porterstemmer_suffix($tmp, 'lessli', 'less', $didit, NULL, $r1 + 6) OR - porterstemmer_suffix($tmp, 'tional', 'tion', $didit, NULL, $r1 + 6) OR - porterstemmer_suffix($tmp, 'aliti', 'al', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'ation', 'ate', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'alism', 'al', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'entli', 'ent', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'fulli', 'ful', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'iviti', 'ive', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'ousli', 'ous', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'abli', 'able', $didit, NULL, $r1 + 4) OR - porterstemmer_suffix($tmp, 'alli', 'al', $didit, NULL, $r1 + 4) OR - porterstemmer_suffix($tmp, 'ator', 'ate', $didit, NULL, $r1 + 4) OR - porterstemmer_suffix($tmp, 'anci', 'ance', $didit, NULL, $r1 + 4) OR - porterstemmer_suffix($tmp, 'enci', 'ence', $didit, NULL, $r1 + 4) OR - porterstemmer_suffix($tmp, 'izer', 'ize', $didit, NULL, $r1 + 4) OR - porterstemmer_suffix($tmp, 'bli', 'ble', $didit, NULL, $r1 + 3) OR - // ogi is only replaced if preceeded by l - porterstemmer_suffix($tmp, 'ogi', 'og', $didit, - '/logi$/', $r1 + 3) OR - // li is only replaced if preceeded by a valid li-ending - porterstemmer_suffix($tmp, 'li', '', $didit, - '/' . PORTERSTEMMER_LI_END . 
'li$/', $r1 + 2); - - return porterstemmer_step_ending($word, $tmp); -} - -/** - * Step 3 of algorithm: misc endings in region R1. - * - * @param $word - * Word to stem, modified in place if successful. - * @param $r1 - * Position of start of R1 region in word. - * @param $r2 - * Position of start of R2 region in word. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. - */ -function porterstemmer_step3(&$word, $r1, $r2) { - $tmp = $word; - $didit = FALSE; - - porterstemmer_suffix($tmp, 'ational', 'ate', $didit, NULL, $r1 + 7) OR - porterstemmer_suffix($tmp, 'tional', 'tion', $didit, NULL, $r1 + 6) OR - porterstemmer_suffix($tmp, 'alize', 'al', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'ative', '', $didit, NULL, $r2 + 5) OR - porterstemmer_suffix($tmp, 'icate', 'ic', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'iciti', 'ic', $didit, NULL, $r1 + 5) OR - porterstemmer_suffix($tmp, 'ical', 'ic', $didit, NULL, $r1 + 4) OR - porterstemmer_suffix($tmp, 'ness', '', $didit, NULL, $r1 + 4) OR - porterstemmer_suffix($tmp, 'ful', '', $didit, NULL, $r1 + 3); - - return porterstemmer_step_ending($word, $tmp); -} - -/** - * Step 4 of algorithm: misc endings in region R2. - * - * @param $word - * Word to stem, modified in place if successful. - * @param $r2 - * Position of start of R2 region in word. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. 
- */ -function porterstemmer_step4(&$word, $r2) { - $tmp = $word; - $didit = FALSE; - - porterstemmer_suffix($tmp, 'ement', '', $didit, NULL, $r2 + 5) OR - porterstemmer_suffix($tmp, 'able', '', $didit, NULL, $r2 + 4) OR - porterstemmer_suffix($tmp, 'ance', '', $didit, NULL, $r2 + 4) OR - porterstemmer_suffix($tmp, 'ence', '', $didit, NULL, $r2 + 4) OR - porterstemmer_suffix($tmp, 'ible', '', $didit, NULL, $r2 + 4) OR - porterstemmer_suffix($tmp, 'ment', '', $didit, NULL, $r2 + 4) OR - porterstemmer_suffix($tmp, 'ant', '', $didit, NULL, $r2 + 3) OR - porterstemmer_suffix($tmp, 'ate', '', $didit, NULL, $r2 + 3) OR - porterstemmer_suffix($tmp, 'ent', '', $didit, NULL, $r2 + 3) OR - porterstemmer_suffix($tmp, 'ion', '', $didit, '/[st]ion$/', $r2 + 3) OR - porterstemmer_suffix($tmp, 'ism', '', $didit, NULL, $r2 + 3) OR - porterstemmer_suffix($tmp, 'iti', '', $didit, NULL, $r2 + 3) OR - porterstemmer_suffix($tmp, 'ive', '', $didit, NULL, $r2 + 3) OR - porterstemmer_suffix($tmp, 'ize', '', $didit, NULL, $r2 + 3) OR - porterstemmer_suffix($tmp, 'ous', '', $didit, NULL, $r2 + 3) OR - porterstemmer_suffix($tmp, 'al', '', $didit, NULL, $r2 + 2) OR - porterstemmer_suffix($tmp, 'er', '', $didit, NULL, $r2 + 2) OR - porterstemmer_suffix($tmp, 'ic', '', $didit, NULL, $r2 + 2); - - return porterstemmer_step_ending($word, $tmp); -} - -/** - * Step 5 of algorithm: e, l endings in region R1/R2. - * - * @param $word - * Word to stem, modified in place if successful. - * @param $r1 - * Position of start of R1 region in word. - * @param $r2 - * Position of start of R2 region in word. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. 
- */ -function porterstemmer_step5(&$word, $r1, $r2) { - $tmp = $word; - $didit = FALSE; - $done = FALSE; - - // Delete l at end of word if in R2 and preceded by another l - $done = porterstemmer_suffix( $tmp, 'll', 'l', $didit, NULL, $r2 + 1 ); - - // Delete e at end of word if in R2, or in R1 and not preceded by - // a short syllable - $len = drupal_strlen( $tmp ); - if ( !$done && preg_match( '/e$/', $tmp ) && - ( $len > $r2 || - ( $len > $r1 && - !preg_match( '/^' . PORTERSTEMMER_VOWEL . PORTERSTEMMER_NOT_VOWEL . - 'e$/', $tmp ) && - !preg_match( '/' . PORTERSTEMMER_NOT_VOWEL . PORTERSTEMMER_VOWEL . - PORTERSTEMMER_NOT_VOWEL_WXY . 'e$/', $tmp )))) { - $tmp = drupal_substr( $tmp, 0, -1 ); - } - - return porterstemmer_step_ending($word, $tmp); -} - -/** - * Checks exceptions for Porter Stemmer. - * - * @param $word - * Word to stem, modified in place if successful. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. - */ -function porterstemmer_exception1(&$word) { - // Special cases for stemming. Don't add anything in this list that - // is shorter than the minimum allowed length! - $repl = array( - 'skis' => 'ski', - 'skies' => 'sky', - 'dying' => 'die', - 'lying' => 'lie', - 'tying' => 'tie', - 'idly' => 'idl', - 'gently' => 'gentl', - 'ugly' => 'ugli', - 'early' => 'earli', - 'only' => 'onli', - 'singly' => 'singl', - 'sky' => 'sky', - 'news' => 'news', - 'howe' => 'howe', - 'atlas' => 'atlas', - 'cosmos' => 'cosmos', - 'bias' => 'bias', - 'andes' => 'andes', - ); - - // If our word is in that list, we're done. - if ( isset( $repl[ $word ])) { - $word = $repl[ $word ]; - return TRUE; - } - - return FALSE; -} - -/** - * Checks exceptions for Porter Stemmer after Step 1a. - * - * @param $word - * Word to stem, modified in place if successful. - * @return - * TRUE if it is time to stop stemming, FALSE to continue. - */ -function porterstemmer_exception2(&$word) { - // The following words are to be left invariant. 
- $repl = array( - 'inning' => 1, - 'outing' => 1, - 'canning' => 1, - 'herring' => 1, - 'earring' => 1, - 'proceed' => 1, - 'exceed' => 1, - 'succeed' => 1, - ); - - if ( isset( $repl[ $word ])) { - return TRUE; - } - - return FALSE; -} diff --git a/src/Porter2.php b/src/Porter2.php new file mode 100644 index 0000000..f26937c --- /dev/null +++ b/src/Porter2.php @@ -0,0 +1,623 @@ + 'ski', + 'skies' => 'sky', + 'dying' => 'die', + 'lying' => 'lie', + 'tying' => 'tie', + 'idly' => 'idl', + 'gently' => 'gentl', + 'ugly' => 'ugli', + 'early' => 'earli', + 'only' => 'onli', + 'singly' => 'singl', + 'sky' => 'sky', + 'news' => 'news', + 'howe' => 'howe', + 'atlas' => 'atlas', + 'cosmos' => 'cosmos', + 'bias' => 'bias', + 'andes' => 'andes', + ); + + // Process exceptions. + if (isset($exceptions[$word])) { + $word = $exceptions[$word]; + } + elseif (strlen($word) > 2) { + // Only execute algorithm on words that are longer than two letters. + $word = self::prepare($word); + $word = self::step0($word); + $word = self::step1a($word); + $word = self::step1b($word); + $word = self::step1c($word); + $word = self::step2($word); + $word = self::step3($word); + $word = self::step4($word); + $word = self::step5($word); + } + return strtolower($word); + } + + /** + * Set initial y, or y after a vowel, to Y. + * + * @param string $word + * The word to stem. + * + * @return string $word + * The prepared word. + */ + protected static function prepare($word) { + $inc = 0; + if (strpos($word, "'") === 0) { + $word = substr($word, 1); + } + while ($inc <= strlen($word)) { + if (substr($word, $inc, 1) === 'y' && ($inc == 0 || self::isVowel($inc - 1, $word))) { + $word = substr_replace($word, 'Y', $inc, 1); + } + $inc++; + } + return $word; + } + + /** + * Search for the longest among the "s" suffixes and removes it. + * + * @param string $word + * The word to stem. + * + * @return string $word + * The modified word. 
+   */
+  protected static function step0($word) {
+    // Candidates are ordered longest first, so the first hit is the longest
+    // matching suffix and nothing shorter needs to be tried afterwards.
+    foreach (array("'s'", "'s", "'") as $check) {
+      if (self::hasEnding($word, $check)) {
+        return self::removeEnding($word, $check);
+      }
+    }
+    return $word;
+  }
+
+  /**
+   * Handles the "sses", "ied"/"ies" and plain "s" suffixes.
+   *
+   * Exactly one rule is applied: "sses" becomes "ss"; "ied"/"ies" become "i"
+   * (or "ie" when the whole word has at most four letters); words ending in
+   * "us" or "ss" are left alone; any other trailing "s" is dropped if the part
+   * of the word in front of the letter preceding the "s" contains a vowel.
+   *
+   * @param string $word
+   *   The word to stem.
+   *
+   * @return string $word
+   *   The modified word.
+   */
+  protected static function step1a($word) {
+    if (self::hasEnding($word, 'sses')) {
+      return self::removeEnding($word, 'sses') . 'ss';
+    }
+    foreach (array('ied', 'ies') as $check) {
+      if (self::hasEnding($word, $check)) {
+        // @todo: check order here.
+        // The length is taken before the suffix is removed.
+        $length = strlen($word);
+        $word = self::removeEnding($word, $check);
+        return $word . ($length > 4 ? 'i' : 'ie');
+      }
+    }
+    if (self::hasEnding($word, 'us') || self::hasEnding($word, 'ss')) {
+      return $word;
+    }
+    // Delete if preceding word part has a vowel not immediately before the s.
+    if (self::hasEnding($word, 's') && self::containsVowel(substr($word, 0, -2))) {
+      $word = self::removeEnding($word, 's');
+    }
+    return $word;
+  }
+
+  /**
+   * Handles the "eed"/"eedly" and "ed"/"edly"/"ing"/"ingly" suffixes.
+   *
+   * A fixed list of words is returned untouched. "eed"/"eedly" is replaced by
+   * "ee" only when the suffix lies in R1. The other suffixes are deleted when
+   * the remaining part contains a vowel; afterwards an "e" is appended to
+   * stems ending in "at", "bl" or "iz", a trailing double consonant is
+   * reduced to a single one, or an "e" is appended to a short word.
+   *
+   * @param string $word
+   *   The word to stem.
+   *
+   * @return string $word
+   *   The modified word.
+   */
+  protected static function step1b($word) {
+    $exceptions = array(
+      'inning',
+      'outing',
+      'canning',
+      'herring',
+      'earring',
+      'proceed',
+      'exceed',
+      'succeed',
+    );
+    if (in_array($word, $exceptions, TRUE)) {
+      return $word;
+    }
+    foreach (array('eedly', 'eed') as $check) {
+      if (self::hasEnding($word, $check)) {
+        // The suffix itself has to lie in R1. Testing only whether R1 is
+        // non-empty would also rewrite words whose R1 covers just the tail of
+        // the suffix.
+        if (self::inR1($word, $check)) {
+          $word = self::removeEnding($word, $check) . 'ee';
+        }
+        return $word;
+      }
+    }
+    $checks = array('ingly', 'edly', 'ing', 'ed');
+    $second_endings = array('at', 'bl', 'iz');
+    foreach ($checks as $check) {
+      // If the ending is present and the previous part contains a vowel.
+      if (self::hasEnding($word, $check) && self::containsVowel(substr($word, 0, -strlen($check)))) {
+        $word = self::removeEnding($word, $check);
+        foreach ($second_endings as $ending) {
+          if (self::hasEnding($word, $ending)) {
+            return $word . 'e';
+          }
+        }
+        // If the word ends with a double, remove the last letter.
+        $double_removed = self::removeDoubles($word);
+        if ($double_removed !== $word) {
+          $word = $double_removed;
+        }
+        elseif (self::isShort($word)) {
+          // If the word is short, add e (so hop -> hope).
+          $word .= 'e';
+        }
+        return $word;
+      }
+    }
+    return $word;
+  }
+
+  /**
+   * Replaces suffix y or Y with i if after non-vowel not @ word begin.
+   *
+   * @param string $word
+   *   The word to stem.
+   *
+   * @return string $word
+   *   The modified word.
+   */
+  protected static function step1c($word) {
+    $ends_in_y = self::hasEnding($word, 'y') || self::hasEnding($word, 'Y');
+    if ($ends_in_y && strlen($word) > 2 && !self::isVowel(strlen($word) - 2, $word)) {
+      // removeEnding() only looks at the length of the ending, so passing 'y'
+      // strips a trailing 'Y' as well.
+      $word = self::removeEnding($word, 'y');
+      $word .= 'i';
+    }
+    return $word;
+  }
+
+  /**
+   * Implements step 2 of the Porter2 algorithm.
+   *
+   * @param string $word
+   *   The word to stem.
+   *
+   * @return string $word
+   *   The modified word.
+   */
+  protected static function step2($word) {
+    // Ordered longest first: the first suffix that matches is the longest one,
+    // and only that one is considered.
+    $checks = array(
+      "ization" => "ize",
+      "iveness" => "ive",
+      "fulness" => "ful",
+      "ational" => "ate",
+      "ousness" => "ous",
+      "biliti" => "ble",
+      "tional" => "tion",
+      "lessli" => "less",
+      "fulli" => "ful",
+      "entli" => "ent",
+      "ation" => "ate",
+      "aliti" => "al",
+      "iviti" => "ive",
+      "ousli" => "ous",
+      "alism" => "al",
+      "abli" => "able",
+      "anci" => "ance",
+      "alli" => "al",
+      "izer" => "ize",
+      "enci" => "ence",
+      "ator" => "ate",
+      "bli" => "ble",
+      "ogi" => "og",
+    );
+    foreach ($checks as $find => $replace) {
+      if (self::hasEnding($word, $find)) {
+        // "ogi" is only replaced when it is preceded by "l".
+        $applicable = $find !== 'ogi' || self::charAt(-4, $word) === 'l';
+        if ($applicable && self::inR1($word, $find)) {
+          $word = self::removeEnding($word, $find) . $replace;
+        }
+        return $word;
+      }
+    }
+    // "li" is deleted when it lies in R1 and follows a valid li-ending.
+    if (self::hasEnding($word, 'li') && self::inR1($word, 'li') && self::validLi(self::charAt(-3, $word))) {
+      $word = self::removeEnding($word, 'li');
+    }
+    return $word;
+  }
+
+  /**
+   * Implements step 3 of the Porter2 algorithm.
+   *
+   * The first matching suffix of the list is replaced (or deleted) if it lies
+   * in R1. If none of them matches, a trailing "ative" is deleted if it lies
+   * in R2.
+   *
+   * @param string $word
+   *   The word to stem.
+   *
+   * @return string $word
+   *   The modified word.
+   */
+  protected static function step3($word) {
+    // Longer suffixes come first so that e.g. "ational" wins over "tional".
+    $checks = array(
+      'ational' => 'ate',
+      'tional' => 'tion',
+      'alize' => 'al',
+      'icate' => 'ic',
+      'iciti' => 'ic',
+      'ical' => 'ic',
+      'ness' => '',
+      'ful' => '',
+    );
+    foreach ($checks as $find => $replace) {
+      if (self::hasEnding($word, $find)) {
+        if (self::inR1($word, $find)) {
+          $word = self::removeEnding($word, $find) . $replace;
+        }
+        return $word;
+      }
+    }
+    if (self::hasEnding($word, 'ative') && self::inR2($word, 'ative')) {
+      $word = self::removeEnding($word, 'ative');
+    }
+    return $word;
+  }
+
+  /**
+   * Implements step 4 of the Porter2 algorithm.
+   *
+   * @param string $word
+   *   The word to stem.
+   *
+   * @return string $word
+   *   The modified word.
+   */
+  protected static function step4($word) {
+    $suffixes = array(
+      'ement',
+      'ment',
+      'ance',
+      'ence',
+      'able',
+      'ible',
+      'ant',
+      'ent',
+      'ion',
+      'ism',
+      'ate',
+      'iti',
+      'ous',
+      'ive',
+      'ize',
+      'al',
+      'er',
+      'ic',
+    );
+    foreach ($suffixes as $suffix) {
+      if (!self::hasEnding($word, $suffix)) {
+        continue;
+      }
+      // Only the first matching suffix is looked at; it is deleted if in R2.
+      $deletable = self::inR2($word, $suffix);
+      if ($deletable && $suffix === 'ion') {
+        // "ion" additionally has to follow an "s" or a "t".
+        $deletable = in_array(self::charAt(-4, $word), array('s', 't'));
+      }
+      return $deletable ? self::removeEnding($word, $suffix) : $word;
+    }
+    return $word;
+  }
+
+  /**
+   * Implements step 5 of the Porter2 algorithm.
+   *
+   * Drops a final "e" or the second "l" of a final "ll" under the region
+   * conditions spelled out in the method body.
+   *
+   * @param string $word
+   *   The word to stem.
+   *
+   * @return string $word
+   *   The modified word.
+   */
+  protected static function step5($word) {
+    if (self::hasEnding($word, 'e')) {
+      // Delete if in R2 ...
+      if (self::inR2($word, 'e')) {
+        return self::removeEnding($word, 'e');
+      }
+      // ... or in R1 and not preceded by a short syllable.
+      if (self::inR1($word, 'e') && !self::isShortSyllable($word, strlen($word) - 3)) {
+        return self::removeEnding($word, 'e');
+      }
+      return $word;
+    }
+    // Delete a final "l" if in R2 and preceded by another "l".
+    if (self::hasEnding($word, 'l') && self::inR2($word, 'l') && self::charAt(-2, $word) == 'l') {
+      return self::removeEnding($word, 'l');
+    }
+    return $word;
+  }
+
+  /**
+   * Shortens a trailing double consonant to a single one.
+   *
+   * Only the pairs bb, dd, ff, gg, mm, nn, pp, rr and tt are affected.
+   *
+   * @param string $word
+   *   The word to stem.
+   *
+   * @return string $word
+   *   The modified word.
+   */
+  protected static function removeDoubles($word) {
+    $tail = substr($word, -2);
+    foreach (array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt') as $pair) {
+      if ($tail == $pair) {
+        return substr($word, 0, -1);
+      }
+    }
+    return $word;
+  }
+
+  /**
+   * Checks whether a character is a vowel.
+   *
+   * @param int $position
+   *   The character's position.
+   * @param string $word
+   *   The word in which to check.
+   * @param string[] $additional
+   *   (optional) Additional characters that should count as vowels.
+   *
+   * @return bool
+   *   TRUE if the character is a vowel, FALSE otherwise.
+   */
+  protected static function isVowel($position, $word, $additional = array()) {
+    $vowels = array_merge(array('a', 'e', 'i', 'o', 'u', 'y'), $additional);
+    // charAt() returns '' for an illegal position, which is never a vowel.
+    return in_array(self::charAt($position, $word), $vowels, TRUE);
+  }
+
+  /**
+   * Retrieves the character at the given position.
+   *
+   * @param int $position
+   *   The 0-based index of the character. If a negative number is given, the
+   *   position is counted from the end of the string, so -1 is the last
+   *   character and -strlen($word) is the first one.
+   * @param string $word
+   *   The word from which to retrieve the character.
+   *
+   * @return string
+   *   The character at the given position, or an empty string if the given
+   *   position was illegal.
+   */
+  protected static function charAt($position, $word) {
+    $length = strlen($word);
+    if ($position < 0) {
+      $position += $length;
+    }
+    // Range-check after normalising, so that -$length (the first character)
+    // is accepted while anything further out is still rejected.
+    if ($position < 0 || $position >= $length) {
+      return '';
+    }
+    return $word[$position];
+  }
+
+  /**
+   * Determines whether the word ends in a "vowel-consonant" suffix.
+   *
+   * Unless the word is only two characters long, it also checks that the
+   * third-last character is neither "w", "x" nor "Y".
+   *
+   * @param string $word
+   *   The word to check.
+   * @param int|null $position
+   *   (optional) If given, do not check the end of the word, but the character
+   *   at the given position, and the next one.
+   *
+   * @return bool
+   *   TRUE if the word has the described suffix, FALSE otherwise.
+   */
+  protected static function isShortSyllable($word, $position = NULL) {
+    if ($position === NULL) {
+      // Default to the second-last character, i.e. test the end of the word.
+      $position = strlen($word) - 2;
+    }
+    // A vowel at the beginning of the word followed by a non-vowel.
+    if ($position === 0) {
+      return self::isVowel(0, $word) && !self::isVowel(1, $word);
+    }
+    // Vowel followed by non-vowel other than w, x, Y and preceded by
+    // non-vowel.
+    $additional = array('w', 'x', 'Y');
+    return !self::isVowel($position - 1, $word) && self::isVowel($position, $word) && !self::isVowel($position + 1, $word, $additional);
+  }
+
+  /**
+   * Determines whether the word is short.
+   *
+   * A word is called short if it ends in a short syllable and if R1 is null.
+   *
+   * @param string $word
+   *   The word to check.
+   *
+   * @return bool
+   *   TRUE if the word is short, FALSE otherwise.
+   */
+  protected static function isShort($word) {
+    // r() returns strlen($word) when the region is empty.
+    return self::isShortSyllable($word) && self::r($word, 1) === strlen($word);
+  }
+
+  /**
+   * Determines the start of a certain "R" region.
+   *
+   * R is a region after the first non-vowel following a vowel, or end of word.
+   *
+   * @param string $word
+   *   The word for which to determine the region.
+   * @param int $type
+   *   (optional) 1 or 2. If 2, then calculate the R after the R1.
+   *
+   * @return int
+   *   The R position, i.e. the offset at which the region starts. This is
+   *   strlen($word) if the region is empty.
+   */
+  protected static function r($word, $type = 1) {
+    $length = strlen($word);
+    $inc = 1;
+    if ($type === 2) {
+      // R2 is found by repeating the search from the start of R1.
+      $inc = self::r($word, 1);
+    }
+    elseif ($length > 5) {
+      // Words with these prefixes get a fixed R1 right after the prefix.
+      $prefix_5 = substr($word, 0, 5);
+      if ($prefix_5 === 'gener' || $prefix_5 === 'arsen') {
+        return 5;
+      }
+      if (substr($word, 0, 6) === 'commun') {
+        return 6;
+      }
+    }
+
+    // Stop before the end of the word: looking at the position past the last
+    // character would treat "no character" as a non-vowel and yield an offset
+    // beyond the end of the word for words ending in a vowel.
+    while ($inc < $length) {
+      if (!self::isVowel($inc, $word) && self::isVowel($inc - 1, $word)) {
+        // We add one, as this is the position AFTER the first non-vowel.
+        return $inc + 1;
+      }
+      $inc++;
+    }
+    return $length;
+  }
+
+  /**
+   * Checks whether the given string is contained in R1.
+   *
+   * This is a plain substring search inside R1, so it does not check that
+   * the string sits at the end of the word.
+   *
+   * @param string $word
+   *   The word whose R1 region is searched.
+   * @param string $string
+   *   The string.
+   *
+   * @return bool
+   *   TRUE if the string is in R1, FALSE otherwise.
+   */
+  protected static function inR1($word, $string) {
+    $r1 = substr($word, self::r($word, 1));
+    return strpos($r1, $string) !== FALSE;
+  }
+
+  /**
+   * Checks whether the given string is contained in R2.
+   *
+   * This is a plain substring search inside R2, so it does not check that
+   * the string sits at the end of the word.
+   *
+   * @param string $word
+   *   The word whose R2 region is searched.
+   * @param string $string
+   *   The string.
+   *
+   * @return bool
+   *   TRUE if the string is in R2, FALSE otherwise.
+   */
+  protected static function inR2($word, $string) {
+    $r2 = substr($word, self::r($word, 2));
+    return strpos($r2, $string) !== FALSE;
+  }
+
+  /**
+   * Checks whether the word ends with the given string.
+   *
+   * @param string $string
+   *   The string.
+   *
+   * @return bool
+   *   TRUE if the word ends with the given string, FALSE otherwise.
+   */
+  protected static function hasEnding($word, $string) {
+    $suffix_length = strlen($string);
+    // A suffix that is longer than the word itself can never match.
+    return $suffix_length <= strlen($word)
+      && substr_compare($word, $string, -$suffix_length, $suffix_length) === 0;
+  }
+
+  /**
+   * Cuts as many characters off the end of the word as the ending is long.
+   *
+   * Does not check whether the ending is actually there.
+   *
+   * @param string $string
+   *   The ending to remove.
+   */
+  protected static function removeEnding($word, $string) {
+    $cut = strlen($string);
+    return substr($word, 0, -$cut);
+  }
+
+  /**
+   * Checks whether the given string contains a vowel.
+   *
+   * @param string $string
+   *   The string to check.
+   *
+   * @return bool
+   *   TRUE if the string contains a vowel, FALSE otherwise.
+   */
+  protected static function containsVowel($string) {
+    for ($index = 0; $index < strlen($string); $index++) {
+      if (self::isVowel($index, $string)) {
+        return TRUE;
+      }
+    }
+    return FALSE;
+  }
+
+  /**
+   * Checks whether the given string is a valid -li prefix.
+   *
+   * @param string $string
+   *   The string to check.
+   *
+   * @return bool
+   *   TRUE if the given string is a valid -li prefix, FALSE otherwise.
+   */
+  protected static function validLi($string) {
+    $valid_prefixes = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');
+    return in_array($string, $valid_prefixes);
+  }
+
+}
diff --git a/src/Tests/LangCodeTest.php b/src/Tests/LangCodeTest.php new file mode 100644 index 0000000..9926d59 --- /dev/null +++ b/src/Tests/LangCodeTest.php @@ -0,0 +1,160 @@ + 'I walk through the streets, looking around for trouble.', + 'Second Page' => 'I walked home from work today.', + 'Third Page' => 'I am always walking everywhere.', + ); + + /** + * An array of search terms. + * + * @var string[] + */ + protected $searches = array( + 'walk', + 'walked', + 'walking', + ); + + /** + * An array of nodes created for testing purposes.
+   *
+   * @var \Drupal\node\NodeInterface[]
+   */
+  protected $nodes;
+
+  /**
+   * {@inheritdoc}
+   */
+  protected function setUp() {
+    parent::setUp();
+
+    $this->testUser = $this->drupalCreateUser(array(
+      'search content',
+      'access content',
+      'administer nodes',
+      'access site reports',
+      'use advanced search',
+      'administer languages',
+      'access administration pages',
+      'administer site configuration',
+    ));
+    $this->drupalLogin($this->testUser);
+
+    // Add a new language.
+    ConfigurableLanguage::createFromLangcode('fr')->save();
+
+    // Make the body field translatable. The title is already translatable by
+    // definition.
+    $field_storage = FieldStorageConfig::loadByName('node', 'body');
+    $field_storage->setTranslatable(TRUE);
+    $field_storage->save();
+
+    // Create every test node once per language: English first, then a non-EN
+    // language, then language-unspecified. The array key is the suffix that
+    // gets appended to the node title.
+    $langcodes = array(
+      'EN' => 'en',
+      'FR' => 'fr',
+      'UND' => LanguageInterface::LANGCODE_NOT_SPECIFIED,
+    );
+    foreach ($langcodes as $suffix => $langcode) {
+      foreach ($this->test_data as $title => $body) {
+        $info = array(
+          'title' => $title . ' (' . $suffix . ')',
+          'body' => array(array('value' => $body)),
+          'type' => 'page',
+          'langcode' => $langcode,
+        );
+        // Key by the full title: keying by the bare title would let each
+        // language overwrite the nodes stored for the previous one.
+        $this->nodes[$info['title']] = $this->drupalCreateNode($info);
+      }
+    }
+
+    // Run cron to ensure the content is indexed.
+    $this->cronRun();
+    $this->drupalGet('admin/reports/dblog');
+    $this->assertText(t('Cron run completed'), 'Log shows cron run completed');
+  }
+
+  /**
+   * Test that search variations return English language results.
+   */
+  public function testStemSearching() {
+    // Test methods are public by convention; PHPUnit-based runners do not
+    // discover non-public ones.
+    foreach ($this->searches as $search) {
+      $this->drupalPostForm('search/node', array('keys' => $search), t('Search'));
+
+      // Verify that all English-language test node variants show up in results.
+      foreach ($this->test_data as $title => $body) {
+        $this->assertText($title . ' (EN)', format_string('Search for %search returns English-language node with body %body', array('%search' => $search, '%body' => $body)));
+      }
+
+      // Check for results by language. Only an exact match of the search term
+      // is expected to turn up in the non-English and language-unspecified
+      // nodes.
+      switch ($search) {
+        case 'walk':
+          $this->assertNoText('Second Page (FR)', format_string('Search for %search does not show stemmed non-English results.', array('%search' => $search)));
+          $this->assertNoText('Second Page (UND)', format_string('Search for %search does not show stemmed language-unspecified results.', array('%search' => $search)));
+          break;
+
+        case 'walked':
+          $this->assertNoText('Second Page (FR)', format_string('Search for %search does not show stemmed non-English results.', array('%search' => $search)));
+          $this->assertNoText('Second Page (UND)', format_string('Search for %search does not show stemmed language-unspecified results.', array('%search' => $search)));
+          break;
+
+        case 'walking':
+          $this->assertText('First Page (FR)', format_string('Search for %search does show matching non-English results.', array('%search' => $search)));
+          $this->assertText('First Page (UND)', format_string('Search for %search does show matching language-unspecified results.', array('%search' => $search)));
+          break;
+
+      }
+    }
+  }
+
+}
diff --git a/tests/src/Unit/Porter2Pecl1.php b/tests/src/Unit/Porter2Pecl1.php new file mode 100644 index 0000000..b5a9a45 --- /dev/null +++ b/tests/src/Unit/Porter2Pecl1.php @@ -0,0 +1,42 @@ +has_pecl_stem) { + $this->assertEquals($stem, stem_english($word)); + } + else { + $this->assertTrue(FALSE, 'No PECL stem library found, Aborting test.'); + } + } + + /** + * Data provider for testStem().
+ * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(0); + } + +} diff --git a/tests/src/Unit/Porter2Pecl2.php b/tests/src/Unit/Porter2Pecl2.php new file mode 100644 index 0000000..2cae41c --- /dev/null +++ b/tests/src/Unit/Porter2Pecl2.php @@ -0,0 +1,42 @@ +has_pecl_stem) { + $this->assertEquals($stem, stem_english($word)); + } + else { + $this->assertTrue(FALSE, 'No PECL stem library found, Aborting test.'); + } + } + + /** + * Data provider for testStem(). + * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(5000); + } + +} diff --git a/tests/src/Unit/Porter2Pecl3.php b/tests/src/Unit/Porter2Pecl3.php new file mode 100644 index 0000000..17c95d7 --- /dev/null +++ b/tests/src/Unit/Porter2Pecl3.php @@ -0,0 +1,42 @@ +has_pecl_stem) { + $this->assertEquals($stem, stem_english($word)); + } + else { + $this->assertTrue(FALSE, 'No PECL stem library found, Aborting test.'); + } + } + + /** + * Data provider for testStem(). + * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(10000); + } + +} diff --git a/tests/src/Unit/Porter2Pecl4.php b/tests/src/Unit/Porter2Pecl4.php new file mode 100644 index 0000000..bab001f --- /dev/null +++ b/tests/src/Unit/Porter2Pecl4.php @@ -0,0 +1,42 @@ +has_pecl_stem) { + $this->assertEquals($stem, stem_english($word)); + } + else { + $this->assertTrue(FALSE, 'No PECL stem library found, Aborting test.'); + } + } + + /** + * Data provider for testStem(). 
+ * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(15000); + } + +} diff --git a/tests/src/Unit/Porter2Pecl5.php b/tests/src/Unit/Porter2Pecl5.php new file mode 100644 index 0000000..719ade1 --- /dev/null +++ b/tests/src/Unit/Porter2Pecl5.php @@ -0,0 +1,42 @@ +has_pecl_stem) { + $this->assertEquals($stem, stem_english($word)); + } + else { + $this->assertTrue(FALSE, 'No PECL stem library found, Aborting test.'); + } + } + + /** + * Data provider for testStem(). + * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(20000); + } + +} diff --git a/tests/src/Unit/Porter2Pecl6.php b/tests/src/Unit/Porter2Pecl6.php new file mode 100644 index 0000000..509bae0 --- /dev/null +++ b/tests/src/Unit/Porter2Pecl6.php @@ -0,0 +1,42 @@ +has_pecl_stem) { + $this->assertEquals($stem, stem_english($word)); + } + else { + $this->assertTrue(FALSE, 'No PECL stem library found, Aborting test.'); + } + } + + /** + * Data provider for testStem(). + * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(25000); + } + +} diff --git a/tests/src/Unit/Porter2Test1.php b/tests/src/Unit/Porter2Test1.php new file mode 100644 index 0000000..a48dbca --- /dev/null +++ b/tests/src/Unit/Porter2Test1.php @@ -0,0 +1,43 @@ +assertEquals($stem, Porter2::stem($word)); + } + + /** + * Data provider for testStem(). 
+ * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(0, 5000); + } + +} diff --git a/tests/src/Unit/Porter2Test2.php b/tests/src/Unit/Porter2Test2.php new file mode 100644 index 0000000..6849446 --- /dev/null +++ b/tests/src/Unit/Porter2Test2.php @@ -0,0 +1,43 @@ +assertEquals($stem, Porter2::stem($word)); + } + + /** + * Data provider for testStem(). + * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(5000, 5000); + } + +} diff --git a/tests/src/Unit/Porter2Test3.php b/tests/src/Unit/Porter2Test3.php new file mode 100644 index 0000000..2cfcd29 --- /dev/null +++ b/tests/src/Unit/Porter2Test3.php @@ -0,0 +1,43 @@ +assertEquals($stem, Porter2::stem($word)); + } + + /** + * Data provider for testStem(). + * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(10000, 5000); + } + +} diff --git a/tests/src/Unit/Porter2Test4.php b/tests/src/Unit/Porter2Test4.php new file mode 100644 index 0000000..998aa34 --- /dev/null +++ b/tests/src/Unit/Porter2Test4.php @@ -0,0 +1,43 @@ +assertEquals($stem, Porter2::stem($word)); + } + + /** + * Data provider for testStem(). + * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(15000, 5000); + } + +} diff --git a/tests/src/Unit/Porter2Test5.php b/tests/src/Unit/Porter2Test5.php new file mode 100644 index 0000000..99eb3ac --- /dev/null +++ b/tests/src/Unit/Porter2Test5.php @@ -0,0 +1,43 @@ +assertEquals($stem, Porter2::stem($word)); + } + + /** + * Data provider for testStem(). 
+ * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(20000, 5000); + } + +} diff --git a/tests/src/Unit/Porter2Test6.php b/tests/src/Unit/Porter2Test6.php new file mode 100644 index 0000000..4e5dff1 --- /dev/null +++ b/tests/src/Unit/Porter2Test6.php @@ -0,0 +1,43 @@ +assertEquals($stem, Porter2::stem($word)); + } + + /** + * Data provider for testStem(). + * + * @return array + * Nested arrays of values to check: + * - $word + * - $stem + */ + public function stemDataProvider() { + return $this->retrieveStemWords(25000, 5000); + } + +} diff --git a/tests/src/Unit/PorterPeclBase.php b/tests/src/Unit/PorterPeclBase.php new file mode 100644 index 0000000..3e8fb05 --- /dev/null +++ b/tests/src/Unit/PorterPeclBase.php @@ -0,0 +1,31 @@ +has_pecl_stem = extension_loaded('stem') && function_exists('stem_english'); + } + +} diff --git a/tests/src/Unit/TestItemsTrait.php b/tests/src/Unit/TestItemsTrait.php new file mode 100644 index 0000000..92a453f --- /dev/null +++ b/tests/src/Unit/TestItemsTrait.php @@ -0,0 +1,53 @@ +