252 lines
6.0 KiB
JavaScript
252 lines
6.0 KiB
JavaScript
/**
|
||
* @license Apache-2.0
|
||
*
|
||
* Copyright (c) 2019 The Stdlib Authors.
|
||
*
|
||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
* you may not use this file except in compliance with the License.
|
||
* You may obtain a copy of the License at
|
||
*
|
||
* http://www.apache.org/licenses/LICENSE-2.0
|
||
*
|
||
* Unless required by applicable law or agreed to in writing, software
|
||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
* See the License for the specific language governing permissions and
|
||
* limitations under the License.
|
||
*
|
||
*
|
||
* ## Notice
|
||
*
|
||
* This code is a modification of an existing JavaScript implementation of the [Porter stemming algorithm]{@link https://tartarus.org/martin/PorterStemmer/}.
|
||
*
|
||
* ```text
|
||
* Release 1 by 'andargor', Jul 2004
|
||
* Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
|
||
* ```
|
||
*/
|
||
|
||
'use strict';
|
||
|
||
// MODULES //
|
||
|
||
var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
|
||
var endsWith = require( '@stdlib/string/ends-with' );
|
||
var lowercase = require( '@stdlib/string/lowercase' );
|
||
var replace = require( '@stdlib/string/replace' );
|
||
|
||
|
||
// VARIABLES //
|
||
|
||
// Step 2 lookup table: each key is a suffix recognized by the Step 2 pattern and each value is the text appended to the stem in its place.
var step2list = {
	ational: 'ate',
	tional: 'tion',
	enci: 'ence',
	anci: 'ance',
	izer: 'ize',
	bli: 'ble',
	alli: 'al',
	entli: 'ent',
	eli: 'e',
	ousli: 'ous',
	ization: 'ize',
	ation: 'ate',
	ator: 'ate',
	alism: 'al',
	iveness: 'ive',
	fulness: 'ful',
	ousness: 'ous',
	aliti: 'al',
	iviti: 'ive',
	biliti: 'ble',
	logi: 'log'
};

// Step 3 lookup table: same layout as above; an empty string means the suffix is simply dropped.
var step3list = {
	icate: 'ic',
	ative: '',
	alize: 'al',
	iciti: 'ic',
	ical: 'ic',
	ful: '',
	ness: ''
};
|
||
// Character-class fragments from which the measure patterns are assembled:
var c = '[^aeiou]'; // consonant
var v = '[aeiouy]'; // vowel
var C = c + '[^aeiouy]*'; // consonant sequence
var V = v + '[aeiou]*'; // vowel sequence

// Pattern sources for the stem conditions (the longer ones extend `mgr0`):
var mgr0 = '^(' + C + ')?' + V + C; // [C]VC... is m>0
var meq1 = mgr0 + '(' + V + ')?$'; // [C]VC[V] is m=1
var mgr1 = mgr0 + V + C; // [C]VCVC... is m>1
var sV = '^(' + C + ')?' + v; // vowel in stem

// Compiled stem conditions:
var RE_MGR0 = new RegExp( mgr0 );
var RE_MEQ1 = new RegExp( meq1 );
var RE_MGR1 = new RegExp( mgr1 );
var RE_SV = new RegExp( sV );
var RE_CV = new RegExp( '^' + C + v + '[^aeiouwxy]$' );

// Suffix matchers, one group per step (the lazy `(.+?)` capture is the candidate stem):
var RE_STEP1A = /^(.+?)(ss|i)es$/;
var RE2_STEP1A = /^(.+?)([^s])s$/;

var RE_STEP1B = /^(.+?)eed$/;
var RE2_STEP1B = /^(.+?)(ed|ing)$/;

var RE_STEP1C = /^(.+?)y$/;

var RE_STEP2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;

var RE_STEP3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;

var RE_STEP4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
var RE2_STEP4 = /^(.+?)(s|t)(ion)$/;

var RE_STEP5 = /^(.+?)e$/;

// Small helpers: final character, `at`/`bl`/`iz` endings, and a doubled final character other than a vowel, `y`, `l`, `s`, or `z`:
var RE_LAST = /.$/;
var RE_ATBLIZ = /(at|bl|iz)$/;
var RE_DOUBLE = /([^aeiouylsz])\1$/;
|
||
|
||
|
||
// MAIN //
|
||
|
||
/**
* Reduces a word to its stem by applying the steps of the Porter stemming algorithm.
*
* ## Notes
*
* -   Inputs shorter than three characters are returned unchanged.
* -   Longer inputs are converted to lowercase before any suffix is examined.
*
* ## References
*
* -   Porter, Michael F. 1980. "An algorithm for suffix stripping." _Program_ 13 (3): 130–37. doi:[10.1108/eb046814][@porter:1980].
*
* [@porter:1980]: https://doi.org/10.1108/eb046814
*
* @param {string} word - input word
* @throws {TypeError} first argument must be a string primitive
* @returns {string} word stem
*
* @example
* var out = porterStemmer( 'walking' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'walked' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'walks' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'worldwide' );
* // returns 'worldwid'
*
* @example
* var out = porterStemmer( '' );
* // returns ''
*/
function porterStemmer( word ) {
	var startsWithY;
	var ending;
	var match;
	var base;

	if ( !isString( word ) ) {
		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + word + '`.' );
	}
	// Very short inputs are handed back as-is:
	if ( word.length < 3 ) {
		return word;
	}
	word = lowercase( word );

	// Temporarily upper-case a leading `y` so that the lower-case vowel class does not match it:
	startsWithY = ( word.charAt( 0 ) === 'y' );
	if ( startsWithY ) {
		word = 'Y' + word.slice( 1 );
	}

	// Step 1a (both patterns keep the stem plus the second capture group):
	match = RE_STEP1A.exec( word ) || RE2_STEP1A.exec( word );
	if ( match ) {
		word = match[ 1 ] + match[ 2 ];
	}

	// Step 1b (an `eed` ending, even when left unchanged, prevents the `ed`/`ing` rule from being tried):
	match = RE_STEP1B.exec( word );
	if ( match ) {
		if ( RE_MGR0.test( match[ 1 ] ) ) {
			word = replace( word, RE_LAST, '' );
		}
	} else {
		match = RE2_STEP1B.exec( word );
		if ( match && RE_SV.test( match[ 1 ] ) ) {
			word = match[ 1 ];
			if ( RE_ATBLIZ.test( word ) ) {
				word += 'e';
			} else if ( RE_DOUBLE.test( word ) ) {
				word = replace( word, RE_LAST, '' );
			} else if ( RE_CV.test( word ) ) {
				word += 'e';
			}
		}
	}

	// Step 1c:
	match = RE_STEP1C.exec( word );
	if ( match && RE_SV.test( match[ 1 ] ) ) {
		word = match[ 1 ] + 'i';
	}

	// Step 2:
	match = RE_STEP2.exec( word );
	if ( match ) {
		base = match[ 1 ];
		ending = match[ 2 ];
		if ( RE_MGR0.test( base ) ) {
			word = base + step2list[ ending ];
		}
	}

	// Step 3:
	match = RE_STEP3.exec( word );
	if ( match ) {
		base = match[ 1 ];
		ending = match[ 2 ];
		if ( RE_MGR0.test( base ) ) {
			word = base + step3list[ ending ];
		}
	}

	// Step 4 (the `sion`/`tion` pattern is consulted only when the general suffix pattern does not match; it keeps the `s`/`t`):
	match = RE_STEP4.exec( word );
	if ( match ) {
		base = match[ 1 ];
	} else {
		match = RE2_STEP4.exec( word );
		if ( match ) {
			base = match[ 1 ] + match[ 2 ];
		}
	}
	if ( match && RE_MGR1.test( base ) ) {
		word = base;
	}

	// Step 5:
	match = RE_STEP5.exec( word );
	if ( match ) {
		base = match[ 1 ];
		if (
			RE_MGR1.test( base ) ||
			( RE_MEQ1.test( base ) && !RE_CV.test( base ) )
		) {
			word = base;
		}
	}
	if ( endsWith( word, 'll' ) && RE_MGR1.test( word ) ) {
		word = replace( word, RE_LAST, '' );
	}

	// Restore the leading `y` to lower case:
	if ( startsWithY ) {
		word = 'y' + word.slice( 1 );
	}
	return word;
}
|
||
|
||
|
||
// EXPORTS //
|
||
|
||
// Expose the stemmer function as this module's export:
module.exports = porterStemmer;
|