252 lines
6.0 KiB
JavaScript
252 lines
6.0 KiB
JavaScript
|
/**
|
|||
|
* @license Apache-2.0
|
|||
|
*
|
|||
|
* Copyright (c) 2019 The Stdlib Authors.
|
|||
|
*
|
|||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|||
|
* you may not use this file except in compliance with the License.
|
|||
|
* You may obtain a copy of the License at
|
|||
|
*
|
|||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|||
|
*
|
|||
|
* Unless required by applicable law or agreed to in writing, software
|
|||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
|
* See the License for the specific language governing permissions and
|
|||
|
* limitations under the License.
|
|||
|
*
|
|||
|
*
|
|||
|
* ## Notice
|
|||
|
*
|
|||
|
* This code is a modification of an existing JavaScript implementation of the [Porter stemming algorithm]{@link https://tartarus.org/martin/PorterStemmer/}.
|
|||
|
*
|
|||
|
* ```text
|
|||
|
* Release 1 by 'andargor', Jul 2004
|
|||
|
* Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
|
|||
|
* ```
|
|||
|
*/
|
|||
|
|
|||
|
'use strict';
|
|||
|
|
|||
|
// MODULES //
|
|||
|
|
|||
|
var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
|
|||
|
var endsWith = require( '@stdlib/string/ends-with' );
|
|||
|
var lowercase = require( '@stdlib/string/lowercase' );
|
|||
|
var replace = require( '@stdlib/string/replace' );
|
|||
|
|
|||
|
|
|||
|
// VARIABLES //
|
|||
|
|
|||
|
// Step 2 lookup table mapping a matched suffix to its replacement:
var step2list = {
	'ational': 'ate',
	'tional': 'tion',
	'enci': 'ence',
	'anci': 'ance',
	'izer': 'ize',
	'bli': 'ble',
	'alli': 'al',
	'entli': 'ent',
	'eli': 'e',
	'ousli': 'ous',
	'ization': 'ize',
	'ation': 'ate',
	'ator': 'ate',
	'alism': 'al',
	'iveness': 'ive',
	'fulness': 'ful',
	'ousness': 'ous',
	'aliti': 'al',
	'iviti': 'ive',
	'biliti': 'ble',
	'logi': 'log'
};

// Step 3 lookup table mapping a matched suffix to its replacement (an empty string drops the suffix):
var step3list = {
	'icate': 'ic',
	'ative': '',
	'alize': 'al',
	'iciti': 'ic',
	'ical': 'ic',
	'ful': '',
	'ness': ''
};

// Regular expression fragments from which the measure patterns are assembled:
var c = '[^aeiou]'; // consonant
var v = '[aeiouy]'; // vowel
var C = c + '[^aeiouy]*'; // consonant sequence
var V = v + '[aeiou]*'; // vowel sequence

// Measure patterns (`m>1` and `m=1` both extend the `m>0` prefix):
var mgr0 = [ '^(', C, ')?', V, C ].join( '' ); // [C]VC... is m>0
var meq1 = mgr0 + '(' + V + ')?$'; // [C]VC[V] is m=1
var mgr1 = mgr0 + V + C; // [C]VCVC... is m>1
var sV = [ '^(', C, ')?', v ].join( '' ); // vowel in stem

// Compiled stem tests:
var RE_MGR0 = new RegExp( mgr0 );
var RE_MEQ1 = new RegExp( meq1 );
var RE_MGR1 = new RegExp( mgr1 );
var RE_SV = new RegExp( sV );
var RE_CV = new RegExp( '^' + C + v + '[^aeiouwxy]$' );
var RE_DOUBLE = new RegExp( '([^aeiouylsz])\\1$' );
var RE_ATBLIZ = /(at|bl|iz)$/;
var RE_LAST = /.$/;

// Suffix matchers for each step (the first capture group is always the candidate stem):
var RE_STEP1A = /^(.+?)(ss|i)es$/;
var RE2_STEP1A = /^(.+?)([^s])s$/;
var RE_STEP1B = /^(.+?)eed$/;
var RE2_STEP1B = /^(.+?)(ed|ing)$/;
var RE_STEP1C = /^(.+?)y$/;
var RE_STEP2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
var RE_STEP3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
var RE_STEP4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
var RE2_STEP4 = /^(.+?)(s|t)(ion)$/;
var RE_STEP5 = /^(.+?)e$/;
|
|||
|
|
|||
|
|
|||
|
// MAIN //
|
|||
|
|
|||
|
/**
* Reduces a word to its stem by applying the steps of the Porter stemming algorithm in sequence.
*
* ## Notes
*
* -   Inputs shorter than three characters are returned as provided (i.e., without being lowercased).
* -   All other inputs are lowercased before any suffix rule is applied.
*
* ## References
*
* -   Porter, Michael F. 1980. "An algorithm for suffix stripping." _Program_ 13 (3): 130–37. doi:[10.1108/eb046814][@porter:1980].
*
* [@porter:1980]: https://doi.org/10.1108/eb046814
*
* @param {string} word - input word
* @throws {TypeError} first argument must be a string primitive
* @returns {string} word stem
*
* @example
* var out = porterStemmer( 'walking' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'walked' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'walks' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'worldwide' );
* // returns 'worldwid'
*
* @example
* var out = porterStemmer( '' );
* // returns ''
*/
function porterStemmer( word ) {
	var startsWithY;
	var match;
	var base;

	if ( !isString( word ) ) {
		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + word + '`.' );
	}
	// Words with fewer than three characters are left untouched:
	if ( word.length < 3 ) {
		return word;
	}
	word = lowercase( word );

	// Temporarily uppercase a leading `y` so that the vowel character classes (which only list lowercase `y`) do not match it:
	startsWithY = ( word.charAt( 0 ) === 'y' );
	if ( startsWithY ) {
		word = 'Y' + word.substr( 1 );
	}

	// Step 1a (the patterns strip a trailing `es` after `ss`/`i`, or a trailing `s` not preceded by `s`):
	if ( RE_STEP1A.test( word ) ) {
		word = replace( word, RE_STEP1A, '$1$2' );
	} else if ( RE2_STEP1A.test( word ) ) {
		word = replace( word, RE2_STEP1A, '$1$2' );
	}

	// Step 1b (`eed`, `ed`, and `ing` endings):
	match = RE_STEP1B.exec( word );
	if ( match ) {
		// `eed` => `ee` when the preceding stem has measure greater than zero:
		if ( RE_MGR0.test( match[ 1 ] ) ) {
			word = replace( word, RE_LAST, '' );
		}
	} else {
		match = RE2_STEP1B.exec( word );
		if ( match && RE_SV.test( match[ 1 ] ) ) {
			// The remaining stem contains a vowel, so the ending is dropped before tidying up the stem:
			word = match[ 1 ];
			if ( RE_ATBLIZ.test( word ) ) {
				word += 'e';
			} else if ( RE_DOUBLE.test( word ) ) {
				word = replace( word, RE_LAST, '' );
			} else if ( RE_CV.test( word ) ) {
				word += 'e';
			}
		}
	}

	// Step 1c (a trailing `y` becomes `i` when the stem contains a vowel):
	match = RE_STEP1C.exec( word );
	if ( match && RE_SV.test( match[ 1 ] ) ) {
		word = match[ 1 ] + 'i';
	}

	// Step 2 (table-driven suffix substitution for stems with measure greater than zero):
	match = RE_STEP2.exec( word );
	if ( match && RE_MGR0.test( match[ 1 ] ) ) {
		word = match[ 1 ] + step2list[ match[ 2 ] ];
	}

	// Step 3 (table-driven suffix substitution for stems with measure greater than zero):
	match = RE_STEP3.exec( word );
	if ( match && RE_MGR0.test( match[ 1 ] ) ) {
		word = match[ 1 ] + step3list[ match[ 2 ] ];
	}

	// Step 4 (suffix removal for stems with measure greater than one; the `ion` rule is only considered when the first pattern does not match):
	match = RE_STEP4.exec( word );
	if ( match ) {
		base = match[ 1 ];
	} else {
		match = RE2_STEP4.exec( word );
		if ( match ) {
			// Keep the `s` or `t` which precedes `ion`:
			base = match[ 1 ] + match[ 2 ];
		}
	}
	if ( match && RE_MGR1.test( base ) ) {
		word = base;
	}

	// Step 5 (trailing `e` removal followed by `ll` => `l`):
	match = RE_STEP5.exec( word );
	if ( match ) {
		base = match[ 1 ];
		if (
			RE_MGR1.test( base ) ||
			( RE_MEQ1.test( base ) && !RE_CV.test( base ) )
		) {
			word = base;
		}
	}
	if ( endsWith( word, 'll' ) && RE_MGR1.test( word ) ) {
		word = replace( word, RE_LAST, '' );
	}

	// Restore a leading `y` which was uppercased prior to stemming:
	if ( startsWithY ) {
		word = 'y' + word.substr( 1 );
	}
	return word;
}
|
|||
|
|
|||
|
|
|||
|
// EXPORTS //
|
|||
|
|
|||
|
module.exports = porterStemmer;
|