time-to-botec/squiggle/node_modules/@stdlib/nlp/porter-stemmer/lib/main.js

252 lines
6.0 KiB
JavaScript
Raw Normal View History

/**
* @license Apache-2.0
*
* Copyright (c) 2019 The Stdlib Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
* ## Notice
*
* This code is a modification of an existing JavaScript implementation of the [Porter stemming algorithm]{@link https://tartarus.org/martin/PorterStemmer/}.
*
* ```text
* Release 1 by 'andargor', Jul 2004
* Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
* ```
*/
'use strict';
// MODULES //
var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
var endsWith = require( '@stdlib/string/ends-with' );
var lowercase = require( '@stdlib/string/lowercase' );
var replace = require( '@stdlib/string/replace' );
// VARIABLES //
// Step 2 substitutions: each key is a suffix to strip, each value is the text appended to the stem in its place.
var step2list = {
	'ational': 'ate',
	'tional': 'tion',
	'enci': 'ence',
	'anci': 'ance',
	'izer': 'ize',
	'bli': 'ble',
	'alli': 'al',
	'entli': 'ent',
	'eli': 'e',
	'ousli': 'ous',
	'ization': 'ize',
	'ation': 'ate',
	'ator': 'ate',
	'alism': 'al',
	'iveness': 'ive',
	'fulness': 'ful',
	'ousness': 'ous',
	'aliti': 'al',
	'iviti': 'ive',
	'biliti': 'ble',
	'logi': 'log'
};

// Step 3 substitutions (same layout as the step 2 table; an empty string means the suffix is simply dropped):
var step3list = {
	'icate': 'ic',
	'ative': '',
	'alize': 'al',
	'iciti': 'ic',
	'ical': 'ic',
	'ful': '',
	'ness': ''
};

// Pattern fragments (as strings) from which the measure expressions are assembled...
var c = '[^aeiou]'; // a single consonant
var v = '[aeiouy]'; // a single vowel
var C = c + '[^aeiouy]*'; // a run of consonants
var V = v + '[aeiou]*'; // a run of vowels

// Measure patterns; `meq1` and `mgr1` extend the `[C]VC` prefix described by `mgr0`:
var mgr0 = '^(' + C + ')?' + V + C; // [C]VC... => m > 0
var meq1 = mgr0 + '(' + V + ')?$'; // [C]VC[V] => m = 1
var mgr1 = mgr0 + V + C; // [C]VCVC... => m > 1
var sV = '^(' + C + ')?' + v; // stem contains a vowel

// Compiled measure tests:
var RE_MGR0 = new RegExp( mgr0 );
var RE_MEQ1 = new RegExp( meq1 );
var RE_MGR1 = new RegExp( mgr1 );
var RE_SV = new RegExp( sV );

// Consonant sequence, vowel, and a final consonant other than `w`, `x`, or `y`:
var RE_CV = new RegExp( '^' + C + v + '[^aeiouwxy]$' );

// Suffix matchers for the individual steps (capture group 1 is always the lazily matched stem):
var RE_STEP1A = /^(.+?)(ss|i)es$/;
var RE2_STEP1A = /^(.+?)([^s])s$/;
var RE_STEP1B = /^(.+?)eed$/;
var RE2_STEP1B = /^(.+?)(ed|ing)$/;
var RE_STEP1C = /^(.+?)y$/;

// The step 2 and step 3 alternations list exactly the keys of their substitution tables, so they are generated from the tables:
var RE_STEP2 = new RegExp( '^(.+?)(' + Object.keys( step2list ).join( '|' ) + ')$' );
var RE_STEP3 = new RegExp( '^(.+?)(' + Object.keys( step3list ).join( '|' ) + ')$' );

var RE_STEP4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
var RE2_STEP4 = /^(.+?)(s|t)(ion)$/;
var RE_STEP5 = /^(.+?)e$/;

// Miscellaneous helpers: the final character, the `at`/`bl`/`iz` endings, and a doubled trailing character outside the set `aeiouylsz`:
var RE_LAST = /.$/;
var RE_ATBLIZ = /(at|bl|iz)$/;
var RE_DOUBLE = /([^aeiouylsz])\1$/;
// MAIN //
/**
* Applies step 1a: rewrites a trailing `sses`/`ies` to `ss`/`i`, or otherwise drops a final `s` which is not preceded by another `s`.
*
* @private
* @param {string} word - word to process
* @returns {string} processed word
*/
function applyStep1a( word ) {
	if ( RE_STEP1A.test( word ) ) {
		return replace( word, RE_STEP1A, '$1$2' );
	}
	if ( RE2_STEP1A.test( word ) ) {
		return replace( word, RE2_STEP1A, '$1$2' );
	}
	return word;
}

/**
* Applies step 1b: handles the `eed`, `ed`, and `ing` endings.
*
* @private
* @param {string} word - word to process
* @returns {string} processed word
*/
function applyStep1b( word ) {
	var parts;
	var stem;

	parts = RE_STEP1B.exec( word );
	if ( parts !== null ) {
		// An `eed` ending only loses its final character, and only when the preceding stem has measure m > 0 (no other rule of this step is tried):
		if ( RE_MGR0.test( parts[ 1 ] ) ) {
			return replace( word, RE_LAST, '' );
		}
		return word;
	}
	parts = RE2_STEP1B.exec( word );
	if ( parts === null ) {
		return word;
	}
	stem = parts[ 1 ];

	// The `ed`/`ing` ending is removed only when the remaining stem contains a vowel:
	if ( !RE_SV.test( stem ) ) {
		return word;
	}
	// Tidy up the stem after removal...
	if ( RE_ATBLIZ.test( stem ) ) {
		return stem + 'e';
	}
	if ( RE_DOUBLE.test( stem ) ) {
		// Collapse the doubled final character:
		return replace( stem, RE_LAST, '' );
	}
	if ( RE_CV.test( stem ) ) {
		return stem + 'e';
	}
	return stem;
}

/**
* Applies step 1c: turns a final `y` into `i` when the preceding stem contains a vowel.
*
* @private
* @param {string} word - word to process
* @returns {string} processed word
*/
function applyStep1c( word ) {
	var parts = RE_STEP1C.exec( word );
	if ( parts !== null && RE_SV.test( parts[ 1 ] ) ) {
		return parts[ 1 ] + 'i';
	}
	return word;
}

/**
* Swaps a matched suffix for its table entry when the preceding stem has measure m > 0 (shared by steps 2 and 3).
*
* @private
* @param {string} word - word to process
* @param {RegExp} re - regular expression capturing the stem (group 1) and the suffix (group 2)
* @param {Object} table - maps each suffix to its replacement
* @returns {string} processed word
*/
function substituteSuffix( word, re, table ) {
	var parts = re.exec( word );
	if ( parts !== null && RE_MGR0.test( parts[ 1 ] ) ) {
		return parts[ 1 ] + table[ parts[ 2 ] ];
	}
	return word;
}

/**
* Applies step 4: strips a residual suffix when the remaining stem has measure m > 1.
*
* @private
* @param {string} word - word to process
* @returns {string} processed word
*/
function applyStep4( word ) {
	var parts;
	var stem;

	parts = RE_STEP4.exec( word );
	if ( parts !== null ) {
		stem = parts[ 1 ];
	} else {
		// The `ion` ending is only considered when none of the other suffixes matched; the preceding `s` or `t` stays part of the stem:
		parts = RE2_STEP4.exec( word );
		if ( parts === null ) {
			return word;
		}
		stem = parts[ 1 ] + parts[ 2 ];
	}
	return ( RE_MGR1.test( stem ) ) ? stem : word;
}

/**
* Applies step 5: drops a final `e` and reduces a final `ll` to a single `l`, subject to the measure conditions.
*
* @private
* @param {string} word - word to process
* @returns {string} processed word
*/
function applyStep5( word ) {
	var parts;
	var stem;

	parts = RE_STEP5.exec( word );
	if ( parts !== null ) {
		stem = parts[ 1 ];
		if ( RE_MGR1.test( stem ) ) {
			word = stem;
		} else if ( RE_MEQ1.test( stem ) && !RE_CV.test( stem ) ) {
			word = stem;
		}
	}
	if ( endsWith( word, 'll' ) && RE_MGR1.test( word ) ) {
		word = replace( word, RE_LAST, '' );
	}
	return word;
}

/**
* Extracts the stem of a given word using the Porter stemming algorithm.
*
* ## Notes
*
* - Inputs shorter than three characters are returned unchanged.
* - Longer inputs are converted to lowercase before stemming.
*
* ## References
*
* - Porter, Michael F. 1980. "An algorithm for suffix stripping." _Program_ 14 (3): 130–37. doi:[10.1108/eb046814][@porter:1980].
*
* [@porter:1980]: https://doi.org/10.1108/eb046814
*
* @param {string} word - input word
* @throws {TypeError} first argument must be a string primitive
* @returns {string} word stem
*
* @example
* var out = porterStemmer( 'walking' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'walked' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'walks' );
* // returns 'walk'
*
* @example
* var out = porterStemmer( 'worldwide' );
* // returns 'worldwid'
*
* @example
* var out = porterStemmer( '' );
* // returns ''
*/
function porterStemmer( word ) {
	var hasLeadingY;

	if ( !isString( word ) ) {
		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + word + '`.' );
	}
	if ( word.length < 3 ) {
		return word;
	}
	word = lowercase( word );

	// Temporarily capitalize a leading `y` so that the vowel patterns (which only list lowercase letters) do not match it:
	hasLeadingY = ( word[ 0 ] === 'y' );
	if ( hasLeadingY ) {
		word = 'Y' + word.slice( 1 );
	}
	word = applyStep1a( word );
	word = applyStep1b( word );
	word = applyStep1c( word );
	word = substituteSuffix( word, RE_STEP2, step2list );
	word = substituteSuffix( word, RE_STEP3, step3list );
	word = applyStep4( word );
	word = applyStep5( word );

	// Turn the initial `Y` back into `y`:
	if ( hasLeadingY ) {
		word = 'y' + word.slice( 1 );
	}
	return word;
}
// EXPORTS //
module.exports = porterStemmer;