time-to-botec/squiggle/node_modules/@stdlib/nlp/tokenize/lib/tokenize.js

188 lines
4.1 KiB
JavaScript
Raw Normal View History

/**
* @license Apache-2.0
*
* Copyright (c) 2018 The Stdlib Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
'use strict';
// MODULES //
var isBoolean = require( '@stdlib/assert/is-boolean' ).isPrimitive;
var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
var hasOwnProp = require( '@stdlib/assert/has-own-property' );
var ABBRS = require( './abbreviations.json' );
var EMOJIS = require( './emojis.json' );
var CONTRACT = require( './contractions.json' );
// VARIABLES //
// Matches (and captures) a single punctuation character at the very start of a string. No `g` flag is used, as the pattern is anchored and a global flag would only make this shared object stateful (`lastIndex`) for `test`/`exec` callers:
var REGEXP_PREFIXES = /^([,([{*<"“'`.])/;

// Matches (and captures) a single punctuation character at the very end of a string:
var REGEXP_SUFFIXES = /([,.!?%*>:;"'”`)\]}])$/;
// FUNCTIONS //
/**
* Extends an array by the elements of another array.
*
* ## Notes
*
* -   The input array is mutated in place and returned.
* -   The number of elements to copy is fixed before copying begins. Hence, extending an array with itself appends a single copy of its original contents instead of looping without end.
*
* @private
* @param {Array} arr - input array
* @param {Array} ext - array to extend `arr` with
* @returns {Array} mutated input array
*
* @example
* var arr = [ 1, 2, 3 ];
* var out = extend( arr, [ 4, 5 ] );
* // returns [ 1, 2, 3, 4, 5 ]
*/
function extend( arr, ext ) {
	var len;
	var i;

	// Snapshot the length so that growth of `arr` cannot move the loop bound when `ext` is the same array:
	len = ext.length;
	for ( i = 0; i < len; i++ ) {
		arr.push( ext[ i ] );
	}
	return arr;
}
/**
* Tokenizes a substring by peeling punctuation off both ends.
*
* ## Notes
*
* -   On each iteration, the remaining text is first looked up in the emoji, abbreviation, and contraction tables. If found, peeling stops and the remaining text is emitted as a single token.
* -   Otherwise, one leading punctuation character is removed if present; failing that, one trailing punctuation character is removed; failing that, peeling stops.
* -   Trailing characters are removed from the outside in, so each one is placed in front of those collected earlier in order to keep the original left-to-right order.
* -   If nothing is left after peeling (e.g., the input was empty or consisted solely of punctuation), no empty-string token is emitted.
*
* @private
* @param {string} substr - input string
* @returns {Array} token array
*
* @example
* var str = '(never)';
* var out = tokenizeSubstring( str );
* // returns [ '(', 'never', ')' ]
*
* @example
* var str = 'never).';
* var out = tokenizeSubstring( str );
* // returns [ 'never', ')', '.' ]
*/
function tokenizeSubstring( substr ) {
	var prefixes = [];
	var suffixes = [];
	var match;
	var done;
	var res;

	do {
		if (
			!EMOJIS[ substr ] &&
			!ABBRS[ substr ] &&
			!CONTRACT[ substr ]
		) {
			// Splitting on an anchored pattern with a capture group yields `[ '', <prefix>, <rest> ]` on a match and a single-element array otherwise:
			match = substr.split( REGEXP_PREFIXES );
			if ( match.length > 1 ) {
				prefixes.push( match[ 1 ] );
				substr = match[ 2 ];
			}
			else {
				// On a match, the result is `[ <rest>, <suffix>, '' ]`:
				match = substr.split( REGEXP_SUFFIXES );
				if ( match.length > 1 ) {
					substr = match[ 0 ];

					// The outermost suffix is found first, so prepend to preserve reading order:
					suffixes.unshift( match[ 1 ] );
				} else {
					done = true;
				}
			}
		}
		else {
			done = true;
		}
	} while ( !done );

	res = prefixes;

	// Avoid emitting an empty token when only punctuation (or nothing) was provided:
	if ( substr ) {
		res.push( substr );
	}
	extend( res, suffixes );
	return res;
}
// MAIN //
/**
* Tokenize a string.
*
* ## Notes
*
* -   The input is first split on runs of whitespace; each resulting chunk is then tokenized individually.
* -   Chunks which occur more than once are tokenized only once per call; their tokens are reused from a cache.
* -   Empty chunks, which arise when the input starts or ends with whitespace, are skipped and never produce empty-string tokens.
*
* @param {string} str - input string
* @param {boolean} [keepWhitespace=false] - boolean indicating whether whitespace characters should be returned as part of the token array
* @throws {TypeError} first argument must be a string primitive
* @throws {TypeError} second argument must be a boolean primitive
* @returns {Array} array of tokens
*
* @example
* var str = 'Hello World!';
* var out = tokenize( str );
* // returns [ 'Hello', 'World', '!' ]
*
* @example
* var str = '';
* var out = tokenize( str );
* // returns []
*
* @example
* var str = '  Hello World!  ';
* var out = tokenize( str );
* // returns [ 'Hello', 'World', '!' ]
*
* @example
* var str = 'Hello Mrs. Maple, could you call me back?';
* var out = tokenize( str );
* // returns [ 'Hello', 'Mrs.', 'Maple', ',', 'could', 'you', 'call', 'me', 'back', '?' ]
*/
function tokenize( str, keepWhitespace ) {
	var subtkns;
	var substrs;
	var tokens;
	var substr;
	var cache;
	var i;

	if ( !isString( str ) ) {
		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + str + '`.' );
	}
	// Only validate the second argument when it was actually supplied:
	if ( arguments.length > 1 ) {
		if ( !isBoolean( keepWhitespace ) ) {
			throw new TypeError( 'invalid argument. Second argument must be a boolean primitive. Value: `' + keepWhitespace + '`.' );
		}
	}
	if ( !str ) {
		return [];
	}

	// Split on whitespace (the capture group keeps the whitespace runs as separate elements):
	if ( keepWhitespace ) {
		substrs = str.split( /(\s+)/ );
	} else {
		substrs = str.split( /\s+/ );
	}

	// Set up cache to hold tokens for substring matches:
	cache = {};

	// Initialize token array:
	tokens = [];
	for ( i = 0; i < substrs.length; i++ ) {
		substr = substrs[ i ];
		if ( substr === '' ) {
			// Leading/trailing whitespace makes `split` yield empty strings, which are not tokens:
			continue;
		}
		if ( hasOwnProp( cache, substr ) ) {
			extend( tokens, cache[ substr ] );
		}
		else {
			subtkns = tokenizeSubstring( substr );
			extend( tokens, subtkns );
			cache[ substr ] = subtkns;
		}
	}
	return tokens;
}
// EXPORTS //
// Expose the `tokenize` function as this module's export:
module.exports = tokenize;