188 lines
4.1 KiB
JavaScript
188 lines
4.1 KiB
JavaScript
/**
|
||
* @license Apache-2.0
|
||
*
|
||
* Copyright (c) 2018 The Stdlib Authors.
|
||
*
|
||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
* you may not use this file except in compliance with the License.
|
||
* You may obtain a copy of the License at
|
||
*
|
||
* http://www.apache.org/licenses/LICENSE-2.0
|
||
*
|
||
* Unless required by applicable law or agreed to in writing, software
|
||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
* See the License for the specific language governing permissions and
|
||
* limitations under the License.
|
||
*/
|
||
|
||
'use strict';
|
||
|
||
// MODULES //
|
||
|
||
var isBoolean = require( '@stdlib/assert/is-boolean' ).isPrimitive;
|
||
var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
|
||
var hasOwnProp = require( '@stdlib/assert/has-own-property' );
|
||
var ABBRS = require( './abbreviations.json' );
|
||
var EMOJIS = require( './emojis.json' );
|
||
var CONTRACT = require( './contractions.json' );
|
||
|
||
|
||
// VARIABLES //
|
||
|
||
// Matches exactly one leading punctuation character at the very start of a string: comma, opening parenthesis/bracket/brace, asterisk, less-than sign, straight or left curly quotation marks, backtick, or period. The capture group makes `String.prototype.split` return the matched character together with the remainder of the string.
// NOTE(review): the `g` and `i` flags look unnecessary for the way this pattern is used with `split` in this file - confirm before removing.
var REGEXP_PREFIXES = /^([,([{*<"“'`‘.])/gi;
|
||
// Matches exactly one trailing punctuation character at the very end of a string: comma, period, exclamation/question mark, percent sign, asterisk, greater-than sign, colon, semicolon, straight or right curly quotation marks, backtick, or closing parenthesis/bracket/brace. The capture group makes `String.prototype.split` return the matched character together with the text preceding it.
// NOTE(review): the `g` and `i` flags look unnecessary for the way this pattern is used with `split` in this file - confirm before removing.
var REGEXP_SUFFIXES = /([,.!?%*>:;"'”`)\]}])$/gi;
|
||
|
||
|
||
// FUNCTIONS //
|
||
|
||
/**
* Extends an array by the elements of another array.
*
* ## Notes
*
* -   The input array is mutated in place.
* -   The number of elements to append is fixed before copying begins. Accordingly, extending an array with itself appends a single copy of its original contents instead of looping without end.
*
* @private
* @param {Array} arr - input array
* @param {Array} ext - array to extend `arr` with
* @returns {Array} mutated input array
*
* @example
* var arr = [ 1, 2, 3 ];
* var out = extend( arr, [ 4, 5 ] );
* // returns [ 1, 2, 3, 4, 5 ]
*
* @example
* var arr = [ 1, 2 ];
* var out = extend( arr, arr );
* // returns [ 1, 2, 1, 2 ]
*/
function extend( arr, ext ) {
	var n;
	var i;

	// Snapshot the length up front: when `ext` and `arr` are the same array, `ext.length` grows with every push and a live bound would never be reached...
	n = ext.length;
	for ( i = 0; i < n; i++ ) {
		arr.push( ext[ i ] );
	}
	return arr;
}
|
||
|
||
/**
* Tokenizes a substring.
*
* ## Notes
*
* -   Punctuation is peeled off one character at a time, first from the front and then from the back, until the remainder is a key of the emoji, abbreviation, or contraction tables or until no further punctuation can be removed.
* -   Prefixes and suffixes are returned in the order in which they appear in the input.
* -   If nothing remains after removing punctuation (e.g., the input is `'('` or an empty string), no empty-string token is emitted.
*
* @private
* @param {string} substr - input string
* @returns {Array} token array
*
* @example
* var str = '(never)';
* var out = tokenizeSubstring( str );
* // returns [ '(', 'never', ')' ]
*
* @example
* var str = '(never).';
* var out = tokenizeSubstring( str );
* // returns [ '(', 'never', ')', '.' ]
*/
function tokenizeSubstring( substr ) {
	var prefixes = [];
	var suffixes = [];
	var match;
	var done;
	var res;

	do {
		if (
			!EMOJIS[ substr ] &&
			!ABBRS[ substr ] &&
			!CONTRACT[ substr ]
		) {
			// Splitting on a pattern with a capture group yields `[ '', <prefix>, <rest> ]` when a leading punctuation character is present:
			match = substr.split( REGEXP_PREFIXES );
			if ( match.length > 1 ) {
				prefixes.push( match[ 1 ] );
				substr = match[ 2 ];
			}
			else {
				// Likewise, a trailing punctuation character yields `[ <rest>, <suffix>, '' ]`:
				match = substr.split( REGEXP_SUFFIXES );
				if ( match.length > 1 ) {
					substr = match[ 0 ];

					// Suffixes are discovered from the outside in, so prepend in order to preserve their original left-to-right order:
					suffixes.unshift( match[ 1 ] );
				} else {
					done = true;
				}
			}
		}
		else {
			// Known emoji, abbreviation, or contraction: keep as a single token...
			done = true;
		}
	} while ( !done );

	res = prefixes;

	// Avoid emitting an empty token when the input consisted solely of punctuation (or was empty):
	if ( substr ) {
		res.push( substr );
	}
	extend( res, suffixes );
	return res;
}
|
||
|
||
|
||
// MAIN //
|
||
|
||
/**
* Tokenize a string.
*
* ## Notes
*
* -   The input string is first split on runs of whitespace. Each resulting substring is then broken into punctuation and word tokens.
* -   Leading and trailing whitespace never produces empty-string tokens.
* -   Tokens for repeated substrings are computed once and reused.
*
* @param {string} str - input string
* @param {boolean} [keepWhitespace=false] - boolean indicating whether whitespace characters should be returned as part of the token array
* @throws {TypeError} first argument must be a string primitive
* @throws {TypeError} second argument must be a boolean primitive
* @returns {Array} array of tokens
*
* @example
* var str = 'Hello World!';
* var out = tokenize( str );
* // returns [ 'Hello', 'World', '!' ]
*
* @example
* var str = '';
* var out = tokenize( str );
* // returns []
*
* @example
* var str = '  Hello World!  ';
* var out = tokenize( str );
* // returns [ 'Hello', 'World', '!' ]
*
* @example
* var str = 'Hello Mrs. Maple, could you call me back?';
* var out = tokenize( str );
* // returns [ 'Hello', 'Mrs.', 'Maple', ',', 'could', 'you', 'call', 'me', 'back', '?' ]
*/
function tokenize( str, keepWhitespace ) {
	var subtkns;
	var substrs;
	var tokens;
	var substr;
	var cache;
	var i;
	if ( !isString( str ) ) {
		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + str + '`.' );
	}
	if ( arguments.length > 1 ) {
		if ( !isBoolean( keepWhitespace ) ) {
			throw new TypeError( 'invalid argument. Second argument must be a boolean primitive. Value: `' + keepWhitespace + '`.' );
		}
	}
	if ( !str ) {
		return [];
	}

	// Split on whitespace (a capture group keeps the whitespace runs in the result):
	if ( keepWhitespace ) {
		substrs = str.split( /(\s+)/ );
	} else {
		substrs = str.split( /\s+/ );
	}

	// Set up cache to hold tokens for substring matches:
	cache = {};

	// Initialize token array:
	tokens = [];

	for ( i = 0; i < substrs.length; i++ ) {
		substr = substrs[ i ];

		// Splitting yields empty strings when the input begins or ends with whitespace; these are artifacts of the split, not tokens:
		if ( substr === '' ) {
			continue;
		}
		if ( hasOwnProp( cache, substr ) ) {
			extend( tokens, cache[ substr ] );
		}
		else {
			subtkns = tokenizeSubstring( substr );
			extend( tokens, subtkns );
			cache[ substr ] = subtkns;
		}
	}
	return tokens;
}
|
||
|
||
|
||
// EXPORTS //
|
||
|
||
module.exports = tokenize;
|