188 lines
4.1 KiB
JavaScript
188 lines
4.1 KiB
JavaScript
|
/**
* @license Apache-2.0
*
* Copyright (c) 2018 The Stdlib Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
|
|||
|
|
|||
|
'use strict';
|
|||
|
|
|||
|
// MODULES //
|
|||
|
|
|||
|
var isBoolean = require( '@stdlib/assert/is-boolean' ).isPrimitive;
|
|||
|
var isString = require( '@stdlib/assert/is-string' ).isPrimitive;
|
|||
|
var hasOwnProp = require( '@stdlib/assert/has-own-property' );
|
|||
|
var ABBRS = require( './abbreviations.json' );
|
|||
|
var EMOJIS = require( './emojis.json' );
|
|||
|
var CONTRACT = require( './contractions.json' );
|
|||
|
|
|||
|
|
|||
|
// VARIABLES //
|
|||
|
|
|||
|
// Matches exactly one leading punctuation character (comma, opening paren/bracket/brace, asterisk, `<`, straight or curly opening quote, backtick, or period) anchored at the start of the string. The capture group makes `String.prototype.split` include the matched character in its result.
// NOTE(review): the `g` and `i` flags look redundant (anchored pattern, no letters, only used with `split` in this file) - confirm before removing.
var REGEXP_PREFIXES = /^([,([{*<"“'`‘.])/gi;
|
|||
|
// Matches exactly one trailing punctuation character (comma, period, `!`, `?`, `%`, asterisk, `>`, colon, semicolon, straight or curly closing double quote, straight single quote, backtick, or closing paren/bracket/brace) anchored at the end of the string. The capture group makes `String.prototype.split` include the matched character in its result.
// NOTE(review): unlike the prefix pattern, which lists the curly opening single quote, this pattern does not list the curly closing single quote - confirm whether that asymmetry is intentional.
var REGEXP_SUFFIXES = /([,.!?%*>:;"'”`)\]}])$/gi;
|
|||
|
|
|||
|
|
|||
|
// FUNCTIONS //
|
|||
|
|
|||
|
/**
* Appends every element of one array to the end of another array.
*
* @private
* @param {Array} arr - array to append to (mutated in place)
* @param {Array} ext - array supplying the elements to append
* @returns {Array} the same `arr` reference, now extended
*
* @example
* var arr = [ 1, 2, 3 ];
* var out = extend( arr, [ 4, 5 ] );
* // returns [ 1, 2, 3, 4, 5 ]
*/
function extend( arr, ext ) {
	var idx = 0;

	// Walk `ext` front to back, copying one element per step:
	while ( idx < ext.length ) {
		arr.push( ext[ idx ] );
		idx += 1;
	}
	return arr;
}
|
|||
|
|
|||
|
/**
* Tokenizes a substring by peeling punctuation off of both ends.
*
* ## Notes
*
* -   On each pass, the current substring is looked up in the emoji, abbreviation, and contraction tables. If any lookup is truthy, the substring is kept intact and processing stops.
* -   Otherwise, one leading punctuation character is removed (if present); failing that, one trailing punctuation character is removed (if present); failing that, processing stops.
* -   Tokens are returned in their original left-to-right order. If nothing remains between the stripped prefixes and suffixes (e.g., the input is only punctuation), no empty-string token is emitted.
*
* @private
* @param {string} substr - input string
* @returns {Array} token array
*
* @example
* var str = '(never)';
* var out = tokenizeSubstring( str );
* // returns [ '(', 'never', ')' ]
*/
function tokenizeSubstring( substr ) {
	var prefixes = [];
	var suffixes = [];
	var match;
	var done;
	var res;

	do {
		if (
			!EMOJIS[ substr ] &&
			!ABBRS[ substr ] &&
			!CONTRACT[ substr ]
		) {
			// The capture group makes `split` return `[ '', <prefix>, <rest> ]` on a match and a single-element array otherwise:
			match = substr.split( REGEXP_PREFIXES );
			if ( match.length > 1 ) {
				prefixes.push( match[ 1 ] );
				substr = match[ 2 ];
			}
			else {
				// On a match, `split` returns `[ <rest>, <suffix>, '' ]`:
				match = substr.split( REGEXP_SUFFIXES );
				if ( match.length > 1 ) {
					substr = match[ 0 ];

					// Suffixes are stripped from the outside in, so prepend in order to preserve their left-to-right order:
					suffixes.unshift( match[ 1 ] );
				} else {
					done = true;
				}
			}
		}
		else {
			done = true;
		}
	} while ( !done );

	res = prefixes;

	// Avoid emitting an empty token when the substring consisted solely of punctuation:
	if ( substr ) {
		res.push( substr );
	}
	extend( res, suffixes );
	return res;
}
|
|||
|
|
|||
|
|
|||
|
// MAIN //
|
|||
|
|
|||
|
/**
* Tokenizes a string.
*
* ## Notes
*
* -   The input is split on runs of whitespace, and each resulting substring is passed to `tokenizeSubstring`.
* -   Token arrays are cached per distinct substring for the duration of a call, so repeated substrings are only processed once.
* -   Empty substrings, which `split` produces when the input begins or ends with whitespace, are skipped and thus never yield tokens.
*
* @param {string} str - input string
* @param {boolean} [keepWhitespace=false] - boolean indicating whether whitespace characters should be returned as part of the token array
* @throws {TypeError} first argument must be a string primitive
* @throws {TypeError} second argument must be a boolean primitive
* @returns {Array} array of tokens
*
* @example
* var str = 'Hello World!';
* var out = tokenize( str );
* // returns [ 'Hello', 'World', '!' ]
*
* @example
* var str = '  Hello World!  ';
* var out = tokenize( str );
* // returns [ 'Hello', 'World', '!' ]
*
* @example
* var str = '';
* var out = tokenize( str );
* // returns []
*
* @example
* var str = 'Hello Mrs. Maple, could you call me back?';
* var out = tokenize( str );
* // returns [ 'Hello', 'Mrs.', 'Maple', ',', 'could', 'you', 'call', 'me', 'back', '?' ]
*/
function tokenize( str, keepWhitespace ) {
	var subtkns;
	var substrs;
	var tokens;
	var substr;
	var cache;
	var i;

	if ( !isString( str ) ) {
		throw new TypeError( 'invalid argument. First argument must be a string primitive. Value: `' + str + '`.' );
	}
	if ( arguments.length > 1 ) {
		if ( !isBoolean( keepWhitespace ) ) {
			throw new TypeError( 'invalid argument. Second argument must be a boolean primitive. Value: `' + keepWhitespace + '`.' );
		}
	}
	if ( !str ) {
		return [];
	}

	// Split on whitespace (a capture group makes `split` retain the whitespace runs as separate elements):
	if ( keepWhitespace ) {
		substrs = str.split( /(\s+)/ );
	} else {
		substrs = str.split( /\s+/ );
	}

	// Set up cache to hold tokens for substring matches:
	cache = {};

	// Initialize token array:
	tokens = [];

	for ( i = 0; i < substrs.length; i++ ) {
		substr = substrs[ i ];

		// Leading/trailing whitespace causes `split` to return empty strings, which are not tokens:
		if ( substr === '' ) {
			continue;
		}
		if ( hasOwnProp( cache, substr ) ) {
			extend( tokens, cache[ substr ] );
		}
		else {
			subtkns = tokenizeSubstring( substr );
			extend( tokens, subtkns );
			cache[ substr ] = subtkns;
		}
	}
	return tokens;
}
|
|||
|
|
|||
|
|
|||
|
// EXPORTS //
|
|||
|
|
|||
|
// Expose `tokenize` as this module's sole export:
module.exports = tokenize;
|