Open dwhieb opened 5 years ago
The tokenize method:
/**
 * Strips punctuation from a string, then splits the result into tokens on runs of delimiter characters.
 *
 * Single-character delimiters that are special inside a regular-expression character class
 * (`\`, `]`, `^`, `-`) are matched literally. Multi-character entries (such as the default `\s`)
 * are inserted into the character class unchanged, so regex class escapes keep working.
 *
 * @param {String} string The string to tokenize
 * @param {Array} [delimiters] An array of characters to tokenize the string with. Falls back to whitespace (`\s`) when falsy.
 * @param {Array} [punctuation] An array of punctuation characters to remove from the string. Falls back to `defaultPunctuation` when falsy.
 * @return {Array} Returns an array of tokens
 * @throws {TypeError} If `string` is not a string, or if `delimiters` / `punctuation` are truthy but not arrays
 * @instance
 */
function tokenize(string, delimiters, punctuation) {

  // NB: Don't use default parameters, since they don't allow `null` to be passed.
  const delims = delimiters || [`\\s`];
  const punct = punctuation || defaultPunctuation;

  if (typeof string !== `string`) {
    throw new TypeError(`The string argument must be a string.`);
  }

  if (!(Array.isArray(delims) && Array.isArray(punct))) {
    throw new TypeError(`The delimiters and punctuation arguments must be arrays.`);
  }

  // Map every punctuation character to the empty string so that sanitize() removes it.
  const substitutions = {};
  punct.forEach(item => { substitutions[item] = ``; });

  const depunctuated = sanitize(string, substitutions);

  // These characters change the meaning of a character class when left unescaped:
  // `\` and `]` produce an invalid pattern, a leading `^` negates the class,
  // and `-` between two other members creates a range.
  const classSpecials = new Set([`\\`, `]`, `^`, `-`]);
  const classMembers = delims.map(delim => (classSpecials.has(delim) ? `\\${delim}` : delim));

  // One or more consecutive delimiters count as a single separator.
  // With no delimiters the class is empty and never matches, so the whole string is one token.
  const pattern = `[${classMembers.join(``)}]+`;
  const regexp = new RegExp(pattern, `gu`);

  return depunctuated.split(regexp);

}
The Tokenizer class:
/**
 * A Tokenizer class that saves a set of delimiters and punctuation characters for repeated use in tokenizing strings.
 * Calling `new Tokenizer(...)` yields a tokenize function rather than a class instance;
 * the saved settings are exposed as properties of that function.
 * @type {Function}
 * @instance
 * @prop {Array} delimiters An array of delimiters to tokenize with
 * @prop {Array} punctuation An array of punctuation characters to strip from the string
 */
class Tokenizer {

  /**
   * Create a new Tokenizer
   * @param {Array} delimiters The Array of delimiters used by this Tokenizer
   * @param {Array} punctuation The Array of punctuation characters used by this Tokenizer
   * @return {Function} Returns a tokenize function which tokenizes a string with the provided set of delimiters and punctuation characters. The function carries `delimiters` and `punctuation` properties.
   */
  constructor(delimiters, punctuation) {

    // Returning a function from a constructor replaces `this` as the result of `new`,
    // so the documented props must live on the function itself to be reachable by callers.
    const boundTokenize = string => tokenize(string, boundTokenize.delimiters, boundTokenize.punctuation);

    boundTokenize.delimiters = delimiters;
    boundTokenize.punctuation = punctuation;

    return boundTokenize;

  }

}
Tests:
// Shared fixture: a sentence containing both `!` and `?` so punctuation stripping is observable.
const tokenizationString = `Hello! Are you my mother?`;

describe(`tokenize`, () => {

  const tokenize = utils.tokenize;

  // With no delimiters or punctuation supplied, the defaults are used.
  it(`uses default punctuation`, () => {
    const words = tokenize(tokenizationString);
    expect(words.length).toBe(5);
    expect(words[0]).toBe(`Hello`);
    expect(words[4]).toBe(`mother`);
  });

  // An empty delimiter list means the string is never split, only depunctuated.
  it(`removes punctuation`, () => {
    const marks = [`!`, `?`];
    const result = tokenize(tokenizationString, [], marks);
    const stillPunctuated = result.some(token => marks.some(mark => token.includes(mark)));
    expect(result.length).toBe(1);
    expect(result[0]).toBe(`Hello Are you my mother`);
    expect(stillPunctuated).toBe(false);
  });

  it(`tokenizes correctly`, () => {
    const words = tokenize(tokenizationString, [` `], [`!`, `?`]);
    expect(words.length).toBe(5);
  });

});

// `new Tokenizer(...)` hands back a preconfigured tokenize function.
it(`Tokenizer`, () => {
  const tokenizeSentence = new utils.Tokenizer([], [`!`, `?`]);
  const tokens = tokenizeSentence(tokenizationString);
  expect(tokens[0]).toBe(`Hello Are you my mother`);
});
The sanitize method:
/**
 * Applies a set of substitutions to a string by delegating to `transliterate`.
 * Each key of `substitutions` is a character to replace and its value is the replacement;
 * an empty-string value is intended to remove the character wherever it occurs.
 * @param {String} string The string to sanitize
 * @param {Object} substitutions An object mapping each unwanted character to the character(s) that should replace it
 * @return {String} Returns the sanitized string, with substitutions made
 * @instance
 */
function sanitize(string, substitutions) {
  const sanitized = transliterate(string, substitutions);
  return sanitized;
}
The Sanitizer class:
/**
 * A Sanitizer class that stores a set of substitutions for repeated use on different strings.
 * Calling `new Sanitizer(...)` yields a sanitize function rather than a class instance;
 * the stored substitutions are exposed as a property of that function.
 * @type {Function}
 * @instance
 * @prop {Object} substitutions An Object containing the substitutions to apply to each string
 */
class Sanitizer {

  /**
   * Create a new Sanitizer function
   * @param {Object} substitutions The set of substitutions to make (see the `sanitize` method)
   * @return {Function} Returns a function which always sanitizes a string with the given set of substitutions. The function carries a `substitutions` property.
   */
  constructor(substitutions) {

    // Returning a function from a constructor replaces `this` as the result of `new`,
    // so the documented prop must live on the function itself to be reachable by callers.
    const boundSanitize = string => sanitize(string, boundSanitize.substitutions);

    boundSanitize.substitutions = substitutions;

    return boundSanitize;

  }

}
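Move these utilities (`tokenize`, `Tokenizer`, `sanitize`, and `Sanitizer`) into their own library, and make that library a dependency of this project.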