digitallinguistics / javascript

A JavaScript library for working with linguistic data in DLx format
https://developer.digitallinguistics.io/javascript
MIT License
2 stars 0 forks source link

add tokenize utility #75

Open dwhieb opened 5 years ago

dwhieb commented 5 years ago

Move this utility into its own library, and consume it here as a dependency.

dwhieb commented 5 years ago

The tokenize method:

/**
 * Removes punctuation from a string, and then tokenizes the string, returning an array of tokens.
 *
 * Punctuation is stripped by mapping every punctuation item to an empty string and passing that map to `sanitize`. The remaining text is then split on runs of one or more delimiter characters.
 *
 * Delimiters are combined into a single regular expression character class. A delimiter that is a single special character (`\`, `]`, `[`, `^`, `-`) is escaped so that it is matched literally. Longer delimiters (such as the default `\s`) are inserted unchanged, so they may be character-class fragments.
 *
 * Because the result comes from `String.prototype.split`, a string that starts or ends with a delimiter yields an empty string as its first or last token.
 *
 * @param  {String} string        The string to tokenize
 * @param  {Array}  [delimiters]  An array of characters to tokenize the string with. Defaults to whitespace (`\s`) when falsy. An empty array means the string is not split at all.
 * @param  {Array}  [punctuation] An array of punctuation characters to remove from the string. Defaults to `defaultPunctuation` when falsy.
 * @return {Array}                Returns an array of tokens
 * @throws {TypeError}            If `string` is not a string, or if the delimiters or punctuation are not arrays
 * @instance
 */
function tokenize(string, delimiters, punctuation) {

  // NB: Don't use default parameters, since they don't allow `null` to be passed.
  const delims = delimiters || [`\\s`];
  const punct  = punctuation || defaultPunctuation;

  if (typeof string !== `string`) {
    throw new TypeError(`The string argument must be a string.`);
  }

  if (!(Array.isArray(delims) && Array.isArray(punct))) {
    throw new TypeError(`The delimiters and punctuation arguments must be arrays.`);
  }

  // Every punctuation item is replaced with nothing, i.e. removed.
  const substitutions = {};

  for (const item of punct) {
    substitutions[item] = ``;
  }

  const depunctuated = sanitize(string, substitutions);

  // A lone `\`, `]`, `[`, `^` or `-` would otherwise break, invert, or create a range in the character class.
  // Multi-character delimiters are left alone so that fragments like `\s` keep working.
  const classSpecial = /^[\\\]\[\^\-]$/u;

  const classBody = delims
  .map(delim => (classSpecial.test(delim) ? `\\${delim}` : delim))
  .join(``);

  // With nothing to split on, the whole depunctuated string is the only token.
  if (classBody === ``) {
    return [depunctuated];
  }

  const regexp = new RegExp(`[${classBody}]+`, `gu`);

  return depunctuated.split(regexp);

}
dwhieb commented 5 years ago

The Tokenizer class:

/**
 * A Tokenizer class that saves a set of delimiters and punctuation characters for repeated use in tokenizing strings.
 *
 * NB: `new Tokenizer(...)` does not return a Tokenizer instance. The constructor returns a plain function, and the properties below are attached to that function.
 * @type {Function}
 * @instance
 * @prop {Array} delimiters  An array of delimiters to tokenize with
 * @prop {Array} punctuation An array of punctuation characters to strip from the string
 */
class Tokenizer {
  /**
   * Create a new Tokenizer
   * @param  {Array}    delimiters  The Array of delimiters used by this Tokenizer
   * @param  {Array}    punctuation The Array of punctuation characters used by this Tokenizer
   * @return {Function}             Returns a tokenize function which tokenizes a string with the provided set of delimiters and punctuation characters. The function exposes `delimiters` and `punctuation` properties, which are read each time it is called.
   */
  constructor(delimiters, punctuation) {

    // Returning a function from the constructor discards `this`, so the
    // settings are stored on the returned function to keep them reachable.
    const tokenizer = string => tokenize(string, tokenizer.delimiters, tokenizer.punctuation);

    tokenizer.delimiters  = delimiters;
    tokenizer.punctuation = punctuation;

    return tokenizer;

  }
}
dwhieb commented 5 years ago

Tests:

    // Shared fixture: a short sentence containing both `!` and `?`.
    const tokenizationString = `Hello! Are you my mother?`;

    describe(`tokenize`, function() {

      const tokenize = utils.tokenize;

      // With no delimiters or punctuation given, the defaults should yield five bare words.
      it(`uses default punctuation`, function() {
        const words = tokenize(tokenizationString);
        expect(words.length).toBe(5);
        expect(words[0]).toBe(`Hello`);
        expect(words[4]).toBe(`mother`);
      });

      // An empty delimiter list leaves the string whole, so only punctuation removal is observed.
      it(`removes punctuation`, function() {
        const marks          = [`!`, `?`];
        const result         = tokenize(tokenizationString, [], marks);
        const hasPunctuation = result.some(token => marks.some(mark => token.includes(mark)));
        expect(result.length).toBe(1);
        expect(result[0]).toBe(`Hello Are you my mother`);
        expect(hasPunctuation).toBe(false);
      });

      // Splitting on a single space after stripping punctuation gives one token per word.
      it(`tokenizes correctly`, function() {
        const words = tokenize(tokenizationString, [` `], [`!`, `?`]);
        expect(words.length).toBe(5);
      });

    });

    // The Tokenizer constructor returns a function preconfigured with the given settings.
    it(`Tokenizer`, function() {
      const configured = new utils.Tokenizer([], [`!`, `?`]);
      const result     = configured(tokenizationString);
      expect(result[0]).toBe(`Hello Are you my mother`);
    });
dwhieb commented 4 years ago

The sanitize method:

/**
 * Applies a set of substitutions to a string, swapping each unwanted character for its replacement. A replacement of nothing (an empty string) simply removes the character wherever it occurs.
 *
 * This is a thin wrapper: the work is delegated to `transliterate`.
 * @param  {String} string        The string to sanitize
 * @param  {Object} substitutions An object whose keys are the characters to replace, and whose values are the characters to replace them with
 * @return {String}               Returns the sanitized string, with substitutions made
 * @instance
 */
function sanitize(string, substitutions) {
  const sanitized = transliterate(string, substitutions);
  return sanitized;
}
dwhieb commented 4 years ago

The Sanitizer class:

/**
 * A Sanitizer class that stores a set of substitutions for repeated use on different strings.
 *
 * NB: `new Sanitizer(...)` does not return a Sanitizer instance. The constructor returns a plain function, and the property below is attached to that function.
 * @type {Function}
 * @instance
 * @prop {Object} substitutions An Object containing the substitutions to apply to each string
 */
class Sanitizer {
  /**
   * Create a new Sanitizer function
   * @param  {Object} substitutions The set of substitutions to make (see the `sanitize` method)
   * @return {Function}             Returns a function which sanitizes a string with the given set of substitutions. The function exposes a `substitutions` property, which is read each time it is called.
   */
  constructor(substitutions) {

    // Returning a function from the constructor discards `this`, so the
    // substitutions are stored on the returned function to keep them reachable.
    const sanitizer = string => sanitize(string, sanitizer.substitutions);

    sanitizer.substitutions = substitutions;

    return sanitizer;

  }
}