spencermountain / compromise

modest natural-language processing
http://compromise.cool
MIT License
11.39k stars 654 forks source link

values.toApproximate #490

Open ohenepee opened 6 years ago

ohenepee commented 6 years ago

Could you please add .quantify(list), as in Pattern.en? Also, a WordNet lookup with .meet(noun1, noun2), as in NodeBox Linguistics.

Two sweet and essential missing NLP pieces. Nice job by the way... gradually making JS great again! :)

spencermountain commented 6 years ago

hi @ohenepee thanks. I don't quite understand the request, can you elaborate on what quantify would do? cheers

ohenepee commented 6 years ago

quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])

several chickens, a pair of geese and a duck

quantify({'carrot': 100, 'parrot': 20})

dozens of carrots and a score of parrots

quantify('carrot', 1000)

hundreds of carrots

As found here

ohenepee commented 6 years ago

For the love I have of compromise. This is a 99% port of the Python implementation found in Pattern.en. The only thing missing is the plural dictionary, which I had no use for. I guess someone can add it.

@spencermountain Sorry I couldn't submit a pull request.

// QUANTIFY: Approximates quantities of objects ("dozens of chickens" etc.)

const inflection = require("inflection") // should be replaced with compromise
const Articles = require("articles") // should be replaced with compromise

// Verbose numerals, each mapped to a two-number pair.
// These were Python tuples in the original implementation; JavaScript has no
// tuple literal, and `(1, 0.5)` is the comma operator (it evaluates to 0.5),
// so the pairs are written as arrays to keep both numbers.
// NOTE(review): the meaning of each slot is not shown in this file — presumably
// [whole part, fractional part]; confirm against the Pattern.en source.
const NUMERALS_VERBOSE = {
    "half"  : [ 1, 0.5],
    "dozen" : [12, 0.0],
    "score" : [20, 0.0]
}

// Names of the orders of magnitude, smallest first: "hundred", "thousand",
// then every "-illion" from "million" up to "vigintillion".
const ORDER = [
    "hundred",
    "thousand",
    ...[
        "m", "b", "tr", "quadr", "quint",
        "sext", "sept", "oct", "non", "dec",
        "undec", "duodec", "tredec", "quattuordec", "quindec",
        "sexdec", "septemdec", "octodec", "novemdec", "vigint"
    ].map(prefix => `${prefix}illion`)
]

// Lookup from order name to its numeric value:
// {"hundred": 100, "thousand": 1000, "million": 1000000, ...}
var O = {}
ORDER.forEach((name, index) => {
    if (index === 0) {
        O[name] = 100
    } else if (index === 1) {
        O[name] = 1000
    } else {
        // Each "-illion" is a thousand times the previous one, starting at a million.
        O[name] = 1000000 * 1000 ** (index - 2)
    }
})

// Words and symbols used when writing numbers out.
const ZERO = "zero"
const MINUS = "minus"
const RADIX = "point"
const THOUSANDS = ","
const CONJUNCTION = "and"

// APPROXIMATE

// Quantifier phrases; the trailing note on each line is the amount range
// for which `approximate` picks it.
const NONE = "no" // exactly 0
const PAIR = "a pair of" // exactly 2
const SEVERAL = "several" // 3 up to 7
const NUMBER = "a number of" // 8 up to 17
const SCORE = "a score of" // 18 up to 22
const DOZENS = "dozens of" // 23 up to 199
const COUNTLESS = "countless" // above 10,000,000

function approximate(word, amount=1, plural) {
    /*  Returns an approximation of the number of given objects.

        word   - singular noun to describe.
        amount - how many there are (default 1). Coerced with Number(), so
                 numeric strings are accepted; null therefore counts as 0.
        plural - passed straight through as the second argument of
                 inflection.pluralize.

        0 is "no", 1 gets an article, 2 is "a pair of", 3-7 is "several",
        8-17 is "a number of", 18-22 is "a score of", 23-199 is "dozens of",
        more than 10,000,000 is "countless". Everything in between is
        described as tens or hundreds of thousands or millions; an exact power
        of ten belongs to the bucket below it (1000 => "hundreds of ...").
        Amounts that match no bucket (negative, NaN, fractions such as 2.5)
        yield the bare plural.
        For example: approximate("chicken", 100) => "dozens of chickens".

        If pluralizing throws, the error is logged with console.error and
        undefined is returned.
    */
    let p

    try {
        p = inflection.pluralize(word, plural)
    } catch (e) {
        // Best-effort contract: report the problem, hand back undefined.
        console.error(e)
        return undefined
    }

    // Coerce once so the comparisons below can be strict.
    const n = Number(amount)

    // Anything up to 200.
    if (n === 0)
        return `${NONE} ${p}`
    if (n === 1)
        return Articles.articlize(word) // "a" chicken, "an" elephant
    if (n === 2)
        return `${PAIR} ${p}`
    if (3 <= n && n < 8)
        return `${SEVERAL} ${p}`
    if (8 <= n && n < 18)
        return `${NUMBER} ${p}`
    if (18 <= n && n < 23)
        return `${SCORE} ${p}`
    if (23 <= n && n < 200)
        return `${DOZENS} ${p}`
    if (n > 10000000)
        return `${COUNTLESS} ${p}`

    // Hundreds and thousands.
    // magnitude is the k with 10^k < n <= 10^(k+1). It is found by exact
    // multiplication rather than Math.log(n) / Math.log(10), whose rounding
    // error (2.9999999999999996 for 1000) the bucket boundaries used to rely on.
    // n is at most 10,000,000 here, so the loop runs at most a few times;
    // NaN and negative amounts never enter it.
    let magnitude = 0
    for (let bound = 10; bound < n; bound *= 10) {
        magnitude += 1
    }

    const thousands = Math.floor(magnitude / 3) // index into ORDER: 1 = thousand, 2 = million
    const hundreds = magnitude % 3              // 1 = "tens of", 2 = "hundreds of"
    const h = hundreds === 2 ? "hundreds of " : (hundreds === 1 ? "tens of " : "")
    const t = thousands > 0 ? `${inflection.pluralize(ORDER[thousands])} of ` : ""
    return `${h}${t}${p}`
}

// approximate("chicken", 0)
// approximate("chicken", 3)
// approximate("chicken", 10000)

// QUANTIFY
// quantify(word, amount)
// quantify([word1, word2, ...])
// quantify({word1: 0, word2: 0, ...})
function quantify(...args) {
    /*  Returns an approximation of the entire set.
        Identical words are grouped and counted and then quantified with an approximation.

        Accepted call shapes:
          quantify("carrot", 1000)        - a word and a numeric amount;
                                            returns approximate(word, amount).
          quantify(["goose", "duck"])     - a list of words; duplicates are counted.
          quantify({carrot: 100})         - a word => amount dictionary.
        Any other shape returns "".

        Items are listed highest amount first when any amount exceeds 1,
        separated by ", " with " and " before the last item, e.g.
        "several chickens, a pair of geese and a duck".

        If the list contains a non-string, a TypeError is logged with
        console.error and undefined is returned.
    */
    const first = args[0]

    if (args.length === 2 && typeof first === "string" && typeof args[1] === "number") {
        return approximate(first, args[1])
    }

    // [word, amount] pairs to describe.
    let entries = []

    if (Array.isArray(first)) {
        // Tally in a Map: a plain object would treat inherited keys such as
        // "constructor" or "toString" as already-counted words.
        const tally = new Map()
        for (const word of first) {
            if (typeof word !== "string") {
                console.error(new TypeError(`can't count ${word} (not a string)`))
                return undefined
            }
            tally.set(word, (tally.get(word) || 0) + 1)
        }
        entries = [...tally]
    } else if (typeof first === "object" && first !== null && args.length === 1) {
        // The argument is already a word => amount dictionary.
        entries = Object.entries(first)
    }

    // Highest amount first; when every amount is 1 (or less) the given order is kept.
    if (entries.some(([, n]) => n > 1)) {
        entries = [...entries].sort((a, b) => a[1] - b[1]).reverse()
    }

    // Approximate each item, then join: "a, b and c".
    // Array.prototype.join renders an undefined part (failed approximation) as "".
    const parts = entries.map(([word, n]) => approximate(word, n))
    if (parts.length < 2) {
        return parts.join("")
    }
    const head = parts.slice(0, -1).join(", ")
    return [head, parts[parts.length - 1]].join(" and ")
}

// quantify(["goose", "goose", "duck", "chicken", "chicken", "chicken"])
// quantify(["penguin", "polar bear"])
// quantify(["whale"])
spencermountain commented 6 years ago

thanks, yeah this is a cool idea. What, if you don't mind, are you using it for?

I would see this feature implemented something like this:

doc=nlp('27 seven thousand rats')
doc.values().toApprox()
doc.out()
// 'several thousand rats'

something like that?

ohenepee commented 6 years ago

I'm building a bot that can generate meaningful responses (Natural Language Generation). Your method could be an addition, but the previous methods I've presented would help in generating natural language from database values (multiple database values, to be precise).

I mean generating from a dictionary or key-value object... as well as an item and its amount. I also do love the toApproximate() however is that more descriptive than quantify() ?