loretoparisi / fasttext.js

FastText for Node.js
MIT License

I like Your code in https://github.com/huggingface/tokenizers/issues/1076 #36

Open DoctorSlimm opened 1 year ago

loretoparisi commented 1 year ago

Thanks for the reference! Have a look at hf-tokenizers-experiments, where I have put together the whole tokenizer pipeline for the SentencePiece BPE tokenizer.

DoctorSlimm commented 1 year ago

🤩🤩🤩

DoctorSlimm commented 1 year ago

@loretoparisi do you know why the node tokenizer returns zero padding tokens after the input ids are finished?

```
{ input_ids: [ 101, 7592, 2088, 999, 102, 0, 0, 0, … ] }
```

```js
let { promisify } = require("util");
let { Tokenizer } = require("tokenizers/bindings/tokenizer");

(async () => {
    // Load the tokenizer configuration exported with the model
    const tokenizer = Tokenizer.fromFile('./MiniLM-L6-v2/tokenizer.json');
    console.log(tokenizer);

    // The native bindings are callback-based: promisify encode/decode
    const encode = promisify(tokenizer.encode.bind(tokenizer));
    const decode = promisify(tokenizer.decode.bind(tokenizer));

    const encoded = await encode("Hello World!");

    // Inputs expected by a BERT-style model
    const modelInputs = {
        input_ids: encoded.getIds(),
        attention_mask: encoded.getAttentionMask(),
        token_type_ids: encoded.getTypeIds()
    };

    console.log(modelInputs);
})();
```
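The trailing zeros line up with zeros in the attention mask, so the real token ids can be recovered by filtering on it; a minimal sketch, reusing the `encoded` object from the snippet above:

```js
// Padded positions have attention mask 0; keep only the real tokens.
const mask = encoded.getAttentionMask();
const realIds = encoded.getIds().filter((_, i) => mask[i] === 1);
console.log(realIds); // [ 101, 7592, 2088, 999, 102 ]
```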
loretoparisi commented 1 year ago

> @loretoparisi do you know why the node tokenizer returns zero padding tokens after the input ids are finished? […]

Have a look at my examples here; you will find that there is an option to pad the input to the max length:

```js
var lpTokenizer = await LPSentencePieceBPETokenizer.fromOptions({
    padMaxLength: false,
    vocabFile: "../vocab/minilm/minilm-vocab.json",
    mergesFile: "../vocab/minilm/minilm-merges.txt"
});
```

This padding is necessary to feed the model inputs of the correct (fixed) size, typically the model's maximum input sequence length.
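For illustration, a minimal sketch of that idea, manually padding the `modelInputs` object from the earlier snippet; the `maxLength` of 128 and the pad id 0 (the `[PAD]` id in BERT-style vocabularies) are assumptions, so substitute your model's actual values:

```js
// Sketch: right-pad every field of modelInputs to a fixed maxLength.
// maxLength = 128 and padValue = 0 are assumed, not read from the model.
const maxLength = 128;
const pad = (arr, padValue = 0) =>
    arr.concat(Array(Math.max(0, maxLength - arr.length)).fill(padValue));

const paddedInputs = {
    input_ids: pad(modelInputs.input_ids),           // [PAD] id = 0 (assumed)
    attention_mask: pad(modelInputs.attention_mask), // 0 marks padded positions
    token_type_ids: pad(modelInputs.token_type_ids)
};
console.log(paddedInputs.input_ids.length); // 128, the fixed model input size
```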