explosion / sense2vec

🦆 Contextually-keyed word vectors
https://explosion.ai/blog/sense2vec-reloaded
MIT License
1.62k stars 240 forks source link

Hundreds of thousands of files when training new vectors #134

Open danielmoore19 opened 3 years ago

danielmoore19 commented 3 years ago

The shift from

@plac.annotations

to using typer seems to have caused the max_docs argument to be ignored. It is set to 10 ** 6, but I am getting a new file for each individual line when training vectors. Even when adding --max-docs 1000000 to my code, it still goes line by line instead of creating batches.

Here is the code from 01_parse.py:

def _save_doc_bin(doc_bin, output_file):
    """Serialize ``doc_bin`` and write the resulting bytes to ``output_file``."""
    with output_file.open("wb") as f:
        f.write(doc_bin.to_bytes())


def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"),
    n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10 ** 6, "--max-docs", "-m", help="Maximum docs per batch"),  # <-- this isn't working it seems
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).

    Docs are written in batches of at most max_docs docs each, to files named
    "<input stem>-<batch number>.spacy" in the output directory. Batch numbers
    start at 1.
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    attrs = ["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"]
    doc_bin = DocBin(attrs=attrs)
    msg.text("Preprocessing text...")
    count = 0  # number of docs currently held in doc_bin
    batch_num = 0  # number of batch files written so far
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            # Flush *before* adding, so the doc that arrives when the batch is
            # full goes into the next batch instead of being dropped. The
            # `count > 0` guard keeps a non-positive max_docs from emitting
            # empty batch files.
            if count >= max_docs and count > 0:
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                _save_doc_bin(doc_bin, output_file)
                msg.good("Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(attrs=attrs)
                count = 0
            doc_bin.add(doc)
            count += 1
    # Write whatever is left to its own, new file. The path is computed before
    # opening so an earlier batch file is never overwritten, and so this also
    # works when no batch was flushed in the loop.
    batch_num += 1
    output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
    _save_doc_bin(doc_bin, output_file)
    msg.good("Complete. Saved final parsed docs to file", output_file.resolve())

# Script entry point: only when executed directly (not when imported), hand
# `main` to typer so it is run as the command-line program.
if __name__ == "__main__":
    typer.run(main)