explosion / spaCy

💫 Industrial-strength Natural Language Processing (NLP) in Python
https://spacy.io
MIT License

Memory error: Unable to allocate 18.4 MiB for an array with shape (50178, 96) and data type int32 #10623

Closed: adityasharma1076e closed this issue 2 years ago

adityasharma1076e commented 2 years ago

How to reproduce the behaviour

My objective is to train a document classification model, but I am running into memory issues. The data consists of 78,200 training samples and 26,067 dev samples with 13 unique labels in total. Each sample contains about 2,000 tokens on average. After preprocessing, the samples are converted to spaCy DocBin format for training.

The model fails to train even on a 64 GB SageMaker instance, after already failing on lower-memory systems at 100% memory usage, and even after lowering the batch size to nlp.batch_size = 4 and batcher.size = 1.
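
For context, the DocBin conversion step mentioned above typically looks something like the sketch below (hypothetical variable and file names; texts_and_labels stands in for the preprocessed samples, which are not part of this report):

import spacy
from spacy.tokens import DocBin

# Hypothetical preprocessed data: list of (text, label) pairs.
texts_and_labels = [("some document text ...", "label_3")]
all_labels = [f"label_{i}" for i in range(13)]

nlp = spacy.blank("en")
doc_bin = DocBin()
for text, label in texts_and_labels:
    doc = nlp.make_doc(text)
    # Exclusive-class textcat expects exactly one category scored 1.0 per doc.
    doc.cats = {l: float(l == label) for l in all_labels}
    doc_bin.add(doc)
doc_bin.to_disk("train.spacy")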

Training:

python -m spacy train configs.cfg --paths.train train.spacy --paths.dev dev.spacy 
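
Depending on the spaCy version, the same run can also be launched from Python via the helper that the traceback below ultimately calls into (a sketch; the overrides mirror the --paths.* CLI flags, and the output directory name is hypothetical):

from spacy.cli.train import train

# Train with the same config, overriding the dataset paths.
train(
    "configs.cfg",
    output_path="output",  # hypothetical output directory
    overrides={
        "paths.train": "train.spacy",
        "paths.dev": "dev.spacy",
    },
)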

Training Config

[paths]
train = null
dev = null
vectors = "en_core_web_lg"
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","textcat"]
batch_size = 4
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.textcat]
factory = "textcat"
scorer = {"@scorers":"spacy.textcat_scorer.v1"}
threshold = 0.5

[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v2"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
nO = null

[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["ORTH","SHAPE"]
rows = [5000,2500]
include_static_vectors = true

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
dropout = 0.2
accumulate_gradient = 1
patience = 0
max_epochs = 10
max_steps = 0
eval_frequency = 20000
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
frozen_components = []
annotating_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_sequence.v1"
size = 1
get_length = null

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
cats_score = 1.0
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
cats_macro_auc_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
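
The per-document token counts matter here, since each sample averages roughly 2,000 tokens. A quick way to inspect the serialized corpora before training is to load the DocBin and count tokens (a minimal sketch, assuming dev.spacy is the file passed via --paths.dev):

import spacy
from spacy.tokens import DocBin

# Load the serialized dev set and report document and token counts.
nlp = spacy.blank("en")
doc_bin = DocBin().from_disk("dev.spacy")
lengths = [len(doc) for doc in doc_bin.get_docs(nlp.vocab)]
print(f"{len(lengths)} docs, {sum(lengths)} tokens total, "
      f"max {max(lengths)} tokens per doc")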

Error Logs

MemoryError((50178, 96), dtype('int32'))

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/spacy/training/loop.py", line 122, in train
    raise e
  File "/usr/local/lib/python3.6/dist-packages/spacy/training/loop.py", line 105, in train
    for batch, info, is_best_checkpoint in training_step_iterator:
  File "/usr/local/lib/python3.6/dist-packages/spacy/training/loop.py", line 226, in train_while_improving
    score, other_scores = evaluate()
  File "/usr/local/lib/python3.6/dist-packages/spacy/training/loop.py", line 281, in evaluate
    scores = nlp.evaluate(dev_corpus(nlp))
  File "/usr/local/lib/python3.6/dist-packages/spacy/language.py", line 1415, in evaluate
    for eg, doc in zip(examples, docs):
  File "/usr/local/lib/python3.6/dist-packages/spacy/language.py", line 1575, in pipe
    for doc in docs:
  File "/usr/local/lib/python3.6/dist-packages/spacy/util.py", line 1598, in _pipe
    yield from proc.pipe(docs, **kwargs)
  File "spacy/pipeline/trainable_pipe.pyx", line 73, in pipe
  File "/usr/local/lib/python3.6/dist-packages/spacy/util.py", line 1547, in minibatch
    batch = list(itertools.islice(items, int(batch_size)))
  File "/usr/local/lib/python3.6/dist-packages/spacy/util.py", line 1598, in _pipe
    yield from proc.pipe(docs, **kwargs)
  File "spacy/pipeline/trainable_pipe.pyx", line 79, in pipe
  File "/usr/local/lib/python3.6/dist-packages/spacy/util.py", line 1617, in raise_error
    raise e
  File "spacy/pipeline/trainable_pipe.pyx", line 75, in spacy.pipeline.trainable_pipe.TrainablePipe.pipe
  File "/usr/local/lib/python3.6/dist-packages/spacy/pipeline/tok2vec.py", line 125, in predict
    tokvecs = self.model.predict(docs)
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 315, in predict
    return self._func(self, X, is_train=False)[0]
  File "/usr/local/lib/python3.6/dist-packages/thinc/layers/chain.py", line 54, in forward
    Y, inc_layer_grad = layer(X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 291, in __call__
    return self._func(self, X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/layers/with_array.py", line 40, in forward
    return _list_forward(cast(Model[List2d, List2d], model), Xseq, is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/layers/with_array.py", line 76, in _list_forward
    Yf, get_dXf = layer(Xf, is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 291, in __call__
    return self._func(self, X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/layers/chain.py", line 54, in forward
    Y, inc_layer_grad = layer(X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 291, in __call__
    return self._func(self, X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/layers/residual.py", line 40, in forward
    Y, backprop_layer = model.layers[0](X, is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 291, in __call__
    return self._func(self, X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/layers/chain.py", line 54, in forward
    Y, inc_layer_grad = layer(X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 291, in __call__
    return self._func(self, X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/layers/chain.py", line 54, in forward
    Y, inc_layer_grad = layer(X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 291, in __call__
    return self._func(self, X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/layers/chain.py", line 54, in forward
    Y, inc_layer_grad = layer(X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 291, in __call__
    return self._func(self, X, is_train=is_train)
  File "/usr/local/lib/python3.6/dist-packages/thinc/layers/maxout.py", line 52, in forward
    best, which = model.ops.maxout(Z)
  File "thinc/backends/numpy_ops.pyx", line 151, in thinc.backends.numpy_ops.NumpyOps.maxout

numpy.core._exceptions.MemoryError: Unable to allocate 18.4 MiB for an array with shape (50178, 96) and data type int32

During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/dist-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/usr/local/lib/python3.6/dist-packages/spacy/cli/_util.py", line 71, in setup_cli
    command(prog_name=COMMAND)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1128, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1053, in main
    rv = self.invoke(ctx)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1659, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1395, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 754, in invoke
    return __callback(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/typer/main.py", line 500, in wrapper
    return callback(**use_params)  # type: ignore
  File "/usr/local/lib/python3.6/dist-packages/spacy/cli/train.py", line 45, in train_cli
    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
  File "/usr/local/lib/python3.6/dist-packages/spacy/cli/train.py", line 75, in train
    train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
  File "/usr/local/lib/python3.6/dist-packages/spacy/training/loop.py", line 126, in train
    save_checkpoint(False)
  File "/usr/local/lib/python3.6/dist-packages/spacy/training/loop.py", line 67, in save_checkpoint
    before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
  File "/usr/local/lib/python3.6/dist-packages/spacy/language.py", line 1988, in to_disk
    util.to_disk(path, serializers, exclude)
  File "/usr/local/lib/python3.6/dist-packages/spacy/util.py", line 1286, in to_disk
    writer(path / key)
  File "/usr/local/lib/python3.6/dist-packages/spacy/language.py", line 1986, in <lambda>
    serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"])  # type: ignore[misc]
  File "spacy/pipeline/trainable_pipe.pyx", line 318, in spacy.pipeline.trainable_pipe.TrainablePipe.to_disk
  File "/usr/local/lib/python3.6/dist-packages/spacy/util.py", line 1286, in to_disk
    writer(path / key)
  File "spacy/pipeline/trainable_pipe.pyx", line 317, in spacy.pipeline.trainable_pipe.TrainablePipe.to_disk.lambda7
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 525, in to_disk
    file_.write(self.to_bytes())
  File "/usr/local/lib/python3.6/dist-packages/thinc/model.py", line 517, in to_bytes
    return srsly.msgpack_dumps(msg)
  File "/usr/local/lib/python3.6/dist-packages/srsly/_msgpack_api.py", line 14, in msgpack_dumps
    return msgpack.dumps(data, use_bin_type=True)
  File "/usr/local/lib/python3.6/dist-packages/srsly/msgpack/__init__.py", line 55, in packb
    return Packer(**kwargs).pack(o)
  File "srsly/msgpack/_packer.pyx", line 285, in srsly.msgpack._packer.Packer.pack
  File "srsly/msgpack/_packer.pyx", line 291, in srsly.msgpack._packer.Packer.pack
  File "srsly/msgpack/_packer.pyx", line 288, in srsly.msgpack._packer.Packer.pack
  File "srsly/msgpack/_packer.pyx", line 235, in srsly.msgpack._packer.Packer._pack
  File "srsly/msgpack/_packer.pyx", line 264, in srsly.msgpack._packer.Packer._pack
  File "srsly/msgpack/_packer.pyx", line 235, in srsly.msgpack._packer.Packer._pack
  File "srsly/msgpack/_packer.pyx", line 235, in srsly.msgpack._packer.Packer._pack

Your Environment

polm commented 2 years ago

Since this seems more like a question than a bug report, I'm moving it to Discussions.