nestauk / dap_medium_articles

The code behind Data Analytics at Nesta's Medium articles
MIT License
7 stars 9 forks source link

Spancat Notebook - IndexError: [E035] Error creating span with start 99199355 and end 1 for Doc of length 39. #15

Open guy4261 opened 1 year ago

guy4261 commented 1 year ago

Hi! First and foremost - thank you for these notebooks!!!

In your spancat notebook, I have followed your lead and used your data.csv, but sadly I am getting an error. This is with spacy==3.5.3, which is the latest version as of this writing (June 15, 2023).

In [8], which in your notebook is the following cell:

# Train the spancat component for a fixed number of epochs,
# recording the per-epoch spancat loss in all_losses.
all_losses = []
with nlp.disable_pipes(*unaffected_pipes):
    for epoch in tqdm(range(10)):
        # Reshuffle the training examples before each epoch.
        random.shuffle(train_data)
        losses = {}
        # Batch size grows from 4 toward 32 via compounding.
        for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
            nlp.update(list(batch), losses=losses, drop=0.1, sgd=sgd)
        print("epoch: {} Losses: {}".format(epoch, str(losses)))
        all_losses.append(losses['spancat'])

I am getting this error right when I start running:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[72], line 10
      8 batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
      9 for batch in batches:
---> 10     nlp.update(list(batch), losses=losses, drop=0.1, sgd=sgd)
     11 print("epoch: {} Losses: {}".format(iteration, str(losses)))
     12 all_losses.append(losses['spancat'])

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/language.py:1155, in Language.update(self, examples, _, drop, sgd, losses, component_cfg, exclude, annotates)
   1152 for name, proc in self.pipeline:
   1153     # ignore statements are used here because mypy ignores hasattr
   1154     if name not in exclude and hasattr(proc, "update"):
-> 1155         proc.update(examples, sgd=None, losses=losses, **component_cfg[name])  # type: ignore
   1156     if sgd not in (None, False):
   1157         if (
   1158             name not in exclude
   1159             and isinstance(proc, ty.TrainableComponent)
   1160             and proc.is_trainable
   1161             and proc.model not in (True, False, None)
   1162         ):

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/pipeline/spancat.py:547, in SpanCategorizer.update(self, examples, drop, sgd, losses)
    545 set_dropout_rate(self.model, drop)
    546 scores, backprop_scores = self.model.begin_update((docs, spans))
--> 547 loss, d_scores = self.get_loss(examples, (spans, scores))
    548 backprop_scores(d_scores)  # type: ignore
    549 if sgd is not None:

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/pipeline/spancat.py:585, in SpanCategorizer.get_loss(self, examples, spans_scores)
    583     end = int(spans_i[j, 1])  # type: ignore
    584     spans_index[(start, end)] = offset + j
--> 585 for gold_span in self._get_aligned_spans(eg):
    586     key = (gold_span.start, gold_span.end)
    587     if key in spans_index:

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/pipeline/spancat.py:659, in SpanCategorizer._get_aligned_spans(self, eg)
    658 def _get_aligned_spans(self, eg: Example):
--> 659     return eg.get_aligned_spans_y2x(
    660         eg.reference.spans.get(self.key, []), allow_overlap=True
    661     )

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/training/example.pyx:294, in spacy.training.example.Example.get_aligned_spans_y2x()

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/training/example.pyx:299, in spacy.training.example.Example._get_aligned_spans()

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/tokens/span_group.pyx:169, in __iter__()

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/tokens/span_group.pyx:106, in spacy.tokens.span_group.SpanGroup.__getitem__()

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/tokens/span.pxd:16, in spacy.tokens.span.Span.cinit()

File ~/miniconda3/envs/sections/lib/python3.8/site-packages/spacy/tokens/span.pyx:101, in spacy.tokens.span.Span.__cinit__()

IndexError: [E035] Error creating span with start 99199355 and end 1 for Doc of length 39.
guy4261 commented 1 year ago

Right now I found a way to overcome this, but I'm still not sure what will become of the final model :sweat_smile:

def try_repr(iteration, jiteration, obj):
    """Return True if repr(obj) succeeds, False if it raises.

    Used to probe for corrupted training examples (ones whose span offsets
    make repr() blow up, e.g. the E035 IndexError above). The iteration and
    jiteration arguments are unused here; they are kept so existing call
    sites that pass epoch/batch indices keep working.
    """
    try:
        repr(obj)
        return True
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently swallowed.
        return False

with nlp.disable_pipes(*unaffected_pipes):
    for iteration in tqdm(range(10)):
        ...
        batches = list(minibatch(train_data, size=compounding(4.0, 32.0, 1.001)))
        for batch in tqdm(batches):
            # <= filter out bad records: keep only examples whose repr() succeeds
            good = [ex for j, ex in enumerate(batch) if try_repr(iteration, j, ex)]
            nlp.update(good, losses=losses, drop=0.1, sgd=sgd)
guy4261 commented 1 year ago

Right now I found a way to overcome this, but I'm still not sure what will become of the final model :sweat_smile:

def try_repr(iteration, jiteration, obj):
    """Return True if repr(obj) succeeds, False if it raises.

    Probe for corrupted training examples — presumably the ones whose bad
    span offsets trigger the E035 IndexError; the iteration/jiteration
    arguments are unused in the body.
    """
    try:
        repr(obj)
        return True
    except:
        return False

with nlp.disable_pipes(*unaffected_pipes):
    for iteration in tqdm(range(10)):
        ...
        # Materialize the batches so tqdm can report progress over a known length.
        batches = list(minibatch(train_data, size=compounding(4.0, 32.0, 1.001)))
        for batch in tqdm(batches):
            lst = [_ for j, _ in enumerate(batch) if try_repr(iteration, j, _)]  # <= filter out bad records
            nlp.update(lst, losses=losses, drop=0.1, sgd=sgd)