fix: changing scorers dict size issue when evaluating during training

Coverage Report

Name	Stmts	Miss	∆ Miss	Cover
edsnlp/training/trainer.py Was already missing at line 72 if result is None: - result = {} if isinstance(x, dict): New missing coverage at lines 170-177 ! for name, scorer in scorers.items(): - pred_docs = [d.copy() for d in tqdm(docs, desc="Copying docs")] - preds = list( - nlp.pipe(tqdm(pred_docs, desc="Predicting")).set_processing( - batch_size=self.batch_size - ) - ) - scores[name] = scorer(docs, preds)	225	4	7	98.22%
TOTAL	10453	214	7	97.95%

Name

Stmts

Miss

∆ Miss

Cover

edsnlp/training/trainer.py

     if result is None:
-         result = {}
     if isinstance(x, dict):

New missing coverage at lines 170-177 !

         for name, scorer in scorers.items():
-             pred_docs = [d.copy() for d in tqdm(docs, desc="Copying docs")]
-             preds = list(
-                 nlp.pipe(tqdm(pred_docs, desc="Predicting")).set_processing(
-                     batch_size=self.batch_size
-                 )
-             )
-             scores[name] = scorer(docs, preds)

225

98.22%

TOTAL

10453

214

97.95%

Files without new missing coverage

Name	Stmts	Miss	Cover
edsnlp/utils/torch.py Was already missing at line 102 def load_pruned_obj(obj, _): - return obj Was already missing at line 118 def save_align_devices_hook(pickler, obj): - pickler.save_reduce(load_align_devices_hook, (obj.__dict__,), obj=obj) Was already missing at lines 121-128 def load_align_devices_hook(state): - state["execution_device"] = MAP_LOCATION ... - AlignDevicesHook = None Was already missing at line 143 if torch.Tensor in copyreg.dispatch_table: - old_dispatch[torch.Tensor] = copyreg.dispatch_table[torch.Tensor] copyreg.pickle(torch.Tensor, reduce_empty)	83	9	89.16%
edsnlp/utils/span_getters.py Was already missing at lines 52-55 else: - for span in candidates: - if span.label_ in span_filter: - yield span Was already missing at lines 59-61 if span_getter is None: - yield doc[:], None - return if callable(span_getter): Was already missing at lines 62-64 if callable(span_getter): - yield from span_getter(doc) - return for key, span_filter in span_getter.items(): Was already missing at line 66 if key == "*": - candidates = ( (span, group) for group in doc.spans.values() for span in group Was already missing at lines 75-78 else: - for span, group in candidates: - if span.label_ in span_filter: - yield span, group Was already missing at line 82 if callable(span_setter): - span_setter(doc, matches) else: Was already missing at line 124 elif isinstance(v, str): - new_value[k] = [v] elif isinstance(v, list) and all(isinstance(i, str) for i in v): Was already missing at line 162 elif isinstance(v, str): - new_value[k] = [v] elif isinstance(v, list) and all(isinstance(i, str) for i in v):	149	14	90.60%
edsnlp/utils/resources.py Was already missing at line 33 if not verbs: - return conjugated_verbs	24	1	95.83%
edsnlp/utils/numbers.py Was already missing at line 34 else: - string = s string = string.lower().strip() Was already missing at lines 38-41 return int(string) - except ValueError: - parsed = DIGITS_MAPPINGS.get(string, None) - return parsed	16	4	75.00%
edsnlp/utils/lazy_module.py Was already missing at line 46 ): - continue for import_node in node.body:	31	1	96.77%
edsnlp/utils/filter.py Was already missing at line 206 if isinstance(label, int): - return [span for span in spans if span.label == label] else:	74	1	98.65%
edsnlp/utils/bindings.py Was already missing at line 23 return "." + path - return path	65	1	98.46%
edsnlp/utils/batching.py Was already missing at line 288 else: # drop - continue batch.append(item) Was already missing at line 347 else: # drop - continue batch.append(item)	187	2	98.93%
edsnlp/processing/spark.py Was already missing at line 50 getActiveSession = SparkSession.getActiveSession - except AttributeError:	47	1	97.87%
edsnlp/processing/multiprocessing.py Was already missing at lines 386-391 self.on_stop() - except BaseException as e: ... - self.main_control_queue.put(e) finally: Was already missing at lines 395-397 pass - except StopSignal: - pass for name, queue in self.consumer_queues(stage): Was already missing at line 532 while schedule[task_idx] is None: - task_idx = (task_idx + 1) % len(schedule) Was already missing at lines 596-598 if isinstance(docs, StreamSentinel): - self.active_batches[stage].append([None, None, None, docs]) - continue batch_id = str(hash(tuple(id(x) for x in docs)))[-8:] + "-" + self.uid Was already missing at line 754 if self.stop and not stop_mode: - raise StopSignal() Was already missing at lines 1112-1119 if out[0].kind == requires_sentinel: - missing_sentinels -= 1 ... - continue if requires_sentinel:	629	16	97.46%
edsnlp/processing/deprecated_pipe.py Was already missing at lines 207-209 def converter(doc): - res = results_extractor(doc) - return ( [{"note_id": doc._.note_id, **row} for row in res]	57	2	96.49%
edsnlp/pipes/trainable/span_linker/span_linker.py Was already missing at lines 402-404 if self.reference_mode == "synonym": - embeds = embeds.to(new_lin.weight) - new_lin.weight.data = embeds else:	173	2	98.84%
edsnlp/pipes/trainable/span_classifier/span_classifier.py Was already missing at line 345 if not all(keep_bindings): - logger.warning( "Some attributes have no labels or values and have been removed:"	159	1	99.37%
edsnlp/pipes/trainable/ner_crf/ner_crf.py Was already missing at line 254 if self.labels is not None and not self.infer_span_setter: - return Was already missing at lines 262-264 if callable(self.target_span_getter): - for span in get_spans(doc, self.target_span_getter): - inferred_labels.add(span.label_) else:	160	3	98.12%
edsnlp/pipes/trainable/layers/crf.py Was already missing at line 21 # out: 2 * N * O - return (log_A.unsqueeze(-1) + log_B.unsqueeze(-3)).logsumexp(-2) Was already missing at line 29 # out: 2 * N * O - return (log_A.unsqueeze(-1) + log_B.unsqueeze(-3)).max(-2) Was already missing at line 98 if learnable_transitions: - self.transitions = torch.nn.Parameter( torch.zeros_like(forbidden_transitions, dtype=torch.float) Was already missing at line 108 if learnable_transitions and with_start_end_transitions: - self.start_transitions = torch.nn.Parameter( torch.zeros(num_tags, dtype=torch.float) Was already missing at line 117 if learnable_transitions and with_start_end_transitions: - self.end_transitions = torch.nn.Parameter( torch.zeros(num_tags, dtype=torch.float)	137	5	96.35%
edsnlp/pipes/trainable/embeddings/transformer/transformer.py Was already missing at line 165 if quantization is not None: - kwargs["quantization_config"] = quantization Was already missing at line 185 if self.cls_token_id is None: - [self.cls_token_id] = self.tokenizer.convert_tokens_to_ids( [self.tokenizer.special_tokens_map["bos_token"]] Was already missing at line 189 if self.sep_token_id is None: - [self.sep_token_id] = self.tokenizer.convert_tokens_to_ids( [self.tokenizer.special_tokens_map["eos_token"]]	166	3	98.19%
edsnlp/pipes/qualifiers/reported_speech/reported_speech.py Was already missing at lines 24-28 return "REPORTED" - elif token._.rspeech is False: - return "DIRECT" - else: - return None	99	3	96.97%
edsnlp/pipes/qualifiers/negation/negation.py Was already missing at line 28 else: - return None	99	1	98.99%
edsnlp/pipes/qualifiers/hypothesis/hypothesis.py Was already missing at line 27 else: - return None	96	1	98.96%
edsnlp/pipes/qualifiers/history/history.py Was already missing at lines 26-32 def history_getter(token: Union[Token, Span]) -> Optional[str]: - if token._.history is True: - return "ATCD" - elif token._.history is False: - return "CURRENT" - else: - return None Was already missing at lines 337-343 ) - except ValueError: ... - note_datetime = None Was already missing at lines 352-358 ) - except ValueError: ... - birth_datetime = None Was already missing at lines 424-427 ) - except ValueError as e: - absolute_date = None - logger.warning( "In doc {}, the following date {} raises this error: {}. "	177	14	92.09%
edsnlp/pipes/qualifiers/family/family.py Was already missing at line 27 else: - return None	81	1	98.77%
edsnlp/pipes/qualifiers/base.py Was already missing at line 178 def __call__(self, doc: Doc) -> Doc: - results = self.process(doc) raise NotImplementedError(f"{type(results)} should be used to tag the document")	50	1	98.00%
edsnlp/pipes/ner/tnm/model.py Was already missing at line 147 def __str__(self): - return self.norm() Was already missing at line 171 ) - exclude_unset = skip_defaults	112	2	98.21%
edsnlp/pipes/ner/scores/sofa/sofa.py Was already missing at line 32 if not assigned: - continue if assigned.get("method_max") is not None: Was already missing at line 40 else: - method = "Non précisée"	25	2	92.00%
edsnlp/pipes/ner/scores/elston_ellis/patterns.py Was already missing at line 26 if x <= 5: - return 1 Was already missing at lines 32-36 else: - return 3 - - except ValueError: - return None	21	4	80.95%
edsnlp/pipes/ner/scores/charlson/patterns.py Was already missing at lines 21-23 return int(extracted_score) - except ValueError: - return None	13	2	84.62%
edsnlp/pipes/ner/scores/base_score.py Was already missing at line 154 if value is None: - continue normalized_value = self.score_normalization(value)	47	1	97.87%
edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py Was already missing at lines 130-136 for span in spans: - span.label_ = "solid_tumor" ... - yield span	37	6	83.78%
edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py Was already missing at line 107 if "peripheral" not in span._.assigned.keys(): - continue	15	1	93.33%
edsnlp/pipes/ner/disorders/diabetes/diabetes.py Was already missing at line 131 # Mostly FP - continue Was already missing at line 134 elif self.has_far_complications(span): - span._.status = 2 Was already missing at line 146 if next(iter(self.complication_matcher(context)), None) is not None: - return True return False	31	3	90.32%
edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py Was already missing at line 103 # Huge change of FP / Title section - continue	14	1	92.86%
edsnlp/pipes/ner/disorders/ckd/ckd.py Was already missing at lines 120-123 dfg_value = float(dfg_span.text.replace(",", ".").strip()) - except ValueError: - logger.trace(f"DFG value couldn't be extracted from {dfg_span.text}") - return False	29	3	89.66%
edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py Was already missing at lines 111-113 if span._.source == "ischemia": - if "brain" not in span._.assigned.keys(): - continue	17	2	88.24%
edsnlp/pipes/ner/adicap/models.py Was already missing at line 15 def norm(self) -> str: - return self.code Was already missing at line 18 def __str__(self): - return self.norm()	16	2	87.50%
edsnlp/pipes/misc/split/split.py Was already missing at lines 175-177 if max_length <= 0 and self.regex is None: - yield doc - return	70	2	97.14%
edsnlp/pipes/misc/sections/sections.py Was already missing at line 126 if sections is None: - sections = patterns.sections sections = dict(sections)	45	1	97.78%
edsnlp/pipes/misc/quantities/quantities.py Was already missing at lines 147-149 def __getitem__(self, item: int): - assert isinstance(item, int) - return [self][item] Was already missing at lines 160-163 def __eq__(self, other: Any): - if isinstance(other, SimpleQuantity): - return self.convert_to(other.unit) == other.value - return False Was already missing at line 166 if other.unit == self.unit: - return self.__class__(self.value + other.value, self.unit, self.registry) return self.__class__( Was already missing at line 193 return self.convert_to(other_unit) - except KeyError: raise AttributeError(f"Unit {other_unit} not found") Was already missing at line 198 def verify(cls, ent): - return True Was already missing at line 237 def __lt__(self, other: Union[SimpleQuantity, "RangeQuantity"]): - return max(self.convert_to(other.unit)) < min((part.value for part in other)) Was already missing at line 248 return self.convert_to(other.unit) == other.value - return False Was already missing at line 262 def verify(cls, ent): - return True Was already missing at line 861 if snippet.end != last and doclike.doc[last: snippet.end].text.strip() == "": - pseudo.append("w") pseudo = "".join(pseudo) Was already missing at line 1042 if start_line is None: - continue Was already missing at lines 1073-1075 unit_norm = self.unit_followers[unit_before.label_] - except (KeyError, AttributeError, IndexError): - pass Was already missing at line 1118 ): - ent = doc[unit_text.start: number.end] else: Was already missing at lines 1125-1127 dims = self.unit_registry.parse_unit(unit_norm)[0] - except KeyError: - continue Was already missing at lines 1233-1235 last._.set(last.label_, new_value) - except (AttributeError, TypeError): - merged.append(ent) else:	439	20	95.44%
edsnlp/pipes/misc/dates/models.py Was already missing at line 156 else: - d["month"] = note_datetime.month if self.day is None: Was already missing at lines 160-166 else: - if self.year is None: ... - d["day"] = default_day Was already missing at lines 174-176 return dt - except ValueError: - return None Was already missing at line 192 else: - return None Was already missing at line 208 if self.second: - norm += f"{self.second:02}s"	199	11	94.47%
edsnlp/pipes/misc/dates/dates.py Was already missing at line 249 if isinstance(absolute, str): - absolute = [absolute] if isinstance(relative, str): Was already missing at line 251 if isinstance(relative, str): - relative = [relative] if isinstance(duration, str): Was already missing at line 253 if isinstance(duration, str): - relative = [duration] if isinstance(false_positive, str): Was already missing at lines 357-366 if self.merge_mode == "align": - alignments = align_spans(matches, spans, sort_by_overlap=True) ... - matches.append(span) Was already missing at line 451 elif d1 in seen or v1.bound is None or v2.bound is None: - continue Was already missing at lines 462-464 if v1.mode == Mode.DURATION: - m1 = Bound.FROM if v2.bound == Bound.UNTIL else Bound.UNTIL - m2 = v2.mode or Bound.FROM elif v2.mode == Mode.DURATION:	153	15	90.20%
edsnlp/pipes/misc/consultation_dates/consultation_dates.py Was already missing at line 131 else: - self.date_matcher = None Was already missing at line 134 if not consultation_mention: - consultation_mention = [] elif consultation_mention is True:	48	2	95.83%
edsnlp/pipes/core/normalizer/__init__.py Was already missing at line 7 def excluded_or_space_getter(t): - return t.is_space or t.tag_ == "EXCLUDED"	5	1	80.00%
edsnlp/pipes/core/endlines/endlines.py Was already missing at lines 156-160 if end_lines_model is None: - path = build_path(__file__, "base_model.pkl") - - with open(path, "rb") as inp: - self.model = pickle.load(inp) elif isinstance(end_lines_model, str): Was already missing at lines 163-165 self.model = pickle.load(inp) - elif isinstance(end_lines_model, EndLinesModel): - self.model = end_lines_model else: Was already missing at line 196 ): - return "ENUMERATION" Was already missing at line 283 if np.isnan(sigma): - sigma = 1	87	7	91.95%
edsnlp/pipes/core/contextual_matcher/models.py Was already missing at lines 28-32 if isinstance(v, list): - assert ( - len(v) == 2 - ), "`window` should be a tuple/list of two integer, or a single integer" - v = tuple(v) if isinstance(v, int):	138	2	98.55%
edsnlp/pipes/core/contextual_matcher/contextual_matcher.py Was already missing at line 94 ) - label = label_name if label is None: Was already missing at line 343 if assigned is None: - continue if replace_entity:	143	2	98.60%
edsnlp/patch_spacy.py Was already missing at lines 67-69 # if module is reloaded. - existing_func = registry.factories.get(internal_name) - if not util.is_same_func(factory_func, existing_func): raise ValueError(	31	2	93.55%
edsnlp/package.py Was already missing at lines 475-477 version = version or pyproject["project"]["version"] - except (KeyError, TypeError): - version = "0.1.0" name = name or pyproject["project"]["name"] Was already missing at line 481 else: - main_package = None model_package = snake_case(name.lower())	207	3	98.55%
edsnlp/metrics/span_attributes.py Was already missing at lines 56-58 ) - assert attributes is None - attributes = kwargs.pop("qualifiers") if attributes is None:	71	2	97.18%
edsnlp/matchers/simstring.py Was already missing at line 280 if custom: - attr = attr[1:].lower() Was already missing at line 295 if custom: - token_text = getattr(token._, attr) else:	146	2	98.63%
edsnlp/language.py Was already missing at line 103 if last != begin: - logger.warning( "Missed some characters during"	51	1	98.04%
edsnlp/data/standoff.py Was already missing at line 38 def __init__(self, ann_file, line): - super().__init__(f"File {ann_file}, unrecognized Brat line {line}") Was already missing at line 192 ) - except Exception: raise Exception(	185	2	98.92%
edsnlp/data/polars.py Was already missing at line 35 if hasattr(data, "collect"): - data = data.collect() assert isinstance(data, pl.DataFrame)	54	1	98.15%
edsnlp/data/json.py Was already missing at line 81 return records - except Exception as e: raise Exception(f"Cannot read {file}: {e}")	112	1	99.11%
edsnlp/data/converters.py Was already missing at line 140 if "tokenizer" in CONTEXT[0]: - return CONTEXT[0]["tokenizer"] if _DEFAULT_TOKENIZER is None: Was already missing at line 668 if isinstance(converter, type): - return converter(**kwargs), {} return converter, validate_kwargs(converter, kwargs)	203	2	99.01%
edsnlp/core/torch_component.py Was already missing at line 392 if hasattr(self, "compiled"): - res = self.compiled(batch) else: Was already missing at line 438 """ - return self.preprocess(doc) Was already missing at line 463 if object.__repr__(self) in exclude: - return exclude.add(object.__repr__(self))	187	3	98.40%
edsnlp/core/stream.py Was already missing at lines 190-192 if isinstance(batch, StreamSentinel): - yield batch - continue results = [] Was already missing at lines 993-995 elif op.batch_fn is None: - batch_size = op.size - batch_fn = batchify else:	353	4	98.87%
edsnlp/core/pipeline.py Was already missing at line 605 if name in exclude: - continue if name not in components: Was already missing at lines 716-719 """ - res = Stream.ensure_stream(docs) - res = res.map(functools.partial(self.preprocess, supervision=supervision)) - return res	442	4	99.10%
edsnlp/connectors/omop.py Was already missing at line 69 if not isinstance(row.ents, list): - continue Was already missing at line 87 else: - doc.spans[span.label_].append(span) Was already missing at line 127 if df.note_id.isna().any(): - df["note_id"] = range(len(df)) Was already missing at line 171 if i > 0: - df.term_modifiers += ";" df.term_modifiers += ext + "=" + df[ext].astype(str)	84	4	95.24%

264 files skipped due to complete coverage.

Coverage failure: total of 97.95% is less than 97.98% ❌

aphp / edsnlp

fix: changing scorers dict size issue when evaluating during training #347

Description

Checklist

Quality Gate passed

Coverage Report