aphp / edsnlp

Modular, fast NLP framework, compatible with Pytorch and spaCy, offering tailored support for French clinical notes.
https://aphp.github.io/edsnlp/
BSD 3-Clause "New" or "Revised" License
116 stars 30 forks source link

Fs encoding fix #320

Closed Aremaki closed 2 months ago

Aremaki commented 2 months ago

Fix FS encoding

Description

Force the fsspec open method to use "UTF-8" encoding.

Checklist

sonarcloud[bot] commented 2 months ago

Quality Gate passed

Issues
0 New issues
0 Accepted issues

Measures
0 Security Hotspots
0.0% Coverage on New Code
0.0% Duplication on New Code

See analysis details on SonarCloud

github-actions[bot] commented 2 months ago

Coverage Report

Name | Stmts | Miss | ∆ Miss | Cover
TOTAL | 9316 | 209 | 0 | 97.76%
Files without new missing coverage
Name | Stmts | Miss | ∆ Miss | Cover
edsnlp/utils/span_getters.py

Was already missing at lines 52-55

         else:
-             for span in candidates:
-                 if span.label_ in span_filter:
-                     yield span
Was already missing at lines 59-61
     if span_getter is None:
-         yield doc[:], None
-         return
     if callable(span_getter):
Was already missing at lines 62-64
     if callable(span_getter):
-         yield from span_getter(doc)
-         return
     for key, span_filter in span_getter.items():
Was already missing at line 66
         if key == "*":
-             candidates = (
                 (span, group) for group in doc.spans.values() for span in group
Was already missing at lines 75-78
         else:
-             for span, group in candidates:
-                 if span.label_ in span_filter:
-                     yield span, group
Was already missing at line 82
     if callable(span_setter):
-         span_setter(doc, matches)
     else:
Was already missing at line 124
             elif isinstance(v, str):
-                 new_value[k] = [v]
             elif isinstance(v, list) and all(isinstance(i, str) for i in v):
Was already missing at line 162
             elif isinstance(v, str):
-                 new_value[k] = [v]
             elif isinstance(v, list) and all(isinstance(i, str) for i in v):

15314090.85%
edsnlp/utils/resources.py

Was already missing at line 33

     if not verbs:
-         return conjugated_verbs

241095.83%
edsnlp/utils/numbers.py

Was already missing at line 34

     else:
-         string = s
     string = string.lower().strip()
Was already missing at lines 38-41
         return int(string)
-     except ValueError:
-         parsed = DIGITS_MAPPINGS.get(string, None)
-         return parsed

164075.00%
edsnlp/utils/lazy_module.py

Was already missing at line 46

             ):
-                 continue
             for import_node in node.body:

311096.77%
edsnlp/utils/filter.py

Was already missing at line 206

     if isinstance(label, int):
-         return [span for span in spans if span.label == label]
     else:

741098.65%
edsnlp/utils/bindings.py

Was already missing at line 22

         return "." + path
-     return path

661098.48%
edsnlp/train.py

Was already missing at line 190

         else:
-             sample_len = lambda idx, noise=True: 1  # noqa: E731
Was already missing at lines 257-263
             if total + num_tokens > self.grad_accumulation_max_tokens:
-                 print(
  ...
-                 mini_batches.append([])
             total += num_tokens
Was already missing at line 349
             if 0 <= self.limit <= count:
-                 break
             if not (len(doc) and (filter_fn is None or filter_fn(doc))):
Was already missing at line 351
             if not (len(doc) and (filter_fn is None or filter_fn(doc))):
-                 continue
             count += 1
Was already missing at lines 385-387
             for ent in doc.ents:
-                 for token in ent:
-                     token.is_sent_start = False
             for sent in doc.sents if doc.has_annotation("SENT_START") else (doc[:],):

2578096.89%
edsnlp/processing/spark.py

Was already missing at line 51

         getActiveSession = SparkSession.getActiveSession
-     except AttributeError:

431097.67%
edsnlp/processing/multiprocessing.py

Was already missing at lines 227-231

 if os.environ.get("TORCH_SHARING_STRATEGY"):
-     try:
-         torch.multiprocessing.set_sharing_strategy(os.environ["TORCH_SHARING_STRATEGY"])
-     except NameError:
-         pass
Was already missing at line 249
         def save_align_devices_hook(pickler: Any, obj: Any):
-             pickler.save_reduce(load_align_devices_hook, (obj.__dict__,), obj=obj)
Was already missing at lines 252-259
         def load_align_devices_hook(state):
-             state["execution_device"] = MAP_LOCATION
  ...
-         AlignDevicesHook = None
Was already missing at line 452

-             new_batch_iterator = None
Was already missing at lines 570-572
                     else:
-                         batch = gpu_pipe.prepare_batch(docs, device=device)
-                         inputs = None
                     active_batches[batch_id] = (docs, task_id, inputs)
Was already missing at line 949
         if isinstance(outputs, BaseException):
-             raise outputs
Was already missing at line 1017
                 if v is not None:
-                     os.environ[k] = v

42016096.19%
edsnlp/processing/deprecated_pipe.py

Was already missing at lines 207-209

         def converter(doc):
-             res = results_extractor(doc)
-             return (
                 [{"note_id": doc._.note_id, **row} for row in res]

572096.49%
edsnlp/pipes/trainable/span_linker/span_linker.py

Was already missing at lines 401-403

             if self.reference_mode == "synonym":
-                 embeds = embeds.to(new_lin.weight)
-                 new_lin.weight.data = embeds
             else:

1722098.84%
edsnlp/pipes/trainable/ner_crf/ner_crf.py

Was already missing at line 250

         if self.labels is not None and not self.infer_span_setter:
-             return
Was already missing at lines 258-260
             if callable(self.target_span_getter):
-                 for span in get_spans(doc, self.target_span_getter):
-                     inferred_labels.add(span.label_)
             else:

1573098.09%
edsnlp/pipes/trainable/layers/crf.py

Was already missing at line 21

     # out: 2 * N * O
-     return (log_A.unsqueeze(-1) + log_B.unsqueeze(-3)).logsumexp(-2)
Was already missing at line 29
     # out: 2 * N * O
-     return (log_A.unsqueeze(-1) + log_B.unsqueeze(-3)).max(-2)
Was already missing at line 97
         if learnable_transitions:
-             self.transitions = torch.nn.Parameter(
                 torch.zeros_like(forbidden_transitions, dtype=torch.float)
Was already missing at line 107
         if learnable_transitions and with_start_end_transitions:
-             self.start_transitions = torch.nn.Parameter(
                 torch.zeros(num_tags, dtype=torch.float)
Was already missing at line 116
         if learnable_transitions and with_start_end_transitions:
-             self.end_transitions = torch.nn.Parameter(
                 torch.zeros(num_tags, dtype=torch.float)

1375096.35%
edsnlp/pipes/trainable/embeddings/transformer/transformer.py

Was already missing at line 165

         if quantization is not None:
-             kwargs["quantization_config"] = quantization

1571099.36%
edsnlp/pipes/qualifiers/reported_speech/reported_speech.py

Was already missing at lines 18-22

         return "REPORTED"
-     elif token._.rspeech is False:
-         return "DIRECT"
-     else:
-         return None

733095.89%
edsnlp/pipes/qualifiers/negation/negation.py

Was already missing at line 22

     else:
-         return None

771098.70%
edsnlp/pipes/qualifiers/hypothesis/hypothesis.py

Was already missing at line 21

     else:
-         return None

741098.65%
edsnlp/pipes/qualifiers/history/history.py

Was already missing at lines 20-26

 def history_getter(token: Union[Token, Span]) -> Optional[str]:
-     if token._.history is True:
-         return "ATCD"
-     elif token._.history is False:
-         return "CURRENT"
-     else:
-         return None
Was already missing at lines 310-316
                 )
-             except ValueError:
  ...
-                 note_datetime = None
Was already missing at lines 325-331
                 )
-             except ValueError:
  ...
-                 birth_datetime = None
Was already missing at lines 397-400
                         )
-                     except ValueError as e:
-                         absolute_date = None
-                         logger.warning(
                             "In doc {}, the following date {} raises this error: {}. "

15414090.91%
edsnlp/pipes/qualifiers/family/family.py

Was already missing at line 21

     else:
-         return None

591098.31%
edsnlp/pipes/ner/tnm/tnm.py

Was already missing at lines 156-158

                 value = TNM.parse_obj(groupdict)
-             except ValidationError:
-                 value = TNM.parse_obj({})

442095.45%
edsnlp/pipes/ner/tnm/model.py

Was already missing at line 139

     def __str__(self):
-         return self.norm()
Was already missing at line 163
             )
-             exclude_unset = skip_defaults

1042098.08%
edsnlp/pipes/ner/scores/sofa/sofa.py

Was already missing at line 32

             if not assigned:
-                 continue
             if assigned.get("method_max") is not None:
Was already missing at line 40
             else:
-                 method = "Non précisée"

252092.00%
edsnlp/pipes/ner/scores/elston_ellis/patterns.py

Was already missing at line 26

         if x <= 5:
-             return 1
Was already missing at lines 32-36
         else:
-             return 3
- 
-     except ValueError:
-         return None

214080.95%
edsnlp/pipes/ner/scores/charlson/patterns.py

Was already missing at lines 21-23

             return int(extracted_score)
-     except ValueError:
-         return None

132084.62%
edsnlp/pipes/ner/scores/base_score.py

Was already missing at line 154

             if value is None:
-                 continue
             normalized_value = self.score_normalization(value)

471097.87%
edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py

Was already missing at lines 114-117

         if use_tnm:
-             from edsnlp.pipes.ner.tnm import TNM
- 
-             self.tnm = TNM(nlp, pattern=None, attr="TEXT")
Was already missing at lines 119-129
     def process_tnm(self, doc):
-         spans = self.tnm.process(doc)
  ...
-             yield span
Was already missing at line 149
         if self.use_tnm:
-             yield from self.process_tnm(doc)

3512065.71%
edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py

Was already missing at line 106

                 if "peripheral" not in span._.assigned.keys():
-                     continue

151093.33%
edsnlp/pipes/ner/disorders/diabetes/diabetes.py

Was already missing at line 133

                 # Mostly FP
-                 continue
Was already missing at line 136
             elif self.has_far_complications(span):
-                 span._.status = 2
Was already missing at line 148
         if next(iter(self.complication_matcher(context)), None) is not None:
-             return True
         return False

313090.32%
edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py

Was already missing at line 102

                 # Huge change of FP / Title section
-                 continue

141092.86%
edsnlp/pipes/ner/disorders/ckd/ckd.py

Was already missing at lines 119-122

             dfg_value = float(dfg_span.text.replace(",", ".").strip())
-         except ValueError:
-             logger.trace(f"DFG value couldn't be extracted from {dfg_span.text}")
-             return False

293089.66%
edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py

Was already missing at lines 110-112

             if span._.source == "ischemia":
-                 if "brain" not in span._.assigned.keys():
-                     continue

172088.24%
edsnlp/pipes/ner/adicap/models.py

Was already missing at line 15

     def norm(self) -> str:
-         return self.code
Was already missing at line 18
     def __str__(self):
-         return self.norm()

142085.71%
edsnlp/pipes/misc/sections/sections.py

Was already missing at line 126

         if sections is None:
-             sections = patterns.sections
         sections = dict(sections)

451097.78%
edsnlp/pipes/misc/quantities/quantities.py

Was already missing at lines 147-149

     def __getitem__(self, item: int):
-         assert isinstance(item, int)
-         return [self][item]
Was already missing at lines 160-163
     def __eq__(self, other: Any):
-         if isinstance(other, SimpleQuantity):
-             return self.convert_to(other.unit) == other.value
-         return False
Was already missing at line 166
         if other.unit == self.unit:
-             return self.__class__(self.value + other.value, self.unit, self.registry)
         return self.__class__(
Was already missing at line 193
             return self.convert_to(other_unit)
-         except KeyError:
             raise AttributeError(f"Unit {other_unit} not found")
Was already missing at line 198
     def verify(cls, ent):
-         return True
Was already missing at line 237
     def __lt__(self, other: Union[SimpleQuantity, "RangeQuantity"]):
-         return max(self.convert_to(other.unit)) < min((part.value for part in other))
Was already missing at line 248
             return self.convert_to(other.unit) == other.value
-         return False
Was already missing at line 262
     def verify(cls, ent):
-         return True
Was already missing at line 866
         if snippet.end != last and doclike.doc[last: snippet.end].text.strip() == "":
-             pseudo.append("w")
         pseudo = "".join(pseudo)
Was already missing at line 1040
                             if start_line is None:
-                                 continue
Was already missing at lines 1071-1073
                         unit_norm = self.unit_followers[unit_before.label_]
-                 except (KeyError, AttributeError, IndexError):
-                     pass
Was already missing at line 1116
             ):
-                 ent = doc[unit_text.start: number.end]
             else:
Was already missing at lines 1123-1125
                 dims = self.unit_registry.parse_unit(unit_norm)[0]
-             except KeyError:
-                 continue
Was already missing at lines 1236-1238
                     last._.set(last.label_, new_value)
-                 except (AttributeError, TypeError):
-                     merged.append(ent)
             else:

43220095.37%
edsnlp/pipes/misc/dates/models.py

Was already missing at line 152

                     else:
-                         d["month"] = note_datetime.month
                 if self.day is None:
Was already missing at lines 156-162
             else:
-                 if self.year is None:
  ...
-                     d["day"] = default_day
Was already missing at lines 170-172
                 return dt
-             except ValueError:
-                 return None
Was already missing at line 188
         else:
-             return None
Was already missing at line 204
         if self.second:
-             norm += f"{self.second:02}s"

19611094.39%
edsnlp/pipes/misc/dates/dates.py

Was already missing at line 248

         if isinstance(absolute, str):
-             absolute = [absolute]
         if isinstance(relative, str):
Was already missing at line 250
         if isinstance(relative, str):
-             relative = [relative]
         if isinstance(duration, str):
Was already missing at line 252
         if isinstance(duration, str):
-             relative = [duration]
         if isinstance(false_positive, str):
Was already missing at lines 356-365
             if self.merge_mode == "align":
-                 alignments = align_spans(matches, spans, sort_by_overlap=True)
  ...
-                         matches.append(span)
Was already missing at line 450
             elif d1 in seen or v1.bound is None or v2.bound is None:
-                 continue
Was already missing at lines 461-463
                 if v1.mode == Mode.DURATION:
-                     m1 = Bound.FROM if v2.bound == Bound.UNTIL else Bound.UNTIL
-                     m2 = v2.mode or Bound.FROM
                 elif v2.mode == Mode.DURATION:

15215090.13%
edsnlp/pipes/misc/consultation_dates/consultation_dates.py

Was already missing at line 131

         else:
-             self.date_matcher = None
Was already missing at line 134
         if not consultation_mention:
-             consultation_mention = []
         elif consultation_mention is True:

482095.83%
edsnlp/pipes/core/normalizer/__init__.py

Was already missing at line 7

 def excluded_or_space_getter(t):
-     return t.is_space or t.tag_ == "EXCLUDED"

51080.00%
edsnlp/pipes/core/endlines/endlines.py

Was already missing at lines 151-155

         if end_lines_model is None:
-             path = build_path(__file__, "base_model.pkl")
- 
-             with open(path, "rb") as inp:
-                 self.model = pickle.load(inp)
         elif isinstance(end_lines_model, str):
Was already missing at lines 158-160
                 self.model = pickle.load(inp)
-         elif isinstance(end_lines_model, EndLinesModel):
-             self.model = end_lines_model
         else:
Was already missing at line 191
         ):
-             return "ENUMERATION"
Was already missing at line 278
         if np.isnan(sigma):
-             sigma = 1

877091.95%
edsnlp/pipes/core/contextual_matcher/models.py

Was already missing at lines 19-23

     if isinstance(v, list):
-         assert (
-             len(v) == 2
-         ), "`window` should be a tuple/list of two integer, or a single integer"
-         v = tuple(v)
     if isinstance(v, int):

1152098.26%
edsnlp/pipes/core/contextual_matcher/contextual_matcher.py

Was already missing at line 94

             )
-             label = label_name
         if label is None:
Was already missing at line 343
                 if assigned is None:
-                     continue
                 if replace_entity:

1432098.60%
edsnlp/patch_spacy.py

Was already missing at lines 67-69

             # if module is reloaded.
-             existing_func = registry.factories.get(internal_name)
-             if not util.is_same_func(factory_func, existing_func):
                 raise ValueError(

312093.55%
edsnlp/optimization.py

Was already missing at line 32

     def param_groups(self, value):
-         self.optim.param_groups = value
Was already missing at line 36
     def state(self):
-         return self.optim.state
Was already missing at line 40
     def state(self, value):
-         self.optim.state = value
Was already missing at line 89
     def __init__(self, groups):
-         self.param_groups = groups

774094.81%
edsnlp/matchers/simstring.py

Was already missing at line 280

     if custom:
-         attr = attr[1:].lower()
Was already missing at line 295
             if custom:
-                 token_text = getattr(token._, attr)
             else:

1462098.63%
edsnlp/language.py

Was already missing at line 103

             if last != begin:
-                 logger.warning(
                     "Missed some characters during"

511098.04%
edsnlp/data/standoff.py

Was already missing at line 43

     def __init__(self, ann_file, line):
-         super().__init__(f"File {ann_file}, unrecognized Brat line {line}")
Was already missing at line 83
     if not len(ann_paths):
-         return {
             "text": text,
Was already missing at line 197
                         )
-                 except Exception:
                     raise Exception(

1723098.26%
edsnlp/data/polars.py

Was already missing at line 26

         if hasattr(data, "collect"):
-             data = data.collect()
         assert isinstance(data, pl.DataFrame)

441097.73%
edsnlp/data/json.py

Was already missing at line 94

                     if not is_jsonl:
-                         obj[FILENAME] = filename
                     results.append(obj)
Was already missing at line 96
                     results.append(obj)
-             except Exception:
                 raise Exception(f"Cannot parse {filename}")

1072098.13%
edsnlp/data/converters.py

Was already missing at line 659

     if isinstance(converter, type) or kwargs_to_init:
-         return converter(**kwargs), {}
     return converter, validate_kwargs(converter, kwargs)

1921099.48%
edsnlp/data/base.py

Was already missing at lines 174-180

     """
-     data = LazyCollection.ensure_lazy(data)
-     if converter:
-         converter, kwargs = get_doc2dict_converter(converter, kwargs)
-         data = data.map(converter, kwargs=kwargs)
- 
-     return data

395087.18%
edsnlp/core/torch_component.py

Was already missing at line 390

             if hasattr(self, "compiled"):
-                 res = self.compiled(batch)
             else:
Was already missing at line 436
         """
-         return self.preprocess(doc)

1792098.88%
edsnlp/core/registries.py

Was already missing at line 78

             if obj.error is not None:
-                 raise obj.error

1641099.39%
edsnlp/core/pipeline.py

Was already missing at line 552

             if name in exclude:
-                 continue
             if name not in components:

4101099.76%
edsnlp/core/lazy_collection.py

Was already missing at line 51

     def __call__(self, *args, **kwargs):
-         return self.forward(*args, **kwargs)
Was already missing at line 448
         for name, pipe, *_ in self.torch_components():
-             pipe.to(device)
         return self

1512098.68%
edsnlp/connectors/omop.py

Was already missing at line 69

         if not isinstance(row.ents, list):
-             continue
Was already missing at line 87
             else:
-                 doc.spans[span.label_].append(span)
Was already missing at line 127
     if df.note_id.isna().any():
-         df["note_id"] = range(len(df))
Was already missing at line 171
         if i > 0:
-             df.term_modifiers += ";"
         df.term_modifiers += ext + "=" + df[ext].astype(str)

844095.24%

264 files skipped due to complete coverage.

Coverage success: total of 97.76% is above 97.76% 🎉