aphp / edspdf

EDS-PDF is a generic, pure-Python framework for text extraction from PDF documents. It provides the machinery to use rule-based or machine-learning-based approaches to classify text blocks as either body or metadata.
https://aphp.github.io/edspdf/
BSD 3-Clause "New" or "Revised" License
41 stars 6 forks source link

chore: bump version to 0.9.2 #30

Closed percevalw closed 3 days ago

percevalw commented 3 days ago

Changed

Fixed

sonarcloud[bot] commented 3 days ago

Quality Gate passed

Issues
0 New issues
0 Accepted issues

Measures
0 Security Hotspots
0.0% Coverage on New Code
0.0% Duplication on New Code

See analysis details on SonarQube Cloud

github-actions[bot] commented 3 days ago

Coverage Report

Name | Stmts | Miss | ∆ Miss | Cover
TOTAL | 3134 | 76 | 0 | 97.57%
Files without new missing coverage
Name | Stmts | Miss | ∆ Miss | Cover
edspdf/visualization/annotations.py

Was already missing at line 67

     elif isinstance(colors, list):
-         colors = {label: color for label, color in zip(unique_labels, colors)}

Stmts: 32 | Miss: 1 | ∆ Miss: 0 | Cover: 96.88%
edspdf/utils/optimization.py

Was already missing at line 29

     def param_groups(self, value):
-         self.optim.param_groups = value
Was already missing at line 33
     def state(self):
-         return self.optim.state
Was already missing at line 37
     def state(self, value):
-         self.optim.state = value

Stmts: 68 | Miss: 3 | ∆ Miss: 0 | Cover: 95.59%
edspdf/utils/lazy_module.py

Was already missing at line 92

         """
-         return __all__

Stmts: 31 | Miss: 1 | ∆ Miss: 0 | Cover: 96.77%
edspdf/utils/file_system.py

Was already missing at line 26

 ) -> list:
-     return [
         os.path.join(dirpath, f)

Stmts: 24 | Miss: 1 | ∆ Miss: 0 | Cover: 95.83%
edspdf/utils/collections.py

Was already missing at line 149

     def __getstate__(self):
-         return {"seq": self.seq}
Was already missing at lines 152-154
     def __setstate__(self, state):
-         self.seq = state["seq"]
-         self.flatten = None
Was already missing at lines 252-254
         base[attr] = val
-     except (KeyError, TypeError):
-         setattr(base, attr, val)
     return base

Stmts: 166 | Miss: 5 | ∆ Miss: 0 | Cover: 96.99%
edspdf/utils/alignment.py

Was already missing at line 19

     if len(src_boxes) == 0 or len(dst_boxes) == 0:
-         return []

Stmts: 30 | Miss: 1 | ∆ Miss: 0 | Cover: 96.67%
edspdf/trainable_pipe.py

Was already missing at line 76

         if cache_key in cache:
-             return cache[cache_key]
         res = fn(self, doc)
Was already missing at lines 349-357
         """
-         batch = [
  ...
-         return batch
Was already missing at line 380
             if hasattr(self, "compiled"):
-                 res = self.compiled(batch)
             else:
Was already missing at line 459
                 if pipe_overrides:
-                     overrides[name] = pipe_overrides
         tensor_dict = {

Stmts: 202 | Miss: 8 | ∆ Miss: 0 | Cover: 96.04%
edspdf/structures.py

Was already missing at line 186

     def page(self):
-         return next(p for p in self.doc.pages if p.page_num == self.page_num)
Was already missing at line 192
         if self_page_num < other_page_num:
-             return True
         if self_page_num > other_page_num:
Was already missing at line 194
         if self_page_num > other_page_num:
-             return False
Was already missing at line 222

-         return ((self.y0 + self.y1) / 2, (self.x0 + self.x1) / 2) < (
             (other.y0 + other.y1) / 2,
Was already missing at line 250
     def __str__(self):
-         return self.text

Stmts: 97 | Miss: 5 | ∆ Miss: 0 | Cover: 94.85%
edspdf/registry.py

Was already missing at lines 112-114

                     raise
-                 except ConfitValidationError as e:
-                     errors.append(e.raw_errors)
             if not errors:

Stmts: 90 | Miss: 2 | ∆ Miss: 0 | Cover: 97.78%
edspdf/processing/utils.py

Was already missing at line 15

                 if isinstance(res, types.GeneratorType):
-                     results.extend(res)
                 else:
Was already missing at lines 66-78
 ) -> Iterable[List[T]]:
-     batch = []
  ...
-         yield batch

Stmts: 55 | Miss: 13 | ∆ Miss: 0 | Cover: 76.36%
edspdf/processing/simple.py

Was already missing at lines 27-29

         no_grad = sys.modules["torch"].no_grad
-     except (KeyError, AttributeError):
-         no_grad = nullcontext
     reader = lc.reader

Stmts: 51 | Miss: 2 | ∆ Miss: 0 | Cover: 96.08%
edspdf/processing/multiprocessing.py

Was already missing at lines 226-230

 if os.environ.get("TORCH_SHARING_STRATEGY"):
-     try:
-         torch.multiprocessing.set_sharing_strategy(os.environ["TORCH_SHARING_STRATEGY"])
-     except NameError:
-         pass
Was already missing at line 248
         def save_align_devices_hook(pickler: Any, obj: Any):
-             pickler.save_reduce(load_align_devices_hook, (obj.__dict__,), obj=obj)
Was already missing at lines 251-258
         def load_align_devices_hook(state):
-             state["execution_device"] = MAP_LOCATION
  ...
-         AlignDevicesHook = None
Was already missing at line 416
                 if lc.sort_chunks:
-                     docs.sort(
                         key=doc_size_fns.get(
Was already missing at line 456

-             new_batch_iterator = None
Was already missing at line 495
                 if task is None and stage == self.exchanger.num_stages + 1:
-                     return
                 # Non prioritized STOP signal: there are no more tasks to process
Was already missing at lines 577-579
                     else:
-                         batch = gpu_pipe.prepare_batch(docs, device=device)
-                         inputs = None
                     active_batches[batch_id] = (docs, task_id, inputs)
Was already missing at line 1016
                 if v is not None:
-                     os.environ[k] = v

Stmts: 406 | Miss: 17 | ∆ Miss: 0 | Cover: 95.81%
edspdf/pipes/extractors/pdfminer.py

Was already missing at line 161

                     if len(text) == 0:
-                         continue
                     content_boxes.append(
Was already missing at line 222
             else:
-                 fontname, italic, bold = (None, None, None)
         else:

Stmts: 88 | Miss: 2 | ∆ Miss: 0 | Cover: 97.73%
edspdf/pipeline.py

Was already missing at line 992

             if overrides:
-                 config = config.merge(overrides)
             pwd = os.getcwd()

Stmts: 335 | Miss: 1 | ∆ Miss: 0 | Cover: 99.70%
edspdf/lazy_collection.py

Was already missing at lines 323-326

         """Moves the pipeline to a given device"""
-         for name, pipe, *_ in self.torch_components():
-             pipe.to(device)
-         return self

Stmts: 120 | Miss: 3 | ∆ Miss: 0 | Cover: 97.50%
edspdf/layers/relative_attention.py

Was already missing at lines 157-159

         if head_size is None and key_size is not None:
-             assert key_size % n_heads == 0
-             head_size = key_size // n_heads
         value_head_size = None
Was already missing at line 172
         ):
-             self.register_buffer("position_embedding", position_embedding)
         else:
Was already missing at line 177
         if same_key_query_proj:
-             self.content_query_proj = self.content_key_proj
         else:
Was already missing at line 196
             if same_key_query_proj or same_positional_key_query_proj:
-                 self.position_query_proj = self.position_key_proj
             else:
Was already missing at line 355
             if mask.ndim == 3:
-                 mask = mask[:, :, :, None]
Was already missing at line 366

-         return attn

Stmts: 117 | Miss: 7 | ∆ Miss: 0 | Cover: 94.02%
edspdf/data/parquet.py

Was already missing at line 49

             # read in worker -> each task is a non yet parsed line
-             return (
                 (line, 1)

Stmts: 107 | Miss: 1 | ∆ Miss: 0 | Cover: 99.07%
edspdf/data/pandas.py

Was already missing at line 101

             if isinstance(rec, dict):
-                 rec.pop(FILENAME, None)
         return records, len(records)

Stmts: 44 | Miss: 1 | ∆ Miss: 0 | Cover: 97.73%
edspdf/data/files.py

Was already missing at lines 100-102

             if self.load_annotations and self.filesystem.exists(json_path):
-                 with self.filesystem.open(json_path) as f:
-                     record["annotations"] = json.load(f)

Stmts: 102 | Miss: 2 | ∆ Miss: 0 | Cover: 98.04%

35 files skipped due to complete coverage.

Coverage success: total of 97.57% is above 97.57% 🎉