hltcoe / patapsco

Cross language information retrieval pipeline
Other
18 stars 6 forks source link

JSONDecodeError when using demo notebook PSQ pipeline #45

Closed andreaschari closed 2 years ago

andreaschari commented 2 years ago

When running demo-ecir.ipynb, the PSQ pipeline crashes with a JSONDecodeError.

2022-06-10 08:20:38,675 - patapsco.run - INFO - Patapsco version 1.0.0-dev
2022-06-10 08:20:38,677 - patapsco.run - INFO - Writing output to: /nfs/patapsco/samples/notebooks/runs/PSQ
2022-06-10 08:20:38,684 - patapsco.retrieve - INFO - Index location: /nfs/patapsco/samples/notebooks/runs/query-translation/index
2022-06-10 08:20:38,685 - patapsco.job - INFO - Stage 2 is a streaming pipeline.
2022-06-10 08:20:38,685 - patapsco.job - INFO - Stage 2 pipeline: Hc4JsonTopicReader | TopicProcessor | QueryProcessor | QueryWriter | PyseriniRetriever | JsonResultsWriter | TrecResultsWriter
2022-06-10 08:20:38,685 - patapsco.job - INFO - Starting run: PSQ
2022-06-10 08:20:38,686 - patapsco.job - INFO - Stage 2: Starting processing of topics
2022-06-10 08:20:39,320 - patapsco.text - INFO - Loading the xx spacy model

---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
<ipython-input-6-a1ecf69e214e> in <module>
     65 
     66 runner = patapsco.Runner(config_psq)
---> 67 runner.run()

~/.local/lib/python3.7/site-packages/patapsco/run.py in run(self)
     39     def run(self):
     40         sub_job_flag = self.job_type == JobType.MAP
---> 41         self.job.run(sub_job=sub_job_flag)
     42 
     43     @staticmethod

~/.local/lib/python3.7/site-packages/patapsco/job.py in run(self, sub_job)
     89         LOGGER.info("Starting run: %s", self.conf.run.name)
     90 
---> 91         report = self._run()
     92 
     93         if not sub_job:

~/.local/lib/python3.7/site-packages/patapsco/job.py in _run(self)
    151             LOGGER.info("Stage 2: Starting processing of topics")
    152             with timer2:
--> 153                 self.stage2.run()
    154             report.stage2 = StageReport(self.stage2.count, self.stage2.report)
    155             LOGGER.info("Stage 2: Processed %d topics", self.stage2.count)

~/.local/lib/python3.7/site-packages/patapsco/pipeline.py in run(self)
    164 
    165     def run(self):
--> 166         self.begin()
    167         for item in self.iterator:
    168             for task in self.tasks:

~/.local/lib/python3.7/site-packages/patapsco/pipeline.py in begin(self)
    138         self.count = 0
    139         for task in self.tasks:
--> 140             task.begin()
    141 
    142     def end(self):

~/.local/lib/python3.7/site-packages/patapsco/pipeline.py in begin(self)
     98 
     99     def begin(self):
--> 100         self.task.begin()
    101 
    102     def end(self):

~/.local/lib/python3.7/site-packages/patapsco/topics.py in begin(self)
    526             processor = TextProcessor(self.run_path, text_config, self.psq_config.lang)
    527             processor.begin()  # load models
--> 528             self.generator = PSQGenerator(processor, self.psq_config.path, self.psq_config.threshold)
    529         elif self.parse:
    530             self.generator = LuceneQueryGenerator(self)

~/.local/lib/python3.7/site-packages/patapsco/topics.py in __init__(self, processor, psq_path, threshold)
    427         super().__init__(processor)
    428         try:
--> 429             self.psq_table = parse_psq_table(psq_path, threshold)
    430         except OSError as e:
    431             raise ConfigError(f"Unable to load PSQ translation table: {e}")

~/.local/lib/python3.7/site-packages/patapsco/util/formats.py in parse_psq_table(path, threshold)
    136     norm = functools.partial(normalize_psq_entry, cum_thresh=threshold)
    137     with open(path) as fp:
--> 138         trans_table = json.load(fp)
    139         return {k: norm(v) for k, v in trans_table.items()}

/opt/conda/lib/python3.7/json/__init__.py in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    294         cls=cls, object_hook=object_hook,
    295         parse_float=parse_float, parse_int=parse_int,
--> 296         parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
    297 
    298 

/opt/conda/lib/python3.7/json/__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    346             parse_int is None and parse_float is None and
    347             parse_constant is None and object_pairs_hook is None and not kw):
--> 348         return _default_decoder.decode(s)
    349     if cls is None:
    350         cls = JSONDecoder

/opt/conda/lib/python3.7/json/decoder.py in decode(self, s, _w)
    335 
    336         """
--> 337         obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338         end = _w(s, end).end()
    339         if end != len(s):

/opt/conda/lib/python3.7/json/decoder.py in raw_decode(self, s, idx)
    353             obj, end = self.scan_once(s, idx)
    354         except StopIteration as err:
--> 355             raise JSONDecodeError("Expecting value", s, err.value) from None
    356         return obj, end

JSONDecodeError: Expecting value: line 8 column 1 (char 7)
eugene-yang commented 2 years ago

Thanks for raising this. I had put in the wrong link for the PSQ table. It should be fixed now. Please reopen this issue if you still cannot run it :)