lean-dojo / LeanDojo

Tool for data extraction and interacting with Lean programmatically.
https://leandojo.org
MIT License
478 stars 72 forks source link

Minor bug about transitive_dep_graph missing '/lake/packages/std/Std/WF.lean' during locate_premise #145

Closed zoryzhang closed 4 months ago

zoryzhang commented 4 months ago

Description The following code works well for v5 benchmark, but fails when changing to v6 (https://zenodo.org/records/10800866). Given this error, will there be any chance for you to keep the v5 benchmark on aws remote cache before this is fixed? Thank you.

data = RetrievalDataModule(
    model_name="kaiyuy/leandojo-lean4-retriever-byt5-small",
    data_path= "/Users/zory/Desktop/work/AI2Reason/analogyTP/data/leandojo_benchmark_4/novel_premises/",
    corpus_path= "/Users/zory/Desktop/work/AI2Reason/analogyTP/data/leandojo_benchmark_4/corpus.jsonl",
    num_negatives= 3,
    num_in_file_negatives= 1,
    batch_size= 8,
    eval_batch_size= 64,
    max_seq_len= 1024,
    num_workers= 4,
)
data.setup("fit")

Detailed Steps to Reproduce the Behavior Simply run the above code. OTAH, data.setup("validate") does not throw any error.

Logs in Debug Mode Set the environment variable VERBOSE=1 and paste the logs here.

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
[/Users/zory/Desktop/work/AI2Reason/analogyTP/Experiments/Zory/zory.ipynb](Experiments/Zory/zory.ipynb) Cell 12 line 1
      [2](vscode-notebook-cell:/Users/zory/Desktop/work/AI2Reason/analogyTP/Experiments/Zory/zory.ipynb#X56sZmlsZQ%3D%3D?line=1) logger.add(sys.stderr, level="DEBUG")
      [3](vscode-notebook-cell:/Users/zory/Desktop/work/AI2Reason/analogyTP/Experiments/Zory/zory.ipynb#X56sZmlsZQ%3D%3D?line=2) data = RetrievalDataModule(
      [4](vscode-notebook-cell:/Users/zory/Desktop/work/AI2Reason/analogyTP/Experiments/Zory/zory.ipynb#X56sZmlsZQ%3D%3D?line=3)     model_name="kaiyuy/leandojo-lean4-retriever-byt5-small",
      [5](vscode-notebook-cell:/Users/zory/Desktop/work/AI2Reason/analogyTP/Experiments/Zory/zory.ipynb#X56sZmlsZQ%3D%3D?line=4)     data_path= "[/Users/zory/Desktop/work/AI2Reason/analogyTP/data/leandojo_benchmark_4/novel_premises/](data/leandojo_benchmark_4/novel_premises/)",
   (...)
     [12](vscode-notebook-cell:/Users/zory/Desktop/work/AI2Reason/analogyTP/Experiments/Zory/zory.ipynb#X56sZmlsZQ%3D%3D?line=11)     num_workers= 4,
     [13](vscode-notebook-cell:/Users/zory/Desktop/work/AI2Reason/analogyTP/Experiments/Zory/zory.ipynb#X56sZmlsZQ%3D%3D?line=12) )
---> [14](vscode-notebook-cell:/Users/zory/Desktop/work/AI2Reason/analogyTP/Experiments/Zory/zory.ipynb#X56sZmlsZQ%3D%3D?line=13) data.setup("fit")

File [~/Desktop/work/AI2Reason/analogyTP/ReProver/retrieval/datamodule.py:237](ReProver/retrieval/datamodule.py:237), in RetrievalDataModule.setup(self, stage)
    [235](ReProver/retrieval/datamodule.py:235) def setup(self, stage: Optional[str] = None) -> None:
    [236](ReProver/retrieval/datamodule.py:236)     if stage in (None, "fit"):
--> [237](ReProver/retrieval/datamodule.py:237)         self.ds_train = RetrievalDataset(
    [238](ReProver/retrieval/datamodule.py:238)             [os.path.join(self.data_path, "train.json")],
    [239](ReProver/retrieval/datamodule.py:239)             self.uses_lean4,
    [240](ReProver/retrieval/datamodule.py:240)             self.corpus,
    [241](ReProver/retrieval/datamodule.py:241)             self.num_negatives,
    [242](ReProver/retrieval/datamodule.py:242)             self.num_in_file_negatives,
    [243](ReProver/retrieval/datamodule.py:243)             self.max_seq_len,
    [244](ReProver/retrieval/datamodule.py:244)             self.tokenizer,
    [245](ReProver/retrieval/datamodule.py:245)             is_train=True,
    [246](ReProver/retrieval/datamodule.py:246)         )
    [248](ReProver/retrieval/datamodule.py:248)     if stage in (None, "fit", "validate"):
    [249](ReProver/retrieval/datamodule.py:249)         self.ds_val = RetrievalDataset(
    [250](ReProver/retrieval/datamodule.py:250)             [os.path.join(self.data_path, "val.json")],
    [251](ReProver/retrieval/datamodule.py:251)             self.uses_lean4,
   (...)
    [257](ReProver/retrieval/datamodule.py:257)             is_train=False,
    [258](ReProver/retrieval/datamodule.py:258)         )

File [~/Desktop/work/AI2Reason/analogyTP/ReProver/retrieval/datamodule.py:40](ReProver/retrieval/datamodule.py:40), in RetrievalDataset.__init__(self, data_paths, uses_lean4, corpus, num_negatives, num_in_file_negatives, max_seq_len, tokenizer, is_train)
     [38](ReProver/retrieval/datamodule.py:38) self.tokenizer = tokenizer
     [39](ReProver/retrieval/datamodule.py:39) self.is_train = is_train
---> [40](ReProver/retrieval/datamodule.py:40) self.data = list(
     [41](ReProver/retrieval/datamodule.py:41)     itertools.chain.from_iterable(self._load_data(path) for path in data_paths)
     [42](ReProver/retrieval/datamodule.py:42) )

File [~/Desktop/work/AI2Reason/analogyTP/ReProver/retrieval/datamodule.py:41](ReProver/retrieval/datamodule.py:41), in <genexpr>(.0)
     [38](ReProver/retrieval/datamodule.py:38) self.tokenizer = tokenizer
     [39](ReProver/retrieval/datamodule.py:39) self.is_train = is_train
     [40](ReProver/retrieval/datamodule.py:40) self.data = list(
---> [41](ReProver/retrieval/datamodule.py:41)     itertools.chain.from_iterable(self._load_data(path) for path in data_paths)
     [42](ReProver/retrieval/datamodule.py:42) )

File [~/Desktop/work/AI2Reason/analogyTP/ReProver/retrieval/datamodule.py:56](ReProver/retrieval/datamodule.py:56), in RetrievalDataset._load_data(self, data_path)
     [52](ReProver/retrieval/datamodule.py:52) state = format_state(tac["state_before"])
     [53](ReProver/retrieval/datamodule.py:53) context = Context(
     [54](ReProver/retrieval/datamodule.py:54)     file_path, thm["full_name"], Pos(*thm["start"]), state
     [55](ReProver/retrieval/datamodule.py:55) )
---> [56](ReProver/retrieval/datamodule.py:56) all_pos_premises = get_all_pos_premises(
     [57](ReProver/retrieval/datamodule.py:57)     tac["annotated_tactic"], self.corpus
     [58](ReProver/retrieval/datamodule.py:58) )
     [60](ReProver/retrieval/datamodule.py:60) if self.is_train:
     [61](ReProver/retrieval/datamodule.py:61)     # In training, we ignore tactics that do not have any premises.
     [62](ReProver/retrieval/datamodule.py:62)     for pos_premise in all_pos_premises:

File [~/Desktop/work/AI2Reason/analogyTP/ReProver/common.py:347](ReProver/common.py:347), in get_all_pos_premises(annot_tac, corpus)
    [345](ReProver/common.py:345) for prov in provenances:
    [346](ReProver/common.py:346)     def_path = prov["def_path"]
--> [347](ReProver/common.py:347)     p = corpus.locate_premise(def_path, Pos(*prov["def_pos"]))
    [348](ReProver/common.py:348)     if p is not None:
    [349](ReProver/common.py:349)         all_pos_premises.add(p)

File [~/Desktop/work/AI2Reason/analogyTP/ReProver/common.py:258](ReProver/common.py:258), in Corpus.locate_premise(self, path, pos)
    [253](ReProver/common.py:253) def locate_premise(self, path: str, pos: Pos) -> Optional[Premise]:
    [254](ReProver/common.py:254)     """Return a premise at position ``pos`` in file ``path``.
    [255](ReProver/common.py:255) 
    [256](ReProver/common.py:256)     Return None if no such premise can be found.
    [257](ReProver/common.py:257)     """
--> [258](ReProver/common.py:258)     for p in self.get_premises(path):
    [259](ReProver/common.py:259)         assert p.path == path
    [260](ReProver/common.py:260)         if p.start <= pos <= p.end:

File [~/Desktop/work/AI2Reason/analogyTP/ReProver/common.py:247](ReProver/common.py:247), in Corpus.get_premises(self, path)
    [245](ReProver/common.py:245) def get_premises(self, path: str) -> List[Premise]:
    [246](ReProver/common.py:246)     """Return a list of premises defined in the file ``path``."""
--> [247](ReProver/common.py:247)     return self._get_file(path).premises

File [~/Desktop/work/AI2Reason/analogyTP/ReProver/common.py:222](ReProver/common.py:222), in Corpus._get_file(self, path)
    [221](ReProver/common.py:221) def _get_file(self, path: str) -> File:
--> [222](ReProver/common.py:222)     return self.transitive_dep_graph.nodes[path]["file"]

File [~/miniforge3/envs/py311/lib/python3.11/site-packages/networkx/classes/reportviews.py:194](Experiments/Zory/~/miniforge3/envs/py311/lib/python3.11/site-packages/networkx/classes/reportviews.py:194), in NodeView.__getitem__(self, n)
    [189](Experiments/Zory/~/miniforge3/envs/py311/lib/python3.11/site-packages/networkx/classes/reportviews.py:189) if isinstance(n, slice):
    [190](Experiments/Zory/~/miniforge3/envs/py311/lib/python3.11/site-packages/networkx/classes/reportviews.py:190)     raise nx.NetworkXError(
    [191](Experiments/Zory/~/miniforge3/envs/py311/lib/python3.11/site-packages/networkx/classes/reportviews.py:191)         f"{type(self).__name__} does not support slicing, "
    [192](Experiments/Zory/~/miniforge3/envs/py311/lib/python3.11/site-packages/networkx/classes/reportviews.py:192)         f"try list(G.nodes)[{n.start}:{n.stop}:{n.step}]"
    [193](Experiments/Zory/~/miniforge3/envs/py311/lib/python3.11/site-packages/networkx/classes/reportviews.py:193)     )
--> [194](Experiments/Zory/~/miniforge3/envs/py311/lib/python3.11/site-packages/networkx/classes/reportviews.py:194) return self._nodes[n]

KeyError: '/lake/packages/std/Std/WF.lean'

Platform Information

yangky11 commented 4 months ago

I'll take a look tmr.

yangky11 commented 4 months ago

I have fixed this bug and updated LeanDojo, ReProver, and the dataset on Zenodo. Could you please try deleting your local cache and upgrading to the latest versions? Let me know if you have any other issues!