Open localcitizen opened 5 months ago
The training script from the main documentation does not work
from colbert.infra.run import Run
from colbert.infra.config import ColBERTConfig, RunConfig
from colbert import Trainer


def train():
    """Run ColBERT training via ``Trainer.train`` from the ``colbert-ir/colbertv1.9`` checkpoint.

    Reads the triples, queries and collection from fixed relative paths
    under ``data/`` and launches training inside a ``Run`` context
    configured with ``nranks=1``.
    """
    # Everything below executes inside the run context created with nranks=1.
    with Run().context(RunConfig(nranks=1)):
        # Input files, given as paths relative to the working directory.
        triples = 'data/triples.train.colbert.jsonl'
        queries = 'data/queries.train.colbert.tsv'
        collection = 'data/corpus.train.colbert.tsv'

        # Training hyperparameters passed to ColBERT.
        config = ColBERTConfig(bsize=32, lr=1e-05, warmup=20_000,
                               doc_maxlen=180, dim=128,
                               attend_to_mask_tokens=False, nway=64,
                               accumsteps=1, similarity='cosine',
                               use_ib_negatives=True)

        trainer = Trainer(triples=triples, queries=queries,
                          collection=collection, config=config)
        # NOTE(review): the traceback reported with this script is raised from
        # within this call (colbert/training/training.py, DistributedDataParallel).
        trainer.train(checkpoint='colbert-ir/colbertv1.9')


if __name__ == '__main__':
    train()
The output is as follows:
/Users/user/Documents/dev/clbt_env/bin/python /Users/user/Documents/dev/data/Colbert_train_v2.py #> Starting... /Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn( { "query_token_id": "[unused0]", "doc_token_id": "[unused1]", "query_token": "[Q]", "doc_token": "[D]", "ncells": null, "centroid_score_threshold": null, "ndocs": null, "load_index_with_mmap": false, "index_path": null, "index_bsize": 64, "nbits": 1, "kmeans_niters": 4, "resume": false, "similarity": "cosine", "bsize": 32, "accumsteps": 1, "lr": 1e-5, "maxsteps": 500000, "save_every": null, "warmup": 20000, "warmup_bert": null, "relu": false, "nway": 64, "use_ib_negatives": true, "reranker": false, "distillation_alpha": 1.0, "ignore_scores": false, "model_name": null, "query_maxlen": 32, "attend_to_mask_tokens": false, "interaction": "colbert", "dim": 128, "doc_maxlen": 180, "mask_punctuation": true, "checkpoint": "colbert-ir\/colbertv1.9", "triples": "data\/triples.train.colbert.jsonl", "collection": "data\/corpus.train.colbert.tsv", "queries": "data\/queries.train.colbert.tsv", "index_name": null, "overwrite": false, "root": "\/Users\/user\/Documents\/dev\/data\/experiments", "experiment": "default", "index_root": null, "name": "2024-06\/07\/20.42.54", "rank": 0, "nranks": 1, "amp": true, "gpus": 0, "avoid_fork_if_possible": false } Using config.bsize = 32 (per process) and config.accumsteps = 1 [Jun 07, 20:43:00] #> Loading the queries from data/queries.train.colbert.tsv ... [Jun 07, 20:43:00] #> Got 2 queries. All QIDs are unique. [Jun 07, 20:43:00] #> Loading collection... 0M [Jun 07, 20:43:01] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)... 
Process Process-1: Traceback (most recent call last): File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap self.run() File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/colbert/infra/launcher.py", line 134, in setup_new_process return_val = callee(config, *args) File "/Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/colbert/training/training.py", line 55, in train colbert = torch.nn.parallel.DistributedDataParallel(colbert, device_ids=[config.rank], File "/Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 705, in __init__ self._log_and_throw( File "/Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1089, in _log_and_throw raise err_type(err_msg) ValueError: DistributedDataParallel device_ids and output_device arguments only work with single-device/multiple-device GPU modules or CPU modules, but got device_ids [0], output_device 0, and module parameters {device(type='cpu')}.
I found a couple of points related to the ValueError problem:
Problem with accelerator ddp2 Problem with moving model to args
Unfortunately, I could not work out how they might help to solve the problem.
Python libs: colbert-ai==0.2.19, transformers==4.41.2, torch==2.3.0
Workstation: MacBook Pro, Apple M1 Pro chip, macOS 14.5 (23F79)
Could you share how it could be resolved, please?
I’m encountering the same issue. Do you have a solution on your end?
The training script from the main documentation does not work
The output is as follows:
I found a couple of points related to the ValueError problem:
Problem with accelerator ddp2 Problem with moving model to args
Unfortunately, I could not work out how they might help to solve the problem.
Python libs: colbert-ai==0.2.19, transformers==4.41.2, torch==2.3.0
Workstation: MacBook Pro, Apple M1 Pro chip, macOS 14.5 (23F79)
Could you share how it could be resolved, please?