Open localcitizen opened 3 weeks ago
The training script from the main documentation does not work
from colbert.infra.run import Run
from colbert.infra.config import ColBERTConfig, RunConfig
from colbert import Trainer


def train():
    """Train ColBERT on the local triples, queries and collection files.

    Runs inside a single-rank ``Run`` context and starts training from the
    ``colbert-ir/colbertv1.9`` checkpoint.
    """
    with Run().context(RunConfig(nranks=1)):
        # Training inputs, handed to the Trainer as file paths.
        data_files = dict(
            triples='data/triples.train.colbert.jsonl',
            queries='data/queries.train.colbert.tsv',
            collection='data/corpus.train.colbert.tsv',
        )

        # Hyperparameters are constructed inside the Run context.
        config = ColBERTConfig(
            bsize=32,
            lr=1e-05,
            warmup=20_000,
            doc_maxlen=180,
            dim=128,
            attend_to_mask_tokens=False,
            nway=64,
            accumsteps=1,
            similarity='cosine',
            use_ib_negatives=True,
        )

        trainer = Trainer(config=config, **data_files)
        trainer.train(checkpoint='colbert-ir/colbertv1.9')


if __name__ == '__main__':
    train()
The output is as follows:
/Users/user/Documents/dev/clbt_env/bin/python /Users/user/Documents/dev/data/Colbert_train_v2.py #> Starting... /Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn( { "query_token_id": "[unused0]", "doc_token_id": "[unused1]", "query_token": "[Q]", "doc_token": "[D]", "ncells": null, "centroid_score_threshold": null, "ndocs": null, "load_index_with_mmap": false, "index_path": null, "index_bsize": 64, "nbits": 1, "kmeans_niters": 4, "resume": false, "similarity": "cosine", "bsize": 32, "accumsteps": 1, "lr": 1e-5, "maxsteps": 500000, "save_every": null, "warmup": 20000, "warmup_bert": null, "relu": false, "nway": 64, "use_ib_negatives": true, "reranker": false, "distillation_alpha": 1.0, "ignore_scores": false, "model_name": null, "query_maxlen": 32, "attend_to_mask_tokens": false, "interaction": "colbert", "dim": 128, "doc_maxlen": 180, "mask_punctuation": true, "checkpoint": "colbert-ir\/colbertv1.9", "triples": "data\/triples.train.colbert.jsonl", "collection": "data\/corpus.train.colbert.tsv", "queries": "data\/queries.train.colbert.tsv", "index_name": null, "overwrite": false, "root": "\/Users\/user\/Documents\/dev\/data\/experiments", "experiment": "default", "index_root": null, "name": "2024-06\/07\/20.42.54", "rank": 0, "nranks": 1, "amp": true, "gpus": 0, "avoid_fork_if_possible": false } Using config.bsize = 32 (per process) and config.accumsteps = 1 [Jun 07, 20:43:00] #> Loading the queries from data/queries.train.colbert.tsv ... [Jun 07, 20:43:00] #> Got 2 queries. All QIDs are unique. [Jun 07, 20:43:00] #> Loading collection... 0M [Jun 07, 20:43:01] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)... 
Process Process-1:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/colbert/infra/launcher.py", line 134, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/colbert/training/training.py", line 55, in train
    colbert = torch.nn.parallel.DistributedDataParallel(colbert, device_ids=[config.rank],
  File "/Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 705, in __init__
    self._log_and_throw(
  File "/Users/user/Documents/dev/clbt_env/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1089, in _log_and_throw
    raise err_type(err_msg)
ValueError: DistributedDataParallel device_ids and output_device arguments only work with single-device/multiple-device GPU modules or CPU modules, but got device_ids [0], output_device 0, and module parameters {device(type='cpu')}.
I found a couple of discussions related to this ValueError:
- Problem with accelerator ddp2
- Problem with moving model to args
Unfortunately, I could not work out how they might help to resolve the problem.
Python libraries: colbert-ai==0.2.19, transformers==4.41.2, torch==2.3.0
Workstation: MacBook Pro, Apple M1 Pro chip, macOS 14.5 (23F79)
Could you share how it could be resolved, please?
The training script from the main documentation does not work
The output is as follows:
I found a couple of discussions related to this ValueError:
- Problem with accelerator ddp2
- Problem with moving model to args
Unfortunately, I could not work out how they might help to resolve the problem.
Python libraries: colbert-ai==0.2.19, transformers==4.41.2, torch==2.3.0
Workstation: MacBook Pro, Apple M1 Pro chip, macOS 14.5 (23F79)
Could you share how it could be resolved, please?