Open TalalAhmed311 opened 1 year ago
I'm having the same problem:
Traceback (most recent call last):
File "/usr/local/easybuild_allnodes/software/Python/3.10.4-GCCcore-11.3.0/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/easybuild_allnodes/software/Python/3.10.4-GCCcore-11.3.0/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/asy51/repos/graphmaster/sweep.py", line 53, in agent
wandb.agent(sweep_id, function=run, count=None, project='chess')
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/wandb_agent.py", line 637, in agent
wandb_sdk.wandb_login._login(_silent=True)
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_login.py", line 277, in _login
wlogin.setup(kwargs)
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_login.py", line 111, in setup
_logger = wandb.setup()._get_logger()
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_setup.py", line 327, in setup
ret = _setup(settings=settings)
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_setup.py", line 320, in _setup
wl = _WandbSetup(settings=settings)
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_setup.py", line 303, in __init__
_WandbSetup._instance = _WandbSetup__WandbSetup(settings=settings, pid=pid)
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_setup.py", line 114, in __init__
self._setup()
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_setup.py", line 250, in _setup
self._setup_manager()
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_setup.py", line 277, in _setup_manager
self._manager = wandb_manager._Manager(settings=self._settings)
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_manager.py", line 163, in __init__
wandb._sentry.reraise(e)
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/analytics/sentry.py", line 146, in reraise
raise exc.with_traceback(sys.exc_info()[2])
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_manager.py", line 161, in __init__
self._service_connect()
File "/home/asy51/venv/lib/python3.10/site-packages/wandb/sdk/wandb_manager.py", line 125, in _service_connect
raise ManagerConnectionRefusedError(message)
wandb.sdk.wandb_manager.ManagerConnectionRefusedError: Connection to wandb service failed: [Errno 111] Connection refused.
I integrated my model with wandb, and when training crashed I tried to resume the run, but it threw an error.
Command for Running:
!python train.py --workers 8 --device 0 --epochs 200 --batch-size 8 --data data/data.yaml --project yolov7-exp05 --save_period 1 --weights yolov7.pt --img 640 640 --cfg cfg/training/yolov7.yaml --name yolov7 --hyp data/hyp.scratch.p5.yaml
As you can see above, I don't use the
--upload_dataset
parameter. Is this option required in order to resume a run?
Resume Run:
!python train.py --resume wandb-artifact://liqteq/yolov7-exp05/3rrt5k5i
Error Message:
YOLOR 🚀 v0.1-121-g2fdc7f1 torch 1.13.0+cu116 CUDA:0 (Tesla T4, 15109.75MB)
Namespace(adam=False, artifact_alias='latest', batch_size=16, bbox_interval=-1, bucket='', cache_images=False, cfg='', data='data/coco.yaml', device='', entity=None, epochs=300, evolve=False, exist_ok=False, freeze=[0], global_rank=-1, hyp='data/hyp.scratch.p5.yaml', image_weights=False, img_size=[640, 640], label_smoothing=0.0, linear_lr=False, local_rank=-1, multi_scale=False, name='exp', noautoanchor=False, nosave=False, notest=False, project='runs/train', quad=False, rect=False, resume='wandb-artifact://liqteq/yolov7-exp05/3rrt5k5i', save_dir='runs/train/exp4', save_period=-1, single_cls=False, sync_bn=False, total_batch_size=16, upload_dataset=False, v5_metric=False, weights='yolo7.pt', workers=8, world_size=1) tensorboard: Start with 'tensorboard --logdir runs/train', view at http://localhost:6006/ hyperparameters: lr0=0.01, lrf=0.1, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1, box=0.05, cls=0.3, cls_pw=1.0, obj=0.7, obj_pw=1.0, iou_t=0.2, anchor_t=4.0, fl_gamma=0.0, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, degrees=0.0, translate=0.2, scale=0.9, shear=0.0, perspective=0.0, flipud=0.0, fliplr=0.5, mosaic=1.0, mixup=0.15, copy_paste=0.0, paste_in=0.15, loss_ota=1 Traceback (most recent call last): File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_init.py", line 1105, in init wi.setup(kwargs) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_init.py", line 167, in setup self._wl = wandb_setup.setup() File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 307, in setup ret = _setup(settings=settings) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 302, in _setup wl = _WandbSetup(settings=settings) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 288, in init _WandbSetup._instance = _WandbSetupWandbSetup(settings=settings, pid=pid) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 106, in init 
self._setup() File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 234, in _setup self._setup_manager() File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 262, in _setup_manager self._manager = wandb_manager._Manager(settings=self._settings) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_manager.py", line 129, in init svc_iface._svc_connect(port=port) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/service/service_sock.py", line 30, in _svc_connect self._sock_client.connect(port=port) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/lib/sock_client.py", line 102, in connect s.connect(("localhost", port)) ConnectionRefusedError: [Errno 111] Connection refused wandb: ERROR Abnormal program exit Traceback (most recent call last): File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_init.py", line 1105, in init wi.setup(kwargs) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_init.py", line 167, in setup self._wl = wandb_setup.setup() File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 307, in setup ret = _setup(settings=settings) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 302, in _setup wl = _WandbSetup(settings=settings) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 288, in init _WandbSetup._instance = _WandbSetupWandbSetup(settings=settings, pid=pid) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 106, in init self._setup() File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 234, in _setup self._setup_manager() File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_setup.py", line 262, in _setup_manager self._manager = wandb_manager._Manager(settings=self._settings) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_manager.py", line 129, in init svc_iface._svc_connect(port=port) File 
"/usr/local/lib/python3.8/dist-packages/wandb/sdk/service/service_sock.py", line 30, in _svc_connect self._sock_client.connect(port=port) File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/lib/sock_client.py", line 102, in connect s.connect(("localhost", port)) ConnectionRefusedError: [Errno 111] Connection refused
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "train.py", line 616, in <module>
train(hyp, opt, device, tb_writer)
File "train.py", line 72, in train
wandb_logger = WandbLogger(opt, Path(opt.save_dir).stem, run_id, data_dict)
File "/content/yolov7/utils/wandb_logging/wandb_utils.py", line 92, in __init__
self.wandb_run = wandb.init(id=run_id, project=project, resume='allow')
File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_init.py", line 1145, in init
raise Exception("problem") from error_seen
Exception: problem