Closed Bergylta closed 8 months ago
More on this issue. Input:
# @title <font size="5">↓ ឵឵<i>Add the data to Weights and Biases</font> { vertical-output: true }
mlp.save_detections_wandb(conf_thres.value, model.value, eval_dir.selected)
Output:
PermissionError Traceback (most recent call last)
Cell In[22], line 2
1 # @title <font size="5">↓ ឵឵<i>Add the data to Weights and Biases</font> { vertical-output: true }
----> 2 mlp.save_detections_wandb(conf_thres.value, model.value, eval_dir.selected)
File /usr/src/app/kso-dev/kso_utils/project.py:1338, in MLProjectProcessor.save_detections_wandb(self, conf_thres, model, eval_dir)
1336 def save_detections_wandb(self, conf_thres: float, model: str, eval_dir: str):
1337 self.modules["yolo_utils"].set_config(conf_thres, model, eval_dir)
-> 1338 self.modules["yolo_utils"].add_data_wandb(
1339 eval_dir, "detection_output", self.run
1340 )
1341 self.csv_report = self.modules["yolo_utils"].generate_csv_report(
1342 eval_dir, self.run, wandb_log=True
1343 )
File /usr/src/app/kso-dev/kso_utils/yolo_utils.py:1042, in add_data_wandb(path, name, run)
1031 """
1032 > The function `add_data_wandb` takes a path to a directory, a name for the directory, and a run
1033 object, and adds the directory to the run as an artifact
(...)
1039 :param run: The run object that you get from calling wandb.init()
1040 """
1041 my_data = wandb.Artifact(name, type="raw_data")
-> 1042 my_data.add_dir(path)
1043 run.log_artifact(my_data)
File /usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_artifacts.py:423, in Artifact.add_dir(self, local_path, name)
421 num_threads = 8
422 pool = multiprocessing.dummy.Pool(num_threads)
--> 423 pool.map(add_manifest_file, paths)
424 pool.close()
425 pool.join()
File /usr/lib/python3.8/multiprocessing/pool.py:364, in Pool.map(self, func, iterable, chunksize)
359 def map(self, func, iterable, chunksize=None):
360 '''
361 Apply `func` to each element in `iterable`, collecting the results
362 in a list that is returned.
363 '''
--> 364 return self._map_async(func, iterable, mapstar, chunksize).get()
File /usr/lib/python3.8/multiprocessing/pool.py:771, in ApplyResult.get(self, timeout)
769 return self._value
770 else:
--> 771 raise self._value
File /usr/lib/python3.8/multiprocessing/pool.py:125, in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
123 job, i, func, args, kwds = task
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
127 if wrap_exception and func is not _helper_reraises_exception:
File /usr/lib/python3.8/multiprocessing/pool.py:48, in mapstar(args)
47 def mapstar(args):
---> 48 return list(map(*args))
File /usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_artifacts.py:417, in Artifact.add_dir.<locals>.add_manifest_file(log_phy_path)
415 def add_manifest_file(log_phy_path: Tuple[str, str]) -> None:
416 logical_path, physical_path = log_phy_path
--> 417 self._add_local_file(logical_path, physical_path)
File /usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_artifacts.py:704, in Artifact._add_local_file(self, name, path, digest)
702 cache_path, hit, cache_open = self._cache.check_md5_obj_path(digest, size)
703 if not hit:
--> 704 with cache_open() as f:
705 shutil.copyfile(path, f.name)
707 entry = ArtifactManifestEntry(
708 name,
709 None,
(...)
712 local_path=cache_path,
713 )
File /usr/lib/python3.8/contextlib.py:113, in _GeneratorContextManager.__enter__(self)
111 del self.args, self.kwds, self.func
112 try:
--> 113 return next(self.gen)
114 except StopIteration:
115 raise RuntimeError("generator didn't yield") from None
File /usr/local/lib/python3.8/dist-packages/wandb/sdk/interface/artifacts.py:957, in ArtifactsCache._cache_opener.<locals>.helper(mode)
948 dirname = os.path.dirname(path)
949 tmp_file = os.path.join(
950 dirname,
951 "%s_%s"
(...)
955 ),
956 )
--> 957 with util.fsync_open(tmp_file, mode=mode) as f:
958 yield f
960 try:
961 # Use replace where we can, as it implements an atomic
962 # move on most platforms. If it doesn't exist, we have
(...)
975 # it is critical that the temporary files sit directly in the cache --
976 # they need to be on the same filesystem!
File /usr/lib/python3.8/contextlib.py:113, in _GeneratorContextManager.__enter__(self)
111 del self.args, self.kwds, self.func
112 try:
--> 113 return next(self.gen)
114 except StopIteration:
115 raise RuntimeError("generator didn't yield") from None
File /usr/local/lib/python3.8/dist-packages/wandb/util.py:1486, in fsync_open(path, mode, encoding)
1478 @contextlib.contextmanager
1479 def fsync_open(
1480 path: Union[pathlib.Path, str], mode: str = "w", encoding: Optional[str] = None
1481 ) -> Generator[IO[Any], None, None]:
1482 """
1483 Opens a path for I/O, guaranteeing that the file is flushed and
1484 fsynced when the file's context expires.
1485 """
-> 1486 with open(path, mode, encoding=encoding) as f:
1487 yield f
1489 f.flush()
PermissionError: [Errno 13] Permission denied: '/mimer/NOBACKUP/groups/snic2021-6-9/artifacts/obj/md5/99/tmp_63B0DF12'
Ignore the errors above; that was me being a dingus and not choosing the output folder.
Here's an update on the issue, with another error. Input:
# @title <font size="5">↓ ឵឵<i>Add the data to Weights and Biases</font> { vertical-output: true }
mlp.save_detections_wandb(conf_thres.value, model.value, eval_dir.selected)
Output:
wandb: Adding directory to artifact (/mimer/NOBACKUP/groups/snic2021-6-9/tmp_dir/KSO_hardbottom_other_test/detect)... Done. 20.9s
INFO:root:Report created at /mimer/NOBACKUP/groups/snic2021-6-9/tmp_dir/KSO_hardbottom_other_test/detect/annotations.csv
Finishing last run (ID:1psvaz8q) before initializing another...
Waiting for W&B process to finish... (success).
Synced devout-serenity-367: https://wandb.ai/koster/model-evaluations/runs/1psvaz8q
Synced 5 W&B file(s), 0 media file(s), 14886 artifact file(s) and 0 other file(s)
Find logs at: /mimer/NOBACKUP/groups/snic2021-6-9/wandb/run-20231027_110938-1psvaz8q/logs
Successfully finished last run (ID:1psvaz8q). Initializing new run:
wandb: ERROR resume='must' but run (1psvaz8q) doesn't exist
Problem at: /usr/src/app/kso-dev/kso_utils/yolo_utils.py 1077 generate_csv_report
---------------------------------------------------------------------------
UsageError Traceback (most recent call last)
Cell In[15], line 2
1 # @title <font size="5">↓ ឵឵<i>Add the data to Weights and Biases</font> { vertical-output: true }
----> 2 mlp.save_detections_wandb(conf_thres.value, model.value, eval_dir.selected)
File /usr/src/app/kso-dev/kso_utils/project.py:1341, in MLProjectProcessor.save_detections_wandb(self, conf_thres, model, eval_dir)
1337 self.modules["yolo_utils"].set_config(conf_thres, model, eval_dir)
1338 self.modules["yolo_utils"].add_data_wandb(
1339 eval_dir, "detection_output", self.run
1340 )
-> 1341 self.csv_report = self.modules["yolo_utils"].generate_csv_report(
1342 eval_dir, self.run, wandb_log=True
1343 )
File /usr/src/app/kso-dev/kso_utils/yolo_utils.py:1077, in generate_csv_report(evaluation_path, run, wandb_log)
1075 logging.info("Report created at {}".format(csv_out))
1076 if wandb_log:
-> 1077 wandb.init(resume="must", id=run.id)
1078 wandb.log({"predictions": wandb.Table(dataframe=detect_df)})
1079 return detect_df
File /usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_init.py:1043, in init(job_type, dir, config, project, entity, reinit, tags, group, name, notes, magic, config_exclude_keys, config_include_keys, anonymous, mode, allow_val_change, resume, force, tensorboard, sync_tensorboard, monitor_gym, save_code, id, settings)
1041 except_exit = wi.settings._except_exit
1042 try:
-> 1043 run = wi.init()
1044 except_exit = wi.settings._except_exit
1045 except (KeyboardInterrupt, Exception) as e:
File /usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_init.py:691, in _WandbInit.init(self)
689 backend.cleanup()
690 self.teardown()
--> 691 raise UsageError(error_message)
692 assert run_result and run_result.run
693 if run_result.run.resumed:
UsageError: resume='must' but run (1psvaz8q) doesn't exist
@Bergylta Could you try restarting the run completely, i.e. restarting the notebook and rerunning the evaluation? Seems like it is trying to continue a run that did not save properly.
Tried, but the issue remains for now @jannesgg
Solved this issue by removing the need to resume the run, instead breaking it up into separate runs to avoid wandb issues in the future.
🐛 Bug
Seems to be an issue with saving the model evaluation run to WandB.
To Reproduce (REQUIRED)
output location: /mimer/NOBACKUP/groups/snic2021-6-9/tmp_dir/KSO_model_training_christian_anemones/ Input:
Output:
Expected behavior
A clear and concise description of what you expected to happen.
Environment
If applicable, add screenshots to help explain your problem.
Additional context