openpsi-project / ReaLHF

Super-Efficient RLHF Training of LLMs with Parameter Reallocation
Apache License 2.0
82 stars 4 forks source link

worker ERROR: Worker encountered error 'id' #43

Closed AIRobotZhang closed 1 month ago

AIRobotZhang commented 1 month ago

20240722-23:36:21.266 worker ERROR: Worker encountered error 'id' Traceback (most recent call last): File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in <listcomp> data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, **cfg.args) File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in __init__ self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in <listcomp> self.ids = [x["id"] for x in data] KeyError: 'id' 20240722-23:36:21.267 Model Worker INFO: Model worker 7 exit with status ERROR. 20240722-23:36:21.267 system ERROR: Worker model_worker/7 failed with exception: 'id' 20240722-23:36:21.267 system ERROR: Traceback (most recent call last): File "/root/ReaLHF/realhf/system/__init__.py", line 54, in run_worker worker.run() File "/root/ReaLHF/realhf/system/worker_base.py", line 660, in run raise e File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in <listcomp> data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, **cfg.args) File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in __init__ self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in <listcomp> self.ids = [x["id"] for x in data] KeyError: 'id'

Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/root/ReaLHF/realhf/apps/remote.py", line 242, in main() File "/root/ReaLHF/realhf/apps/remote.py", line 238, in main args.func(args) File "/root/ReaLHF/realhf/apps/remote.py", line 92, in main_worker realhf.system.run_worker( File "/root/ReaLHF/realhf/system/init.py", line 58, in run_worker raise e File "/root/ReaLHF/realhf/system/init.py", line 54, in run_worker worker.run() File "/root/ReaLHF/realhf/system/worker_base.py", line 660, in run raise e File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, cfg.args) File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in init self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in self.ids = [x["id"] for x in data] KeyError: 'id' 20240722-23:36:21.269 worker ERROR: Worker encountered error 'id' Traceback (most recent call last): File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, cfg.args) File 
"/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in init self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in self.ids = [x["id"] for x in data] KeyError: 'id' 20240722-23:36:21.270 Model Worker INFO: Model worker 5 exit with status ERROR. 20240722-23:36:21.270 system ERROR: Worker model_worker/5 failed with exception: 'id' 20240722-23:36:21.270 system ERROR: Traceback (most recent call last): File "/root/ReaLHF/realhf/system/init.py", line 54, in run_worker worker.run() File "/root/ReaLHF/realhf/system/worker_base.py", line 660, in run raise e File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, **cfg.args) File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in init self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in self.ids = [x["id"] for x in data] KeyError: 'id'

Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/root/ReaLHF/realhf/apps/remote.py", line 242, in main() File "/root/ReaLHF/realhf/apps/remote.py", line 238, in main args.func(args) File "/root/ReaLHF/realhf/apps/remote.py", line 92, in main_worker realhf.system.run_worker( File "/root/ReaLHF/realhf/system/init.py", line 58, in run_worker raise e File "/root/ReaLHF/realhf/system/init.py", line 54, in run_worker worker.run() File "/root/ReaLHF/realhf/system/worker_base.py", line 660, in run raise e File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, cfg.args) File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in init self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in self.ids = [x["id"] for x in data] KeyError: 'id' 20240722-23:36:21.279 worker ERROR: Worker encountered error 'id' Traceback (most recent call last): File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, cfg.args) File 
"/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in init self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in self.ids = [x["id"] for x in data] KeyError: 'id' 20240722-23:36:21.280 Model Worker INFO: Model worker 6 exit with status ERROR. 20240722-23:36:21.280 system ERROR: Worker model_worker/6 failed with exception: 'id' 20240722-23:36:21.280 system ERROR: Traceback (most recent call last): File "/root/ReaLHF/realhf/system/init.py", line 54, in run_worker worker.run() File "/root/ReaLHF/realhf/system/worker_base.py", line 660, in run raise e File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, **cfg.args) File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in init self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in self.ids = [x["id"] for x in data] KeyError: 'id'

Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/root/ReaLHF/realhf/apps/remote.py", line 242, in main() File "/root/ReaLHF/realhf/apps/remote.py", line 238, in main args.func(args) File "/root/ReaLHF/realhf/apps/remote.py", line 92, in main_worker realhf.system.run_worker( File "/root/ReaLHF/realhf/system/init.py", line 58, in run_worker raise e File "/root/ReaLHF/realhf/system/init.py", line 54, in run_worker worker.run() File "/root/ReaLHF/realhf/system/worker_base.py", line 660, in run raise e File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, cfg.args) File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in init self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in self.ids = [x["id"] for x in data] KeyError: 'id' 20240722-23:36:21.285 worker ERROR: Worker encountered error 'id' Traceback (most recent call last): File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, cfg.args) File 
"/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in init self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in self.ids = [x["id"] for x in data] KeyError: 'id' 20240722-23:36:21.286 Model Worker INFO: Model worker 4 exit with status ERROR. 20240722-23:36:21.286 system ERROR: Worker model_worker/4 failed with exception: 'id' 20240722-23:36:21.287 system ERROR: Traceback (most recent call last): File "/root/ReaLHF/realhf/system/init.py", line 54, in run_worker worker.run() File "/root/ReaLHF/realhf/system/worker_base.py", line 660, in run raise e File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, **cfg.args) File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in init self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in self.ids = [x["id"] for x in data] KeyError: 'id'

Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/root/ReaLHF/realhf/apps/remote.py", line 242, in <module> main() File "/root/ReaLHF/realhf/apps/remote.py", line 238, in main args.func(args) File "/root/ReaLHF/realhf/apps/remote.py", line 92, in main_worker realhf.system.run_worker( File "/root/ReaLHF/realhf/system/__init__.py", line 58, in run_worker raise e File "/root/ReaLHF/realhf/system/__init__.py", line 54, in run_worker worker.run() File "/root/ReaLHF/realhf/system/worker_base.py", line 660, in run raise e File "/root/ReaLHF/realhf/system/worker_base.py", line 631, in run r = self._poll() File "/root/ReaLHF/realhf/system/model_worker.py", line 798, in _poll self.lazy_setup() File "/root/ReaLHF/realhf/system/model_worker.py", line 252, in lazy_setup datasets = [ File "/root/ReaLHF/realhf/system/model_worker.py", line 253, in <listcomp> data_api.make_dataset( File "/root/ReaLHF/realhf/api/core/data_api.py", line 680, in make_dataset return dataset_cls(util=util, **cfg.args) File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in __init__ self.ids = [x["id"] for x in data] File "/root/ReaLHF/realhf/impl/dataset/prompt_dataset.py", line 38, in <listcomp> self.ids = [x["id"] for x in data] KeyError: 'id'

AIRobotZhang commented 1 month ago

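Cause: every record in PPO-prompt.jsonl needs an "id" key. prompt_dataset.py (line 38) reads x["id"] from each record, so a record without that key raises the KeyError: 'id' shown above.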