Open chaokunyang opened 2 years ago
The exception in the main pool is:
2022-03-01 14:02:56,853 ERROR execution.py:132 -- Failed to run subtask qne7KJAZozYOBtG8lPx1gqyr on band numa-0
Traceback (most recent call last):
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/worker/execution.py", line 68, in _retry_run
return await target_async_func(*args)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/worker/execution.py", line 384, in _run_subtask_once
return await asyncio.shield(aiotask)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/debug.py", line 232, in task_with_ex_logged
return await coro
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/subtask/api.py", line 69, in run_subtask_in_slot
subtask
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 186, in send
return self._process_result_message(result)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 70, in _process_result_message
raise message.as_instanceof_cause()
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 590, in send
result = await self._run_coro(message.message_id, coro)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 343, in _run_coro
return await coro
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/api.py", line 115, in __on_receive__
return await super().__on_receive__(message)
File "mars/oscar/core.pyx", line 371, in __on_receive__
raise ex
File "mars/oscar/core.pyx", line 365, in mars.oscar.core._BaseActor.__on_receive__
return await self._handle_actor_result(result)
File "mars/oscar/core.pyx", line 250, in _handle_actor_result
task_result = await coros[0]
File "mars/oscar/core.pyx", line 293, in mars.oscar.core._BaseActor._run_actor_async_generator
async with self._lock:
File "mars/oscar/core.pyx", line 294, in mars.oscar.core._BaseActor._run_actor_async_generator
with debug_async_timeout('actor_lock_timeout',
File "mars/oscar/core.pyx", line 299, in mars.oscar.core._BaseActor._run_actor_async_generator
res = await gen.athrow(*res)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/subtask/worker/runner.py", line 110, in run_subtask
result = yield self._running_processor.run(subtask)
File "mars/oscar/core.pyx", line 304, in mars.oscar.core._BaseActor._run_actor_async_generator
res = await self._handle_actor_result(res)
File "mars/oscar/core.pyx", line 224, in _handle_actor_result
result = await result
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 186, in send
return self._process_result_message(result)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 70, in _process_result_message
raise message.as_instanceof_cause()
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 590, in send
result = await self._run_coro(message.message_id, coro)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 343, in _run_coro
return await coro
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/api.py", line 115, in __on_receive__
return await super().__on_receive__(message)
File "mars/oscar/core.pyx", line 371, in __on_receive__
raise ex
File "mars/oscar/core.pyx", line 365, in mars.oscar.core._BaseActor.__on_receive__
return await self._handle_actor_result(result)
File "mars/oscar/core.pyx", line 250, in _handle_actor_result
task_result = await coros[0]
File "mars/oscar/core.pyx", line 293, in mars.oscar.core._BaseActor._run_actor_async_generator
async with self._lock:
File "mars/oscar/core.pyx", line 294, in mars.oscar.core._BaseActor._run_actor_async_generator
with debug_async_timeout('actor_lock_timeout',
File "mars/oscar/core.pyx", line 299, in mars.oscar.core._BaseActor._run_actor_async_generator
res = await gen.athrow(*res)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/subtask/worker/processor.py", line 622, in run
result = yield self._running_aio_task
File "mars/oscar/core.pyx", line 304, in mars.oscar.core._BaseActor._run_actor_async_generator
res = await self._handle_actor_result(res)
File "mars/oscar/core.pyx", line 224, in _handle_actor_result
result = await result
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/debug.py", line 232, in task_with_ex_logged
return await coro
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/subtask/worker/processor.py", line 466, in run
input_keys = await self._load_input_data()
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/subtask/worker/processor.py", line 145, in _load_input_data
inputs = await self._storage_api.get.batch(*gets)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/batch.py", line 144, in _async_batch
return await self.batch_func(args_list, kwargs_list)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/storage/api/oscar.py", line 101, in batch_get
return await self._storage_handler_ref.get.batch(*gets)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 186, in send
return self._process_result_message(result)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/context.py", line 70, in _process_result_message
raise message.as_instanceof_cause()
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 590, in send
result = await self._run_coro(message.message_id, coro)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/backends/pool.py", line 343, in _run_coro
return await coro
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/oscar/api.py", line 115, in __on_receive__
return await super().__on_receive__(message)
File "mars/oscar/core.pyx", line 371, in __on_receive__
raise ex
File "mars/oscar/core.pyx", line 365, in mars.oscar.core._BaseActor.__on_receive__
return await self._handle_actor_result(result)
File "mars/oscar/core.pyx", line 250, in _handle_actor_result
task_result = await coros[0]
File "mars/oscar/core.pyx", line 293, in mars.oscar.core._BaseActor._run_actor_async_generator
async with self._lock:
File "mars/oscar/core.pyx", line 294, in mars.oscar.core._BaseActor._run_actor_async_generator
with debug_async_timeout('actor_lock_timeout',
File "mars/oscar/core.pyx", line 299, in mars.oscar.core._BaseActor._run_actor_async_generator
res = await gen.athrow(*res)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/storage/handler.py", line 149, in batch_get
result = yield self._get_data(data_info, conditions)
File "mars/oscar/core.pyx", line 304, in mars.oscar.core._BaseActor._run_actor_async_generator
res = await self._handle_actor_result(res)
File "mars/oscar/core.pyx", line 250, in _handle_actor_result
task_result = await coros[0]
File "mars/oscar/core.pyx", line 293, in mars.oscar.core._BaseActor._run_actor_async_generator
async with self._lock:
File "mars/oscar/core.pyx", line 294, in mars.oscar.core._BaseActor._run_actor_async_generator
with debug_async_timeout('actor_lock_timeout',
File "mars/oscar/core.pyx", line 299, in mars.oscar.core._BaseActor._run_actor_async_generator
res = await gen.athrow(*res)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/storage/handler.py", line 88, in _get_data
res = yield self._clients[data_info.level].get(data_info.object_id)
File "mars/oscar/core.pyx", line 304, in mars.oscar.core._BaseActor._run_actor_async_generator
res = await self._handle_actor_result(res)
File "mars/oscar/core.pyx", line 224, in _handle_actor_result
result = await result
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/storage/ray.py", line 210, in get
return await object_id
types._MarsError: <unprintable _MarsError object>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/worker/execution.py", line 341, in internal_run_subtask
subtask, band_name, subtask_api, batch_quota_req
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/worker/execution.py", line 440, in _retry_run_subtask
return await _retry_run(subtask, subtask_info, _run_subtask_once)
File "/home/admin/ray-pack/tmp/job/8d010080/pyenv/lib/python3.7/site-packages/mars/services/scheduling/worker/execution.py", line 104, in _retry_run
f"Failed to rerun the {target_async_func} of subtask {subtask.subtask_id}, "
File "/home/admin/.local/lib/python3.7/site-packages/ray/exceptions.py", line 209, in __str__
return (f"Object {self.object_ref.hex()} is lost due to node failure.")
AttributeError: 'str' object has no attribute 'hex'
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
To help us reproduce this bug, please provide the information below: