Netflix / metaflow-service

:rocket: Metadata tracking and UI service for Metaflow!
http://www.metaflow.org
Apache License 2.0
193 stars 71 forks source link

Metaflow UI log ERROR #374

Open LennieGuy opened 1 year ago

LennieGuy commented 1 year ago

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/root/services/ui_backend_service/data/cache/client/cache_server.py", line 307, in <module>
    cli(auto_envvar_prefix='MFCACHE')
  File "/usr/local/lib/python3.11/site-packages/click/core.py", line 1128, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.11/site-packages/click/core.py", line 1053, in main
    rv = self.invoke(ctx)
  File "/usr/local/lib/python3.11/site-packages/click/core.py", line 1395, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/usr/local/lib/python3.11/site-packages/click/core.py", line 754, in invoke
    return __callback(*args, **kwargs)
  File "/root/services/ui_backend_service/data/cache/client/cache_server.py", line 301, in cli
    Scheduler(store, max_actions).loop()
  File "/root/services/ui_backend_service/data/cache/client/cache_server.py", line 196, in __init__
    self.pool = multiprocessing.Pool(
  File "/usr/local/lib/python3.11/multiprocessing/context.py", line 119, in Pool
    return Pool(processes, initializer, initargs, maxtasksperchild,
  File "/usr/local/lib/python3.11/multiprocessing/pool.py", line 215, in __init__
    self._repopulate_pool()
  File "/usr/local/lib/python3.11/multiprocessing/pool.py", line 306, in _repopulate_pool
    return self._repopulate_pool_static(self._ctx, self.Process,
  File "/usr/local/lib/python3.11/multiprocessing/pool.py", line 329, in _repopulate_pool_static
    w.start()
  File "/usr/local/lib/python3.11/multiprocessing/process.py", line 121, in start
    self._popen = self._Popen(self)
  File "/usr/local/lib/python3.11/multiprocessing/context.py", line 281, in _Popen
    return Popen(process_obj)
  File "/usr/local/lib/python3.11/multiprocessing/popen_fork.py", line 19, in __init__
    self._launch(process_obj)
  File "/usr/local/lib/python3.11/multiprocessing/popen_fork.py", line 71, in _launch
    code = process_obj._bootstrap(parent_sentinel=child_r)
  File "/usr/local/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/root/services/ui_backend_service/data/cache/client/cache_worker.py", line 29, in execute_action
    execute(tempdir, action_cls, request)
  File "/root/services/ui_backend_service/data/cache/client/cache_worker.py", line 51, in execute
    res = action_cls.execute(
  File "/root/services/ui_backend_service/data/cache/get_log_file_action.py", line 133, in execute
    with streamed_errors(stream_output):
  File "/usr/local/lib/python3.11/contextlib.py", line 155, in __exit__
    self.gen.throw(typ, value, traceback)
  File "/root/services/ui_backend_service/data/cache/utils.py", line 130, in streamed_errors
    get_traceback_str()
  File "/root/services/ui_backend_service/data/cache/utils.py", line 124, in streamed_errors
    yield
  File "/root/services/ui_backend_service/data/cache/get_log_file_action.py", line 136, in execute
    current_hash = log_provider.get_log_hash(task, logtype)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/services/ui_backend_service/data/cache/get_log_file_action.py", line 270, in get_log_hash
    return get_log_size(task, logtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/services/ui_backend_service/data/cache/get_log_file_action.py", line 177, in get_log_size
    return task.stderr_size if logtype == STDERR else task.stdout_size
                                                      ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/metaflow/client/core.py", line 1317, in stdout_size
    return self._get_logsize("stdout")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/metaflow/client/core.py", line 1433, in _get_logsize
    meta_dict = self.metadata_dict
                ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/metaflow/client/core.py", line 1135, in metadata_dict
    m.name: m.value for m in sorted(self.metadata, key=lambda m: m.created_at)
                                    ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/metaflow/client/core.py", line 1059, in metadata
    all_metadata = self._metaflow.metadata.get_object(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/metaflow/metadata/metadata.py", line 425, in get_object
    pre_filter = cls._get_object_internal(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/metaflow/plugins/metadata/service.py", line 280, in _get_object_internal
    v, _ = cls._request(None, url, "GET")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/metaflow/plugins/metadata/service.py", line 468, in _request
    raise ServiceException(

metaflow.plugins.metadata.service.ServiceException: Metadata request (/flows/ParquetCheck/runs/argo-parquetcheck.user.zhangxinyu19.parquetcheck-g8vjm/steps/start/tasks/t-2aa87376/metadata) failed (code 500): "{\"err_msg\": {\"type\": \"timeout error\"}}"

LennieGuy commented 1 year ago

@jfernandez need your help

image
savingoyal commented 1 year ago

@LennieGuy what's the output of the following snippet:

from metaflow import namespace, Task

namespace(None)
Task('ParquetCheck/argo-parquetcheck.user.zhangxinyu19.parquetcheck-g8vjm/start/t-2aa87376').metadata_dict
LennieGuy commented 1 year ago

image

LennieGuy commented 1 year ago

@savingoyal Running it this way reports the same error. Is this a database problem? I checked and found another two million records in the database.