Closed nuwanq closed 6 months ago
Thank you @nuwanq for reporting this. We have created a fix for it, which you can find in #2751.
@safoinme. Thank you for fixing it.
@safoinme I think the fix has now introduced another bug when using S3 as the ARTIFACT_STORE:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /mnt/ssd/projects/events/src/debug_train.py:101 in <module> │
│ │
│ 98 │
│ 99 │
│ 100 if __name__ == "__main__": │
│ ❱ 101 │ training_pipeline() │
│ 102 │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/new/pipelines/pipeline.py:1397 in │
│ __call__ │
│ │
│ 1394 │ │ │ return self.entrypoint(*args, **kwargs) │
│ 1395 │ │ │
│ 1396 │ │ self.prepare(*args, **kwargs) │
│ ❱ 1397 │ │ return self._run(**self._run_args) │
│ 1398 │ │
│ 1399 │ def _call_entrypoint(self, *args: Any, **kwargs: Any) -> None: │
│ 1400 │ │ """Calls the pipeline entrypoint function with the given arguments. │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/new/pipelines/pipeline.py:758 in _run │
│ │
│ 755 │ │ │ │ │ │ "`zenml up`." │
│ 756 │ │ │ │ │ ) │
│ 757 │ │ │ │
│ ❱ 758 │ │ │ deploy_pipeline( │
│ 759 │ │ │ │ deployment=deployment_model, stack=stack, placeholder_run=run │
│ 760 │ │ │ ) │
│ 761 │ │ │ if run: │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/new/pipelines/run_utils.py:148 in │
│ deploy_pipeline │
│ │
│ 145 │ │ │ # placeholder run to stay in the database │
│ 146 │ │ │ Client().delete_pipeline_run(placeholder_run.id) │
│ 147 │ │ │
│ ❱ 148 │ │ raise e │
│ 149 │ finally: │
│ 150 │ │ constants.SHOULD_PREVENT_PIPELINE_EXECUTION = previous_value │
│ 151 │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/new/pipelines/run_utils.py:136 in │
│ deploy_pipeline │
│ │
│ 133 │ previous_value = constants.SHOULD_PREVENT_PIPELINE_EXECUTION │
│ 134 │ constants.SHOULD_PREVENT_PIPELINE_EXECUTION = True │
│ 135 │ try: │
│ ❱ 136 │ │ stack.deploy_pipeline(deployment=deployment) │
│ 137 │ except Exception as e: │
│ 138 │ │ if ( │
│ 139 │ │ │ placeholder_run │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/stack/stack.py:853 in deploy_pipeline │
│ │
│ 850 │ │ Returns: │
│ 851 │ │ │ The return value of the call to `orchestrator.run_pipeline(...)`. │
│ 852 │ │ """ │
│ ❱ 853 │ │ return self.orchestrator.run(deployment=deployment, stack=self) │
│ 854 │ │
│ 855 │ def _get_active_components_for_step( │
│ 856 │ │ self, step_config: "StepConfiguration" │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/orchestrators/base_orchestrator.py:175 │
│ in run │
│ │
│ 172 │ │ environment = get_config_environment_vars(deployment=deployment) │
│ 173 │ │ │
│ 174 │ │ try: │
│ ❱ 175 │ │ │ result = self.prepare_or_run_pipeline( │
│ 176 │ │ │ │ deployment=deployment, stack=stack, environment=environment │
│ 177 │ │ │ ) │
│ 178 │ │ finally: │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/orchestrators/local/local_orchestrator. │
│ py:78 in prepare_or_run_pipeline │
│ │
│ 75 │ │ │ │ │ step_name, │
│ 76 │ │ │ │ ) │
│ 77 │ │ │ │
│ ❱ 78 │ │ │ self.run_step( │
│ 79 │ │ │ │ step=step, │
│ 80 │ │ │ ) │
│ 81 │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/orchestrators/base_orchestrator.py:195 │
│ in run_step │
│ │
│ 192 │ │ │ step=step, │
│ 193 │ │ │ orchestrator_run_id=self.get_orchestrator_run_id(), │
│ 194 │ │ ) │
│ ❱ 195 │ │ launcher.launch() │
│ 196 │ │
│ 197 │ @staticmethod │
│ 198 │ def requires_resources_in_orchestration_environment( │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/orchestrators/step_launcher.py:250 in │
│ launch │
│ │
│ 247 │ │ │ │ │ while retries < max_retries: │
│ 248 │ │ │ │ │ │ last_retry = retries == max_retries - 1 │
│ 249 │ │ │ │ │ │ try: │
│ ❱ 250 │ │ │ │ │ │ │ self._run_step( │
│ 251 │ │ │ │ │ │ │ │ pipeline_run=pipeline_run, │
│ 252 │ │ │ │ │ │ │ │ step_run=step_run_response, │
│ 253 │ │ │ │ │ │ │ │ last_retry=last_retry, │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/orchestrators/step_launcher.py:451 in │
│ _run_step │
│ │
│ 448 │ │ │ │ │ last_retry=last_retry, │
│ 449 │ │ │ │ ) │
│ 450 │ │ │ else: │
│ ❱ 451 │ │ │ │ self._run_step_without_step_operator( │
│ 452 │ │ │ │ │ pipeline_run=pipeline_run, │
│ 453 │ │ │ │ │ step_run=step_run, │
│ 454 │ │ │ │ │ step_run_info=step_run_info, │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/orchestrators/step_launcher.py:535 in │
│ _run_step_without_step_operator │
│ │
│ 532 │ │ if last_retry: │
│ 533 │ │ │ os.environ[ENV_ZENML_IGNORE_FAILURE_HOOK] = "false" │
│ 534 │ │ runner = StepRunner(step=self._step, stack=self._stack) │
│ ❱ 535 │ │ runner.run( │
│ 536 │ │ │ pipeline_run=pipeline_run, │
│ 537 │ │ │ step_run=step_run, │
│ 538 │ │ │ input_artifacts=input_artifacts, │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/orchestrators/step_runner.py:189 in run │
│ │
│ 186 │ │ │ │ self._prepare_model_context_for_step() │
│ 187 │ │ │ │ │
│ 188 │ │ │ │ # Parse the inputs for the entrypoint function. │
│ ❱ 189 │ │ │ │ function_params = self._parse_inputs( │
│ 190 │ │ │ │ │ args=spec.args, │
│ 191 │ │ │ │ │ annotations=spec.annotations, │
│ 192 │ │ │ │ │ input_artifacts=input_artifacts, │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/orchestrators/step_runner.py:355 in │
│ _parse_inputs │
│ │
│ 352 │ │ │ │ ) │
│ 353 │ │ │ │ function_params[arg] = get_step_context() │
│ 354 │ │ │ elif arg in input_artifacts: │
│ ❱ 355 │ │ │ │ function_params[arg] = self._load_input_artifact( │
│ 356 │ │ │ │ │ input_artifacts[arg], arg_type │
│ 357 │ │ │ │ ) │
│ 358 │ │ │ elif arg in self.configuration.parameters: │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/orchestrators/step_runner.py:458 in │
│ _load_input_artifact │
│ │
│ 455 │ │ ) │
│ 456 │ │ materializer: BaseMaterializer = materializer_class(artifact.uri) │
│ 457 │ │ materializer.validate_type_compatibility(data_type) │
│ ❱ 458 │ │ return materializer.load(data_type=data_type) │
│ 459 │ │
│ 460 │ def _validate_outputs( │
│ 461 │ │ self, │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/zenml/integrations/huggingface/materializers/ │
│ huggingface_tokenizer_materializer.py:58 in load │
│ │
│ 55 │ │ │
│ 56 │ │ print(os.path.join(self.uri, DEFAULT_TOKENIZER_DIR)) │
│ 57 │ │ │
│ ❱ 58 │ │ return AutoTokenizer.from_pretrained( │
│ 59 │ │ │ os.path.join(self.uri, DEFAULT_TOKENIZER_DIR), │
│ 60 │ │ ) │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py │
│ :652 in from_pretrained │
│ │
│ 649 │ │ │ return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *input │
│ 650 │ │ │
│ 651 │ │ # Next, let's try to use the tokenizer_config file to get the tokenizer class. │
│ ❱ 652 │ │ tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) │
│ 653 │ │ if "_commit_hash" in tokenizer_config: │
│ 654 │ │ │ kwargs["_commit_hash"] = tokenizer_config["_commit_hash"] │
│ 655 │ │ config_tokenizer_class = tokenizer_config.get("tokenizer_class") │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py │
│ :496 in get_tokenizer_config │
│ │
│ 493 │ tokenizer_config = get_tokenizer_config("tokenizer-test") │
│ 494 │ ```""" │
│ 495 │ commit_hash = kwargs.get("_commit_hash", None) │
│ ❱ 496 │ resolved_config_file = cached_file( │
│ 497 │ │ pretrained_model_name_or_path, │
│ 498 │ │ TOKENIZER_CONFIG_FILE, │
│ 499 │ │ cache_dir=cache_dir, │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/transformers/utils/hub.py:417 in cached_file │
│ │
│ 414 │ user_agent = http_user_agent(user_agent) │
│ 415 │ try: │
│ 416 │ │ # Load from URL or cache if already cached │
│ ❱ 417 │ │ resolved_file = hf_hub_download( │
│ 418 │ │ │ path_or_repo_id, │
│ 419 │ │ │ filename, │
│ 420 │ │ │ subfolder=None if len(subfolder) == 0 else subfolder, │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:106 in │
│ _inner_fn │
│ │
│ 103 │ │ │ kwargs.items(), # Kwargs values │
│ 104 │ │ ): │
│ 105 │ │ │ if arg_name in ["repo_id", "from_id", "to_id"]: │
│ ❱ 106 │ │ │ │ validate_repo_id(arg_value) │
│ 107 │ │ │ │
│ 108 │ │ │ elif arg_name == "token" and arg_value is not None: │
│ 109 │ │ │ │ has_token = True │
│ │
│ /mnt/ssd/mamba/envs/x/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:154 in │
│ validate_repo_id │
│ │
│ 151 │ │ raise HFValidationError(f"Repo id must be a string, not {type(repo_id)}: '{repo_ │
│ 152 │ │
│ 153 │ if repo_id.count("/") > 1: │
│ ❱ 154 │ │ raise HFValidationError( │
│ 155 │ │ │ "Repo id must be in the form 'repo_name' or 'namespace/repo_name':" │
│ 156 │ │ │ f" '{repo_id}'. Use `repo_type` argument if needed." │
│ 157 │ │ ) │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name':
's3://xxx-xxx-xxx/tokenizer_loader/tokenizer/xxac9ce6-xxxx-xxxx-xxxx-00xxxxx/xxxxxx/hf_tokenizer'. Use `repo_type` argument if needed.
@nuwanq I have just taken another look and tested this with GCP and S3. The implementation was in fact causing an error, because Hugging Face cannot access the artifact store. The code has been updated to fix this issue.
@safoinme , Thanks for the quick work.
System Information
ZENML_LOCAL_VERSION: 0.57.1 ZENML_SERVER_VERSION: 0.57.1 ZENML_SERVER_DATABASE: sqlite ZENML_SERVER_DEPLOYMENT_TYPE: other ZENML_CONFIG_DIR: /root/.config/zenml ZENML_LOCAL_STORE_DIR: /root/.config/zenml/local_stores ZENML_SERVER_URL: sqlite:////root/.config/zenml/local_stores/default_zen_store/zenml.db ZENML_ACTIVE_REPOSITORY_ROOT: None PYTHON_VERSION: 3.10.0 ENVIRONMENT: native SYSTEM_INFO: {'os': 'linux', 'linux_distro': 'ubuntu', 'linux_distro_like': 'debian', 'linux_distro_version': '22.04'} ACTIVE_WORKSPACE: default ACTIVE_STACK: default ACTIVE_USER: default TELEMETRY_STATUS: enabled ANALYTICS_CLIENT_ID: c78fc8ea-738d-4d37-8ba9-aec7ebe46f5c ANALYTICS_USER_ID: 55e2f3f9-af52-4126-82a6-1c083fcda7c5 ANALYTICS_SERVER_ID: c78fc8ea-738d-4d37-8ba9-aec7ebe46f5c INTEGRATIONS: ['aws', 'bentoml', 'bitbucket', 'huggingface', 'kaniko', 'mlflow', 'pillow', 'pytorch', 's3', 'scipy', 'sklearn', 'slack'] PACKAGES: {'argon2-cffi': '21.3.0', 'argon2-cffi-bindings': '21.2.0', 'awscli': '1.32.3', 'bamboolib': '1.30.16', 'defusedxml': '0.7.1', 'docutils': '0.16', 'ipyslickgrid': '0.0.3', 'ipython-genutils': '0.2.0', 'jupyterlab-pygments': '0.2.2', 'mypy': '1.3.0', 'mypy-extensions': '1.0.0', 'pandocfilters': '1.5.0', 'plotly': '5.10.0', 'ppscore': '1.2.0', 'rsa': '4.7.2', 'xlrd': '2.0.1', 'brotli': '1.1.0', 'deprecated': '1.2.14', 'gitpython': '3.1.43', 'mako': '1.3.5', 'markdown': '3.6', 'markupsafe': '2.1.5', 'pyjwt': '2.7.0', 'pymysql': '1.0.3', 'pyyaml': '6.0.1', 'sqlalchemy': '1.4.41', 'sqlalchemy-utils': '0.38.3', 'absl-py': '2.1.0', 'accelerate': '0.30.1', 'aiobotocore': '2.7.0', 'aiofiles': '23.2.1', 'aiohttp': '3.9.5', 'aiohttp-cors': '0.7.0', 'aioitertools': '0.11.0', 'aiokafka': '0.10.0', 'aiosignal': '1.3.1', 'alembic': '1.8.1', 'altair': '5.3.0', 'aniso8601': '9.0.1', 'annotated-types': '0.7.0', 'anyio': '4.3.0', 'appdirs': '1.4.4', 'argparse': '1.4.0', 'asgiref': '3.8.1', 'asttokens': '2.4.1', 'async-timeout': '4.0.3', 'attrs': '22.2.0', 'aws-profile-manager': 
'0.7.3', 'azure-common': '1.1.28', 'azure-core': '1.30.1', 'azure-mgmt-core': '1.4.0', 'azure-mgmt-resource': '23.1.1', 'bcrypt': '4.0.1', 'bentoml': '1.2.16', 'bert-score': '0.3.13', 'blinker': '1.8.2', 'boto3': '1.28.64', 'botocore': '1.31.64', 'build': '1.2.1', 'cachetools': '5.3.3', 'cattrs': '23.1.2', 'certifi': '2024.2.2', 'cffi': '1.16.0', 'charset-normalizer': '3.3.2', 'circus': '0.18.0', 'click': '8.1.3', 'click-option-group': '0.5.6', 'click-params': '0.3.0', 'cloudpickle': '2.2.1', 'colorama': '0.4.6', 'coloredlogs': '15.0.1', 'colorful': '0.5.6', 'comm': '0.2.2', 'configparser': '7.0.0', 'contourpy': '1.2.1', 'cryptography': '42.0.7', 'cycler': '0.12.1', 'dataclasses-json': '0.6.6', 'datasets': '2.19.1', 'debugpy': '1.8.1', 'decorator': '5.1.1', 'deepmerge': '1.1.1', 'dill': '0.3.8', 'distlib': '0.3.8', 'distro': '1.9.0', 'dnspython': '2.6.1', 'docker': '6.1.3', 'email-validator': '2.1.1', 'entrypoints': '0.4', 'evaluate': '0.4.2', 'exceptiongroup': '1.2.0', 'executing': '2.0.1', 'fastapi': '0.89.1', 'fastapi-cli': '0.0.4', 'fastapi-utils': '0.2.1', 'fastt5': '0.0.5', 'ffmpy': '0.3.2', 'filelock': '3.14.0', 'flask': '3.0.3', 'flatbuffers': '24.3.25', 'fonttools': '4.51.0', 'frozenlist': '1.4.1', 'fs': '2.4.16', 'fsspec': '2023.10.0', 'gevent': '24.2.1', 'geventhttpclient': '2.0.2', 'gitdb': '4.0.11', 'google-api-core': '2.19.0', 'google-auth': '2.29.0', 'google-pasta': '0.2.0', 'googleapis-common-protos': '1.63.0', 'gradio': '3.50.2', 'gradio-client': '0.6.1', 'graphene': '3.3', 'graphql-core': '3.2.3', 'graphql-relay': '3.2.0', 'greenlet': '3.0.3', 'grpcio': '1.64.0', 'gunicorn': '21.2.0', 'h11': '0.14.0', 'httpcore': '1.0.5', 'httplib2': '0.19.1', 'httptools': '0.6.1', 'httpx': '0.27.0', 'huggingface': '0.0.1', 'huggingface-hub': '0.23.0', 'humanfriendly': '10.0', 'icecream': '2.1.3', 'idna': '3.7', 'importlib-metadata': '4.13.0', 'importlib-resources': '6.4.0', 'inflection': '0.5.1', 'ipinfo': '5.0.1', 'ipykernel': '6.29.3', 'ipython': '8.24.0', 
'ipywidgets': '8.1.2', 'isodate': '0.6.1', 'itsdangerous': '2.2.0', 'jedi': '0.19.1', 'jinja2': '3.1.4', 'jmespath': '1.0.1', 'joblib': '1.4.2', 'jsonpatch': '1.33', 'jsonpointer': '2.4', 'jsonschema': '4.22.0', 'jsonschema-specifications': '2023.12.1', 'jupyter-client': '8.6.1', 'jupyter-core': '5.7.2', 'jupyterlab-widgets': '3.0.10', 'kiwisolver': '1.4.5', 'kubernetes': '29.0.0', 'langchain': '0.2.0', 'langchain-community': '0.2.0', 'langchain-core': '0.2.1', 'langchain-text-splitters': '0.2.0', 'langsmith': '0.1.61', 'linkify-it-py': '2.0.3', 'lxml': '5.2.2', 'markdown-it-py': '3.0.0', 'marshmallow': '3.21.2', 'matplotlib': '3.9.0', 'matplotlib-inline': '0.1.7', 'mdit-py-plugins': '0.4.1', 'mdurl': '0.1.2', 'memray': '1.12.0', 'mlflow': '2.12.1', 'mlserver': '1.3.5', 'mlserver-mlflow': '1.5.0', 'mpmath': '1.3.0', 'msgpack': '1.0.8', 'multidict': '6.0.5', 'multiprocess': '0.70.16', 'nest-asyncio': '1.6.0', 'networkx': '3.3', 'nltk': '3.8.1', 'numpy': '1.26.4', 'nvidia-cublas-cu12': '12.1.3.1', 'nvidia-cuda-cupti-cu12': '12.1.105', 'nvidia-cuda-nvrtc-cu12': '12.1.105', 'nvidia-cuda-runtime-cu12': '12.1.105', 'nvidia-cudnn-cu12': '8.9.2.26', 'nvidia-cufft-cu12': '11.0.2.54', 'nvidia-curand-cu12': '10.3.2.106', 'nvidia-cusolver-cu12': '11.4.5.107', 'nvidia-cusparse-cu12': '12.1.0.106', 'nvidia-ml-py': '11.525.150', 'nvidia-nccl-cu12': '2.20.5', 'nvidia-nvjitlink-cu12': '12.4.127', 'nvidia-nvtx-cu12': '12.1.105', 'oauthlib': '3.2.2', 'onnx': '1.16.1', 'onnxruntime': '1.18.0', 'opencensus': '0.11.4', 'opencensus-context': '0.1.3', 'opentelemetry-api': '1.20.0', 'opentelemetry-instrumentation': '0.41b0', 'opentelemetry-instrumentation-aiohttp-client': '0.41b0', 'opentelemetry-instrumentation-asgi': '0.41b0', 'opentelemetry-sdk': '1.20.0', 'opentelemetry-semantic-conventions': '0.41b0', 'opentelemetry-util-http': '0.41b0', 'optimum': '1.20.0.dev0', 'orjson': '3.10.3', 'packaging': '23.2', 'pandas': '2.2.2', 'parso': '0.8.4', 'passlib': '1.7.4', 'pathos': '0.3.2', 
'pathspec': '0.12.1', 'pexpect': '4.9.0', 'pickleshare': '0.7.5', 'pillow': '10.3.0', 'pip': '24.0', 'pip-requirements-parser': '32.0.1', 'pip-tools': '7.4.1', 'platformdirs': '4.2.2', 'portalocker': '2.8.2', 'pox': '0.3.4', 'ppft': '1.7.6.8', 'progress': '1.6', 'prometheus-client': '0.20.0', 'prompt-toolkit': '3.0.42', 'proto-plus': '1.23.0', 'protobuf': '3.20.3', 'protobuf3-to-dict': '0.1.5', 'psutil': '5.9.8', 'ptyprocess': '0.7.0', 'pure-eval': '0.2.2', 'py-grpc-prometheus': '0.8.0', 'py-spy': '0.3.14', 'pyarrow': '15.0.2', 'pyarrow-hotfix': '0.6', 'pyasn1': '0.6.0', 'pyasn1-modules': '0.4.0', 'pycparser': '2.22', 'pydantic': '1.10.15', 'pydantic-core': '2.18.2', 'pydub': '0.25.1', 'pygments': '2.18.0', 'pyparsing': '2.4.7', 'pyproject-hooks': '1.1.0', 'python-dateutil': '2.9.0', 'python-dotenv': '1.0.1', 'python-json-logger': '2.0.7', 'python-multipart': '0.0.9', 'python-rapidjson': '1.14', 'pytz': '2024.1', 'pyzmq': '26.0.3', 'querystring-parser': '1.2.4', 'ray': '2.23.0', 'referencing': '0.35.1', 'regex': '2024.5.15', 'requests': '2.32.1', 'requests-oauthlib': '2.0.0', 'rich': '13.7.1', 'rouge-score': '0.1.2', 'rpds-py': '0.18.1', 'ruff': '0.4.4', 's3fs': '2023.10.0', 's3transfer': '0.7.0', 'sacrebleu': '2.4.2', 'safetensors': '0.4.3', 'sagemaker': '2.117.0', 'schema': '0.7.7', 'scikit-learn': '1.4.2', 'scipy': '1.13.0', 'secure': '0.3.0', 'semantic-version': '2.10.0', 'sentencepiece': '0.2.0', 'setuptools': '69.5.1', 'shellingham': '1.5.4', 'simple-di': '0.1.5', 'six': '1.16.0', 'slack-sdk': '3.27.2', 'smart-open': '7.0.4', 'smdebug-rulesconfig': '1.0.1', 'smmap': '5.0.1', 'sniffio': '1.3.1', 'sqlalchemy2-stubs': '0.0.2a38', 'sqlmodel': '0.0.8', 'sqlparse': '0.5.0', 'stack-data': '0.6.2', 'starlette': '0.37.2', 'starlette-exporter': '0.17.1', 'sympy': '1.12', 'tabulate': '0.9.0', 'tenacity': '8.3.0', 'textual': '0.63.4', 'threadpoolctl': '3.5.0', 'tokenizers': '0.13.3', 'tomli': '2.0.1', 'tomli-w': '1.0.0', 'tomlkit': '0.12.0', 'toolz': '0.12.1', 'torch': 
'2.3.0', 'torchvision': '0.18.0', 'tornado': '6.4', 'tqdm': '4.66.4', 'traitlets': '5.14.3', 'transformers': '4.31.0', 'triton': '2.3.0', 'tritonclient': '2.45.0', 'typer': '0.12.3', 'typing-extensions': '4.11.0', 'typing-inspect': '0.9.0', 'tzdata': '2024.1', 'uc-micro-py': '1.0.3', 'ujson': '5.10.0', 'urllib3': '2.0.7', 'uvicorn': '0.29.0', 'uvloop': '0.19.0', 'validators': '0.18.2', 'virtualenv': '20.26.2', 'watchfiles': '0.21.0', 'wcwidth': '0.2.13', 'websocket-client': '1.8.0', 'websockets': '11.0.3', 'werkzeug': '3.0.3', 'wheel': '0.43.0', 'widgetsnbextension': '4.0.10', 'wrapt': '1.16.0', 'xmltodict': '0.13.0', 'xxhash': '3.4.1', 'yarl': '1.9.4', 'zenml': '0.57.1', 'zipp': '3.17.0', 'zope.event': '5.0', 'zope.interface': '6.4'}
CURRENT STACK
Name: default ID: 80400df8-f083-451c-adb8-c428c1dac07f Workspace: default / dc072673-926e-4729-afc9-a1674661c86b
ORCHESTRATOR: default
Name: default ID: 88a7725b-e56d-4312-8011-2282281f93f8 Type: orchestrator Flavor: local Configuration: {} Workspace: default / dc072673-926e-4729-afc9-a1674661c86b
ARTIFACT_STORE: default
Name: default ID: ece5edbf-255e-4390-93ca-30465cface73 Type: artifact_store Flavor: local Configuration: {'path': ''} Workspace: default / dc072673-926e-4729-afc9-a1674661c86b
What happened?
There is an issue with mt5 and umt5 (with sentencepiece installed). When sentencepiece is uninstalled, this works fine for umt5.
Reproduction steps
from transformers import ( AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase, ) from typing_extensions import Annotated from zenml import ArtifactConfig, pipeline, step
MODEL_NAME = "google/umt5-small"
@step def tokenizer_loader() -> ( Annotated[ PreTrainedTokenizerBase, ArtifactConfig(name="tokenizer", is_model_artifact=True), ] ): return AutoTokenizer.from_pretrained(MODEL_NAME)
@step(enable_cache=False) def model_trainer( tokenizer: PreTrainedTokenizerBase, model_name: str = "google/umt5-small", ) -> Tuple[ Annotated[ PreTrainedModel, ArtifactConfig(name="model", is_model_artifact=True), ], Annotated[ PreTrainedTokenizerBase, ArtifactConfig(name="tokenizer", is_model_artifact=True), ], ]: model_config = AutoConfig.from_pretrained(model_name) model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=model_config) print("Now I am returning the model and tokenizer") print("---------------------------")
print(tokenizer)
@pipeline(enable_cache=False) def training_pipeline(): tokenizer = tokenizer_loader() model, tokenizer = model_trainer(tokenizer=tokenizer, model_name=MODEL_NAME)
if __name__ == "__main__": training_pipeline()
Code of Conduct