@matthayes I wonder if you've seen that? The code looks right; I'm wondering whether the colons in the filename are somehow making it look like a URL, and/or whether this is an HF bug. But maybe you can confirm or deny that code with those types of paths should be working.
I am having this issue as well. I will try changing the timestamp definition to remove the colons:
timestamp = datetime.now().strftime("%Y-%m-%dT%H%M%S")
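For reference, the two formats produce directory names like these (a standalone sketch; the original format is assumed to be "%Y-%m-%dT%H:%M:%S", which matches the directory names in the logs below):

from datetime import datetime

now = datetime(2023, 3, 30, 22, 9, 8)  # fixed value purely for illustration

# Assumed original format: produces names like dolly__2023-03-30T22:09:08 (contains colons)
print(now.strftime("%Y-%m-%dT%H:%M:%S"))  # 2023-03-30T22:09:08
# Colon-free variant from the comment above
print(now.strftime("%Y-%m-%dT%H%M%S"))    # 2023-03-30T220908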
Removing colons from the path does not work. I also tried it with a local path instead of the absolute path.
Using the stack trace, I believe the Hugging Face transformers code that determines whether it is a local path is here: https://github.com/huggingface/transformers/blob/v4.27.4/src/transformers/utils/hub.py#L376
Specifically, it checks whether the path is a directory using os.path.isdir(path_or_repo_id).
From that, and based on the docstring for AutoTokenizer.from_pretrained(), it should accept these types of paths:
Params:
pretrained_model_name_or_path (`str` or `os.PathLike`):
Can be either:
- A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
user or organization name, like `dbmdz/bert-base-german-cased`.
- A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
- A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
applicable to all derived classes)
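To make the failure mode concrete, here is a minimal standalone sketch (not the repo's code) of the decision transformers makes with the path from the stack trace; if the directory does not exist, the string falls through to the Hub repo-id validation, which rejects the extra slashes and colons with HFValidationError:

import os
from transformers import AutoTokenizer  # assumes transformers is installed

local_output_dir = "/root/dolly_training/dolly__2023-03-30T01:11:56"  # path from this issue

if os.path.isdir(local_output_dir):
    # Existing directory: loaded as local files; colons in the name are fine
    tokenizer = AutoTokenizer.from_pretrained(local_output_dir)
else:
    # Missing directory: from_pretrained would treat the string as a Hub repo id
    # and raise HFValidationError, so fail with a clearer message instead
    raise FileNotFoundError(f"{local_output_dir} does not exist; did training complete?")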
Have you checked whether the path /root/dolly_training/dolly__2023-03-30T01:11:56 was created successfully? Looking at the code linked by @zcking, it appears that the directory may not exist.
Also, can you confirm that training succeeded? That could be another reason why the path doesn't exist.
I think Matt has a point; please see the training trace:
2023-03-30 22:09:13 INFO [root] Exception while sending command.
Traceback (most recent call last):
File "/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 503, in send_command
self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 506, in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending
[2023-03-30 22:09:18,392] [WARNING] [runner.py:186:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2023-03-30 22:09:18,402] [INFO] [runner.py:548:main] cmd = /local_disk0/.ephemeral_nfs/envs/pythonEnv-75c87d05-950d-4b4f-afed-90d6fb141b40/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 --module --enable_each_rank_log=None training.trainer --deepspeed /Workspace/Repos/paulo.borges@databricks.com/dolly/config/ds_z3_bf16_config.json --epochs 1 --local-output-dir /root/dolly_training/dolly__2023-03-30T22:09:08 --dbfs-output-dir /dbfs/dolly_training/dolly__2023-03-30T22:09:08 --per-device-train-batch-size 8 --per-device-eval-batch-size 8 --lr 1e-5
[2023-03-30 22:09:22,283] [INFO] [launch.py:142:main] WORLD INFO DICT: {'localhost': [0]}
[2023-03-30 22:09:22,284] [INFO] [launch.py:148:main] nnodes=1, num_local_procs=1, node_rank=0
[2023-03-30 22:09:22,284] [INFO] [launch.py:161:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]})
[2023-03-30 22:09:22,284] [INFO] [launch.py:162:main] dist_world_size=1
[2023-03-30 22:09:22,284] [INFO] [launch.py:164:main] Setting CUDA_VISIBLE_DEVICES=0
2023-03-30 22:09:24.104552: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-30 22:09:24.241381: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-30 22:09:33 INFO [__main__] Loading tokenizer for EleutherAI/gpt-j-6B
2023-03-30 22:09:33 INFO [__main__] Loading model for EleutherAI/gpt-j-6B
[2023-03-30 22:11:39,404] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 4322
[2023-03-30 22:11:39,434] [ERROR] [launch.py:324:sigkill_handler] ['/local_disk0/.ephemeral_nfs/envs/pythonEnv-75c87d05-950d-4b4f-afed-90d6fb141b40/bin/python', '-u', '-m', 'training.trainer', '--local_rank=0', '--deepspeed', '/Workspace/Repos/paulo.borges@databricks.com/dolly/config/ds_z3_bf16_config.json', '--epochs', '1', '--local-output-dir', '/root/dolly_training/dolly__2023-03-30T22:09:08', '--dbfs-output-dir', '/dbfs/dolly_training/dolly__2023-03-30T22:09:08', '--per-device-train-batch-size', '8', '--per-device-eval-batch-size', '8', '--lr', '1e-5'] exits with return code = -9
When I run
%ls /dbfs/dolly_training
I don't see the dolly__2023-03-30T22:09:08 directory.
It appears to be crashing while loading the model. Maybe OOM? What machine type are you using? It'd help if we checked that the path exists after training and provided a more user-friendly message. I can make that update.
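Something along these lines, for example (a hypothetical sketch of a check in the notebook after the deepspeed run, not the actual change; the directory name is copied from the logs above):

import os

local_output_dir = "/root/dolly_training/dolly__2023-03-30T22:09:08"  # hypothetical placeholder

if not os.path.isdir(local_output_dir):
    raise RuntimeError(
        f"Training output not found at {local_output_dir}. Training likely failed before "
        "saving the model (for example, the process was killed after running out of memory); "
        "check the training logs above instead of loading the model."
    )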
It was an OOM issue; I upgraded to the following cluster config and it's now training:
{
"num_workers": 0,
"cluster_name": "LLM Cluster",
"spark_version": "12.2.x-gpu-ml-scala2.12",
"spark_conf": {
"spark.databricks.cluster.profile": "singleNode",
"spark.master": "local[*, 4]"
},
"aws_attributes": {
"first_on_demand": 1,
"availability": "SPOT_WITH_FALLBACK",
"zone_id": "auto",
"spot_bid_price_percent": 100,
"ebs_volume_count": 0
},
"node_type_id": "g5.48xlarge",
"driver_node_type_id": "g5.48xlarge",
"ssh_public_keys": [],
"custom_tags": {
"ResourceClass": "SingleNode"
},
"spark_env_vars": {
"PYSPARK_PYTHON": "/databricks/python3/bin/python3"
},
"autotermination_minutes": 120,
"enable_elastic_disk": true,
"cluster_source": "UI",
"init_scripts": [],
"single_user_name": "paulo.borges@databricks.com",
"enable_local_disk_encryption": false,
"data_security_mode": "SINGLE_USER",
"runtime_engine": "STANDARD",
"cluster_id": "0331-003509-44f5i1om"
}
I'm trying to train:
Model Type: EleutherAI/pythia-2.8b
Error Type: [HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name']
I believe this cluster configuration is capable enough to train this model.
Compute Detail:
{
"num_workers": 0,
"cluster_name": "DollyPOCCluster",
"spark_version": "12.2.x-gpu-ml-scala2.12",
"spark_conf": {
"spark.master": "local[*, 4]",
"spark.databricks.cluster.profile": "singleNode"
},
"aws_attributes": {
"first_on_demand": 1,
"availability": "SPOT_WITH_FALLBACK",
"zone_id": "auto",
"spot_bid_price_percent": 100,
"ebs_volume_count": 0
},
"node_type_id": "g4dn.2xlarge",
"driver_node_type_id": "g4dn.2xlarge",
"ssh_public_keys": [],
"custom_tags": {
"ResourceClass": "SingleNode"
},
"spark_env_vars": {},
"autotermination_minutes": 20,
"enable_elastic_disk": true,
"cluster_source": "UI",
"init_scripts": [],
"enable_local_disk_encryption": false,
"data_security_mode": "NONE",
"runtime_engine": "STANDARD",
"cluster_id": "0517-050920-dmg5higv"
}
RUN LOGS:
[2023-05-17 10:15:28,465] [WARNING] [runner.py:186:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2023-05-17 10:15:28,473] [INFO] [runner.py:550:main] cmd = /local_disk0/.ephemeral_nfs/envs/pythonEnv-07b069a9-fc74-46da-b629-0e873bf200ec/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 --module --enable_each_rank_log=None training.trainer --input-model EleutherAI/pythia-2.8b --deepspeed /Workspace/Repos/dolly/config/ds_z3_bf16_config.json --epochs 2 --local-output-dir /local_disk0/dolly_training/dolly2023-05-17T10-15-18 --dbfs-output-dir /dbfs/dolly_training/dolly__2023-05-17T10-15-18 --per-device-train-batch-size 6 --per-device-eval-batch-size 6 --logging-steps 10 --save-steps 200 --save-total-limit 20 --eval-steps 50 --warmup-steps 50 --test-size 200 --lr 5e-6
[2023-05-17 10:15:32,322] [INFO] [launch.py:142:main] WORLD INFO DICT: {'localhost': [0]}
[2023-05-17 10:15:32,323] [INFO] [launch.py:148:main] nnodes=1, num_local_procs=1, node_rank=0
[2023-05-17 10:15:32,323] [INFO] [launch.py:161:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]})
[2023-05-17 10:15:32,323] [INFO] [launch.py:162:main] dist_world_size=1
[2023-05-17 10:15:32,323] [INFO] [launch.py:164:main] Setting CUDA_VISIBLE_DEVICES=0
2023-05-17 10:15:34.898239: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-17 10:15:35.037391: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable TF_ENABLE_ONEDNN_OPTS=0.
2023-05-17 10:15:43 INFO [__main__] Loading tokenizer for EleutherAI/pythia-2.8b
Downloading (…)okenizer_config.json: 100%|█████| 396/396 [00:00<00:00, 59.8kB/s]
Downloading (…)/main/tokenizer.json: 100%|█| 2.11M/2.11M [00:00<00:00, 6.89MB/s]
Downloading (…)cial_tokens_map.json: 100%|███| 99.0/99.0 [00:00<00:00, 59.9kB/s]
2023-05-17 10:15:45 INFO [__main__] Loading model for EleutherAI/pythia-2.8b
Downloading (…)lve/main/config.json: 100%|██████| 571/571 [00:00<00:00, 339kB/s]
Downloading pytorch_model.bin: 100%|████████| 5.68G/5.68G [00:31<00:00, 182MB/s]
2023-05-17 10:16:48 INFO [__main__] Found max lenth: 2048
2023-05-17 10:16:48 INFO [__main__] Loading dataset from /dbfs/FileStore/tables/clinical_dolly.jsonl
2023-05-17 10:16:49 WARNING [datasets.builder] Found cached dataset json (/root/.cache/huggingface/datasets/json/clinical_dolly.jsonl-764c9aeb7bf7fac0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 543.51it/s]
2023-05-17 10:16:49 INFO [__main__] Found 26 rows
2023-05-17 10:16:49 INFO [__main__] Preprocessing dataset
2023-05-17 10:16:49 INFO [__main__] Processed dataset has 26 rows
2023-05-17 10:16:49 INFO [__main__] Processed dataset has 26 rows after filtering for truncated records
2023-05-17 10:16:49 INFO [__main__] Shuffling dataset
2023-05-17 10:16:49 INFO [__main__] Done preprocessing
2023-05-17 10:16:49 ERROR [__main__] main failed
Traceback (most recent call last):
File "/Workspace/Repos/dolly/training/trainer.py", line 329, in
Looks like you did not finish training or ran out of memory (OOM). Search for similar issues here.
I am getting a validation error on CMD 11:
Here's the traceback:
The error is occurring for both:
Cluster config: