jxmorris12 / vec2text

utilities for decoding deep representations (like sentence embeddings) back to text

Problems encountered while replicating code #68

Closed jyyang26 closed 1 week ago

jyyang26 commented 1 week ago

Hi! While reproducing the paper's code, I want to load the local gtr-t5-base model together with the local gtr__nq__32 and gtr__nq__32__correct models. However, after running the code below, the forward pass fails because the parameter decoder_input_ids or decoder_inputs_embeds is missing. Which part of the code should I modify? My vec2text demo code is as follows:

import vec2text
import torch
from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel

def get_gtr_embeddings(text_list,
                       encoder: PreTrainedModel,
                       tokenizer: PreTrainedTokenizer) -> torch.Tensor:

    # Tokenize with fixed-length padding so every sequence carries an attention mask.
    inputs = tokenizer(text_list,
                       return_tensors="pt",
                       max_length=128,
                       truncation=True,
                       padding="max_length").to("cuda")

    with torch.no_grad():
        model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        hidden_state = model_output.last_hidden_state

        # Masked mean-pooling over the last hidden states yields GTR-style sentence embeddings.
        embeddings = vec2text.models.model_utils.mean_pool(hidden_state, inputs['attention_mask'])

    return embeddings

encoder = AutoModel.from_pretrained("../../vec2text/gtr-t5-base").encoder.to("cuda")

tokenizer = AutoTokenizer.from_pretrained("../../vec2text/gtr-t5-base")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

inversion_model = vec2text.models.InversionModel.from_pretrained('../../vec2text/gtr__nq__32', local_files_only=True).to_empty(device=device)
corrector_model = vec2text.models.CorrectorEncoderModel.from_pretrained('../../vec2text/gtr__nq__32', local_files_only=True).to_empty(device=device)

corrector = vec2text.load_corrector(inversion_model, corrector_model)

embeddings = get_gtr_embeddings([
       "Jack Morris is a PhD student at Cornell Tech in New York City",
       "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity"
], encoder, tokenizer)

print(embeddings)

vec2text.invert_embeddings(
    embeddings=embeddings.cuda(),
    corrector=corrector,
    num_steps=20,
)
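For comparison, the repo README loads these two checkpoints from the Hugging Face Hub and moves them with .to(device) rather than .to_empty() (note that nn.Module.to_empty() moves a module to the device without copying its storage, so the loaded weights are discarded and the parameters end up uninitialized). A minimal sketch of the README's loading path, assuming the Hub checkpoints jxm/gtr__nq__32 and jxm/gtr__nq__32__correct are reachable:

import vec2text

# Load the inversion (hypothesis) model and its corrector, then wrap both
# in a Corrector object via the same load_corrector call used above.
inversion_model = vec2text.models.InversionModel.from_pretrained("jxm/gtr__nq__32")
corrector_model = vec2text.models.CorrectorEncoderModel.from_pretrained(
    "jxm/gtr__nq__32__correct"
)
corrector = vec2text.load_corrector(inversion_model, corrector_model)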

The gtr__nq__32__correct config.json is as follows:

{
  "_frozen": true,
  "_n_gpu": 1,
  "adafactor": false,
  "adam_beta1": 0.9,
  "adam_beta2": 0.999,
  "adam_epsilon": 1e-06,
  "architectures": [
    "CorrectorEncoderModel"
  ],
  "auto_find_batch_size": false,
  "bf16": true,
  "bf16_full_eval": false,
  "cache_dir": null,
  "cheat_on_train_hypotheses": false,
  "config_name": null,
  "config_overrides": null,
  "corrector_ignore_hypothesis_embedding": false,
  "corrector_model_alias": null,
  "corrector_model_from_pretrained": "../gtr__nq__32",
  "data_seed": null,
  "dataloader_drop_last": false,
  "dataloader_num_workers": 0,
  "dataloader_pin_memory": true,
  "dataset_name": "nq",
  "ddp_backend": null,
  "ddp_broadcast_buffers": null,
  "ddp_bucket_cap_mb": null,
  "ddp_find_unused_parameters": null,
  "ddp_timeout": 1800,
  "debug": [],
  "decoder_dropout_disabled": false,
  "deepspeed": null,
  "deepspeed_plugin": null,
  "disable_tqdm": true,
  "dispatch_batches": null,
  "do_eval": false,
  "do_predict": false,
  "do_train": false,
  "embedder_fake_with_zeros": false,
  "embedder_model_api": null,
  "embedder_model_name": "/home/jyyang/vec2text/gtr-t5-base",
  "embedder_no_grad": true,
  "embedder_torch_dtype": "float32",
  "embedding_transform_strategy": "repeat",
  "embedding_zero_except_topk": null,
  "embeddings_from_layer_n": null,
  "encoder_dropout_disabled": false,
  "eval_accumulation_steps": null,
  "eval_delay": 0,
  "eval_steps": 6250,
  "evaluation_strategy": "steps",
  "exp_group_name": "gtr_corrector",
  "exp_name": "",
  "experiment": "corrector",
  "fp16": false,
  "fp16_backend": "auto",
  "fp16_full_eval": false,
  "fp16_opt_level": "O1",
  "freeze_strategy": "none",
  "fsdp": [],
  "fsdp_config": {
    "min_num_params": 0,
    "xla": false,
    "xla_fsdp_grad_ckpt": false
  },
  "fsdp_min_num_params": 0,
  "fsdp_transformer_layer_cls_to_wrap": null,
  "full_determinism": false,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": false,
  "greater_is_better": false,
  "group_by_length": true,
  "half_precision_backend": "auto",
  "hub_always_push": false,
  "hub_model_id": null,
  "hub_private_repo": false,
  "hub_strategy": "every_save",
  "hub_token": null,
  "ignore_data_skip": false,
  "include_inputs_for_metrics": true,
  "include_tokens_per_second": false,
  "jit_mode_eval": false,
  "label_names": null,
  "label_smoothing_factor": 0.0,
  "learning_rate": 0.00282842712,
  "length_column_name": "length",
  "load_best_model_at_end": true,
  "local_rank": 0,
  "log_level": "passive",
  "log_level_replica": "warning",
  "log_on_each_node": true,
  "logging_dir": "/home/jyyang/vec2text/results/jy_test",
  "logging_first_step": false,
  "logging_nan_inf_filter": true,
  "logging_steps": 50,
  "logging_strategy": "steps",
  "lr_scheduler_type": "linear",
  "max_eval_samples": 500,
  "max_grad_norm": 1.0,
  "max_seq_length": 32,
  "max_steps": -1,
  "metric_for_best_model": "nq_loss",
  "mock_embedder": false,
  "model_name_or_path": "/home/jyyang/vec2text/gtr-t5-base",
  "model_revision": "main",
  "mp_parameters": "",
  "no_cuda": true,
  "num_repeat_tokens": 16,
  "num_train_epochs": 200.0,
  "optim": "adamw_torch",
  "optim_args": null,
  "output_dir": "/home/jyyang/vec2text/saves/jy_test",
  "overwrite_output_dir": false,
  "past_index": -1,
  "per_device_eval_batch_size": 256,
  "per_device_train_batch_size": 256,
  "per_gpu_eval_batch_size": null,
  "per_gpu_train_batch_size": null,
  "prediction_loss_only": false,
  "push_to_hub": false,
  "push_to_hub_model_id": null,
  "push_to_hub_organization": null,
  "push_to_hub_token": null,
  "ray_scope": "last",
  "remove_unused_columns": false,
  "report_to": [],
  "resume_from_checkpoint": null,
  "run_name": "/home/jyyang/vec2text/saves/jy_test",
  "save_on_each_node": false,
  "save_safetensors": false,
  "save_steps": 500,
  "save_strategy": "steps",
  "save_total_limit": 2,
  "seed": 42,
  "sharded_ddp": [],
  "skip_memory_metrics": true,
  "steps_per_epoch": 500000,
  "suffix_conditioning": false,
  "tf32": null,
  "tokenizer_name": null,
  "torch_compile": false,
  "torch_compile_backend": null,
  "torch_compile_mode": null,
  "torch_dtype": "float32",
  "torchdynamo": null,
  "tpu_metrics_debug": false,
  "tpu_num_cores": null,
  "transformers_version": "4.34.4",
  "use_cpu": false,
  "use_frozen_embeddings_as_input": false,
  "use_ipex": false,
  "use_legacy_prediction_loop": false,
  "use_less_data": -1,
  "use_lora": false,
  "use_mps_device": false,
  "use_wandb": false,
  "warmup_ratio": 0.0,
  "warmup_steps": 25000,
  "weight_decay": 0.0
}

The gtr__nq__32 config.json is as follows:

{
  "_frozen": true,
  "_n_gpu": 1,
  "adafactor": false,
  "adam_beta1": 0.9,
  "adam_beta2": 0.999,
  "adam_epsilon": 1e-06,
  "architectures": [
    "InversionModel"
  ],
  "auto_find_batch_size": false,
  "bf16": true,
  "bf16_full_eval": false,
  "cache_dir": null,
  "cheat_on_train_hypotheses": false,
  "config_name": null,
  "config_overrides": null,
  "corrector_ignore_hypothesis_embedding": false,
  "corrector_model_alias": null,
  "corrector_model_from_pretrained": null,
  "data_seed": null,
  "dataloader_drop_last": false,
  "dataloader_num_workers": 0,
  "dataloader_pin_memory": true,
  "dataset_name": "nq",
  "ddp_backend": null,
  "ddp_broadcast_buffers": null,
  "ddp_bucket_cap_mb": null,
  "ddp_find_unused_parameters": null,
  "ddp_timeout": 1800,
  "debug": [],
  "decoder_dropout_disabled": false,
  "deepspeed": null,
  "deepspeed_plugin": null,
  "disable_tqdm": true,
  "dispatch_batches": null,
  "do_eval": false,
  "do_predict": false,
  "do_train": false,
  "embedder_fake_with_zeros": false,
  "embedder_model_api": null,
  "embedder_model_name": "/home/jyyang/vec2text/gtr-t5-base",
  "embedder_no_grad": true,
  "embedder_torch_dtype": "float32",
  "embedding_transform_strategy": "repeat",
  "embedding_zero_except_topk": null,
  "embeddings_from_layer_n": null,
  "encoder_dropout_disabled": false,
  "eval_accumulation_steps": null,
  "eval_delay": 0,
  "eval_steps": 1250,
  "evaluation_strategy": "steps",
  "exp_group_name": "oct-gtr32",
  "exp_name": "",
  "experiment": "inversion",
  "fp16": false,
  "fp16_backend": "auto",
  "fp16_full_eval": false,
  "fp16_opt_level": "O1",
  "freeze_strategy": "none",
  "fsdp": [],
  "fsdp_config": {
    "min_num_params": 0,
    "xla": false,
    "xla_fsdp_grad_ckpt": false
  },
  "fsdp_min_num_params": 0,
  "fsdp_transformer_layer_cls_to_wrap": null,
  "full_determinism": false,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": false,
  "greater_is_better": false,
  "group_by_length": true,
  "half_precision_backend": "auto",
  "hub_always_push": false,
  "hub_model_id": null,
  "hub_private_repo": false,
  "hub_strategy": "every_save",
  "hub_token": null,
  "ignore_data_skip": false,
  "include_inputs_for_metrics": true,
  "include_tokens_per_second": false,
  "jit_mode_eval": false,
  "label_names": null,
  "label_smoothing_factor": 0.0,
  "learning_rate": 0.002,
  "length_column_name": "length",
  "load_best_model_at_end": true,
  "local_rank": 0,
  "log_level": "passive",
  "log_level_replica": "warning",
  "log_on_each_node": true,
  "logging_dir": "/home/jyyang/vec2text/results/jy_test",
  "logging_first_step": false,
  "logging_nan_inf_filter": true,
  "logging_steps": 25,
  "logging_strategy": "steps",
  "lr_scheduler_type": "constant_with_warmup",
  "max_eval_samples": 500,
  "max_grad_norm": 1.0,
  "max_seq_length": 32,
  "max_steps": -1,
  "metric_for_best_model": "nq_loss",
  "mock_embedder": false,
  "model_name_or_path": "/home/jyyang/vec2text/gtr-t5-base",
  "model_revision": "main",
  "mp_parameters": "",
  "no_cuda": true,
  "num_repeat_tokens": 16,
  "num_train_epochs": 300.0,
  "optim": "adamw_torch",
  "optim_args": null,
  "output_dir": "/home/jyyang/vec2text/saves/jy_test",
  "overwrite_output_dir": false,
  "past_index": -1,
  "per_device_eval_batch_size": 256,
  "per_device_train_batch_size": 512,
  "per_gpu_eval_batch_size": null,
  "per_gpu_train_batch_size": null,
  "prediction_loss_only": false,
  "push_to_hub": false,
  "push_to_hub_model_id": null,
  "push_to_hub_organization": null,
  "push_to_hub_token": null,
  "ray_scope": "last",
  "remove_unused_columns": false,
  "report_to": [],
  "resume_from_checkpoint": null,
  "run_name": "/home/jyyang/vec2text/saves/jy_test",
  "save_on_each_node": false,
  "save_safetensors": false,
  "save_steps": 125,
  "save_strategy": "steps",
  "save_total_limit": 2,
  "seed": 42,
  "sharded_ddp": [],
  "skip_memory_metrics": true,
  "steps_per_epoch": 500000,
  "suffix_conditioning": false,
  "tf32": null,
  "tokenizer_name": null,
  "torch_compile": false,
  "torch_compile_backend": null,
  "torch_compile_mode": null,
  "torch_dtype": "float32",
  "torchdynamo": null,
  "tpu_metrics_debug": false,
  "tpu_num_cores": null,
  "transformers_version": "4.34.4",
  "use_cpu": false,
  "use_frozen_embeddings_as_input": true,
  "use_ipex": false,
  "use_legacy_prediction_loop": false,
  "use_less_data": -1,
  "use_lora": false,
  "use_mps_device": false,
  "use_wandb": false,
  "warmup_ratio": 0.0,
  "warmup_steps": 625,
  "weight_decay": 0.0
}
jxmorris12 commented 1 week ago

It's not clear what error you're running into because you didn't post a stack trace. However, the README has a section on how to evaluate the models from the paper, which recommends running this code:

from vec2text import analyze_utils

experiment, trainer = analyze_utils.load_experiment_and_trainer_from_pretrained(
    "jxm/gtr__nq__32__correct"
)
train_datasets = experiment._load_train_dataset_uncached(
    model=trainer.model,
    tokenizer=trainer.tokenizer,
    embedder_tokenizer=trainer.embedder_tokenizer
)

val_datasets = experiment._load_val_datasets_uncached(
    model=trainer.model,
    tokenizer=trainer.tokenizer,
    embedder_tokenizer=trainer.embedder_tokenizer
)
trainer.args.per_device_eval_batch_size = 16
trainer.sequence_beam_width = 1
trainer.num_gen_recursive_steps = 20
trainer.evaluate(
    eval_dataset=train_datasets["validation"]
)

which worked for me last time I tried it. Can you try that?

jyyang26 commented 1 week ago

Thanks for your quick reply. After running the evaluation code you suggested, I got the errors below, which seem to occur while loading the dataset. I looked into possible causes and suspected the installed packages, but the problem persisted after updating them. It may also be a problem with the dataset itself. Do you know how to resolve it?

(base) [jyyang@hostname tests]$ python invert_embed_jx.py 
/home/jyyang/anaconda3/lib/python3.12/site-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of šŸ¤— Transformers. Use `eval_strategy` instead
  warnings.warn(
Set num workers to 4
Experiment output_dir = saves/jxm__gtr__nq__32__correct
/home/jyyang/anaconda3/lib/python3.12/site-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of šŸ¤— Transformers. Use `eval_strategy` instead
  warnings.warn(
Set num workers to 4
Experiment output_dir = saves/jxm__gtr__nq__32
Loading datasets with TOKENIZERS_PARALLELISM = False
>> using fast tokenizers: True True
Running tokenizer on dataset (num_proc=48): 100% 1000/1000 [00:09<00:00, 104.06 examples/s]
Running tokenizer on dataset (num_proc=48): 100% 1000/1000 [00:09<00:00, 107.09 examples/s]
Running tokenizer on dataset (num_proc=48): 100% 1000/1000 [00:09<00:00, 107.33 examples/s]
[Precomputing embeddings with batch size: 512]
        saving precomputed embeddings to file: 5c812bc2a204dfaf5e45ee728663e5f9572d52fc0eb17707
Map: 100% 1000/1000 [00:01<00:00, 766.54 examples/s]
        saving precomputed embeddings to file: f9a661eacda517bf5e45ee728663e5f9572d52fc0eb17707
Map: 100% 1000/1000 [00:00<00:00, 1466.80 examples/s]
        saving precomputed embeddings to file: f9a661eacda517bf5e45ee728663e5f9572d52fc0eb17707
Map: 100% 1000/1000 [00:00<00:00, 1483.48 examples/s]
saving train_dataset to path: /home/jyyang/.cache/inversion/dd0d97ad14fd6897b0d31cecc2e14d13.arrow
Saving the dataset (1/1 shards): 100% 1000/1000 [00:00<00:00, 98298.62 examples/s]
Saving the dataset (1/1 shards): 100% 1000/1000 [00:00<00:00, 55877.86 examples/s]
Saving the dataset (1/1 shards): 100% 500/500 [00:00<00:00, 38562.64 examples/s]
Traceback (most recent call last):
  File "/home/jyyang/vec2text-master/tests/invert_embed_jx.py", line 3, in <module>
    experiment, trainer = analyze_utils.load_experiment_and_trainer_from_pretrained(
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/vec2text/analyze_utils.py", line 172, in load_experiment_and_trainer_from_pretrained
    trainer = experiment.load_trainer()
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/vec2text/experiments.py", line 759, in load_trainer
    ) = vec2text.analyze_utils.load_experiment_and_trainer_from_pretrained(
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/vec2text/analyze_utils.py", line 172, in load_experiment_and_trainer_from_pretrained
    trainer = experiment.load_trainer()
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/vec2text/experiments.py", line 631, in load_trainer
    train_dataset, eval_dataset = self.load_train_and_val_datasets(
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/vec2text/experiments.py", line 595, in load_train_and_val_datasets
    val_datasets_dict = self._load_val_datasets_uncached(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/vec2text/experiments.py", line 518, in _load_val_datasets_uncached
    val_datasets_dict = load_standard_val_datasets()
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/vec2text/data_helpers.py", line 251, in load_standard_val_datasets
    "wikibio": load_wikibio_val(),
               ^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/vec2text/data_helpers.py", line 131, in load_wikibio_val
    d = datasets.load_dataset("wiki_bio")["val"]
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/datasets/load.py", line 2606, in load_dataset
    builder_instance = load_dataset_builder(
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/datasets/load.py", line 2277, in load_dataset_builder
    dataset_module = dataset_module_factory(
                     ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/datasets/load.py", line 1923, in dataset_module_factory
    raise e1 from None
  File "/home/jyyang/anaconda3/lib/python3.12/site-packages/datasets/load.py", line 1875, in dataset_module_factory
    can_load_config_from_parquet_export = "DEFAULT_CONFIG_NAME" not in f.read()
                                                                       ^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb5 in position 1: invalid start byte
(base) [jyyang@hostname tests]$ 
jxmorris12 commented 1 week ago

I'm not sure what the problem is. My guess is that you ran the command multiple times, and the first run(s) corrupted the wiki_bio download. I just tested this locally and it works fine for me:

>>> import datasets
>>> datasets.load_dataset("wiki_bio")
DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 582659
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 72831
    })
    val: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 72831
    })
})
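If the cached download really is corrupted, one possible workaround (an assumption, not something verified in this thread) is to delete the wiki_bio entries from the Hugging Face cache so datasets fetches a fresh copy:

import shutil
from pathlib import Path

import datasets

# Assumption: default cache location (~/.cache/huggingface); adjust if
# HF_HOME or HF_DATASETS_CACHE is set in your environment.
cache = Path.home() / ".cache" / "huggingface"
for pattern in ("datasets/wiki_bio*", "modules/datasets_modules/datasets/wiki_bio*"):
    for path in cache.glob(pattern):
        print(f"removing {path}")
        if path.is_dir():
            shutil.rmtree(path)
        else:
            path.unlink()

# Re-download a fresh copy.
datasets.load_dataset("wiki_bio")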

That said, this validation set isn't important. You can just comment that line out (line 251 of data_helpers.py) to get around the problem.
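Concretely, that means commenting out the wikibio entry shown in the traceback. A sketch, assuming the surrounding dict in load_standard_val_datasets looks roughly like this (other entries elided, and the exact structure may differ):

# vec2text/data_helpers.py, inside load_standard_val_datasets():
return {
    # ... other validation datasets ...
    # "wikibio": load_wikibio_val(),  # commented out: wiki_bio fails to load
    # ... other validation datasets ...
}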