googlecolab / colabtools

Python libraries for Google Colaboratory
Apache License 2.0

I cannot install accelerate nor sentencepiece, I don't want to help since you wasted my time and you will waste thousands of people's time. #3982

Closed sweterek420 closed 1 year ago

sweterek420 commented 1 year ago

**Describe the current behavior**
A clear and concise explanation of what is currently happening.

**Describe the expected behavior**
A clear and concise explanation of what you expected to happen.

**What web browser you are using** (Chrome, Firefox, Safari, etc.)

**Additional context**
Link to a minimal, public, self-contained notebook that reproduces this issue.

sweterek420 commented 1 year ago

```
!pip install -q -U bitsandbytes
!pip install -q -U sentencepiece
!pip install -q sentencepiece
!pip install -U sentencepiece
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q -U einops
```

Replace HF_TOKEN with your Hugging Face Token

Don't change hf_user
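
The cell that actually consumes HF_TOKEN isn't included in this paste; in Colab, a typical way to supply it looks roughly like the sketch below (the `login` call and the placeholder token string are illustrative, not the notebook's own code):

```python
# Hypothetical sketch: authenticate to the Hugging Face Hub from a Colab cell.
# Replace the placeholder with your own token; hf_user itself stays untouched.
from huggingface_hub import login

HF_TOKEN = "hf_..."  # placeholder: substitute your Hugging Face token
login(token=HF_TOKEN)
```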

```python
def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):
    loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)
    is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq"

    # Freeze all parameters of the base model.
    for name, param in model.named_parameters():
        param.requires_grad = False

    if not is_gptq_quantized:
        # Cast non-quantized fp16/bf16 parameters to fp32 for training stability.
        for param in model.parameters():
            if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
                param.data = param.data.to(torch.float32)

    if (loaded_in_kbit or is_gptq_quantized) and use_gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        model.gradient_checkpointing_enable()

    return model
```

```python
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from peft.tuners.lora import LoraLayer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    GenerationConfig,
)
from trl import SFTTrainer

model_name = "openlm-research/open_llama_3b_v2"  # Change to the 3 billion parameter model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.add_special_tokens({"pad_token": "<pad>"})
tokenizer.padding_side = 'left'

dataset = load_dataset("timdettmers/openassistant-guanaco")

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
)

model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

training_arguments = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=4,
    log_level="debug",
    optim="paged_adamw_32bit",
    save_steps=2,
    logging_steps=1,
    learning_rate=1e-4,
    eval_steps=5,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

model.save_pretrained("./content/gptx")  # Save the model
```

sweterek420 commented 1 year ago

Dunno if it's just a temp fix, but: the transformers library depends on sentencepiece, and sometimes the order of installation can affect whether dependencies are correctly recognized. By installing sentencepiece before transformers, I ensured that sentencepiece was available when transformers was installed.
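
Roughly, the ordering described above looks like this (grouping the remaining packages onto one line is just for brevity):

```
# Install sentencepiece first so it is already present when transformers is installed.
!pip install -q -U sentencepiece
# Then the rest of the stack.
!pip install -q -U bitsandbytes transformers peft accelerate datasets trl einops
```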

cperry-goog commented 1 year ago

I can't repro any errors here when trying all of those pip installs.

One thing you might attempt is bringing those onto the same line and letting pip manage when to install what:

!pip install -q -U bitsandbytes sentencepiece sentencepiece sentencepiece transformers peft accelerate datasets trl einops
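
If you want to double-check that everything resolved in the runtime afterwards, something like this should do (assuming the usual import names for each package):

```python
# Rough check that each installed package is importable in the current runtime.
import importlib

for module in ["bitsandbytes", "sentencepiece", "transformers", "peft",
               "accelerate", "datasets", "trl", "einops"]:
    try:
        print(module, importlib.import_module(module).__version__)
    except Exception as exc:  # surface anything that did not install cleanly
        print(module, "->", exc)
```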

sweterek420 commented 1 year ago

Thanks, your one line with 3x sentencepiece masterfully solved my issue. Now I'm fighting with `ValueError: text input must of type str (single example), List[str] (batch or single pretokenized example) or List[List[str]] (batch of pretokenized examples)`.

```python
import torch
from datasets import load_dataset, Features, Value, Dataset
from peft import LoraConfig, PeftModel
from peft.tuners.lora import LoraLayer
import bitsandbytes as bnb
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    GenerationConfig,
)
from trl import SFTTrainer
```

Replace HF_TOKEN with your Hugging Face Token

Don't change hf_user

```python
def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):
    loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)
    is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq"

    # Freeze all parameters of the base model.
    for name, param in model.named_parameters():
        param.requires_grad = False

    if not is_gptq_quantized:
        # Cast non-quantized fp16/bf16 parameters to fp32 for training stability.
        for param in model.parameters():
            if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
                param.data = param.data.to(torch.float32)

    if (loaded_in_kbit or is_gptq_quantized) and use_gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        model.gradient_checkpointing_enable()

    return model
```

model_name = "gpt2-large" # Change to the 3 billion parameter model tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) tokenizer.add_special_tokens({"pad_token":"", "additional_special_tokens": ["[START]", "[END]"]}) tokenizer.padding_side = 'left'

def tokenization(batch): encoding = tokenizer([str(text) for text in batch["text"]], padding='max_length', truncation=True, max_length=512) encoding['labels'] = encoding['input_ids'].clone() return encoding

def read_lua_code_dataset(file_path): with open(file_path, 'r') as f: content = f.read() examples = content.split('[END]') examples = [example.replace('[START]', '').strip() for example in examples if example.strip()] return examples

examples = read_lua_code_dataset('./lua/lua_code_dataset.txt') dataset = Dataset.from_dict({'text': examples}) dataset = dataset.map(tokenization, batched=True, remove_columns=["text"]) dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

compute_dtype = getattr(torch, "float16")

model = AutoModelForCausalLM.from_pretrained( model_name, device_map={"": 0} ) print(model) model.resize_token_embeddings(len(tokenizer)) model.config.pad_token_id = tokenizer.pad_token_id model.config.use_cache = False

model = prepare_model_for_kbit_training(model) peft_config = LoraConfig( lora_alpha=32, lora_dropout=0.1, r=8, bias="none", task_type="CAUSAL_LM", target_modules= ["c_attn","c_proj"] )

training_arguments = TrainingArguments( output_dir="./results", evaluation_strategy="steps", do_eval=True, per_device_train_batch_size=32, gradient_accumulation_steps=1, per_device_eval_batch_size=32, log_level="debug", optim="paged_adamw_32bit", save_steps=2, logging_steps=1, learning_rate=2e-4, eval_steps=5, fp16=True, max_grad_norm=0.1, warmup_ratio=0.03, lr_scheduler_type="constant", )

dataset = dataset.train_test_split(test_size=0.2)

trainer = SFTTrainer( model=model, train_dataset=dataset['train'], eval_dataset=dataset['test'], peft_config=peft_config, dataset_text_field="input_ids", max_seq_length=512, tokenizer=tokenizer, args=training_arguments, )

trainer.train()

model.save_pretrained("./content/gpt-l-2")`

metrizable commented 1 year ago

@sweterek420 Thanks for the reply. From the error text you shared, it appears some input is not of the correct type (e.g. something needs to be of type str, List[str], or List[List[str]], but it's not).
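
As a purely illustrative sketch of that contract (the model name and inputs below are hypothetical, not taken from your notebook), the tokenizer accepts strings or lists of strings, but not lists of token ids:

```python
# Illustrative only: what a tokenizer call will and will not accept as "text".
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works for this illustration

tok("a single string")                # OK: str
tok(["a batch", "of strings"])        # OK: List[str]
tok([["pre", "tokenized"], ["batch"]], is_split_into_words=True)  # OK: List[List[str]]
# tok([[101, 102], [103]])            # would raise the ValueError quoted above: token ids are not text
```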

This appears to be an issue outside of Colab's control. We recommend asking on Stack Overflow or reaching out to the notebook author.