ridgerchu / matmulfreellm

Implementation for MatMul-free LM.
Apache License 2.0

tried to train #1

Closed thistleknot closed 1 week ago

thistleknot commented 3 weeks ago

triton 2.2.0, torch 2.2.0, einops 0.7.0, compute capability 6.0, Rocky Linux 9, CUDA 12.2, Python 3.10
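
For reference, a quick way to confirm these versions and the compute capability from Python (a minimal sketch; assumes a CUDA-enabled torch build):

import torch, triton, einops

print("torch:", torch.__version__)           # expect 2.2.0
print("triton:", triton.__version__)         # expect 2.2.0
print("einops:", einops.__version__)         # expect 0.7.0
print("compute capability:", torch.cuda.get_device_capability(0))  # (6, 0) => Pascal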

setup


import os

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Disable parallelism for tokenizers to avoid potential issues
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Specify the model name (replace '' with your actual model name)
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).cuda().half()

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

# Initialize Trainer with model, tokenizer, and training arguments
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments
    tokenizer=tokenizer,                 # tokenizer
    train_dataset=your_train_dataset,    # replace with your training dataset
    eval_dataset=your_eval_dataset       # replace with your evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

print(results)

# Generate text using the model
input_prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.cuda()
outputs = model.generate(input_ids, max_length=32, do_sample=True, top_p=0.4, temperature=0.6)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

error

---------------------------------------------------------------------------
HFValidationError                         Traceback (most recent call last)
File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/utils/hub.py:398, in cached_file(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)
    396 try:
    397     # Load from URL or cache if already cached
--> 398     resolved_file = hf_hub_download(
    399         path_or_repo_id,
    400         filename,
    401         subfolder=None if len(subfolder) == 0 else subfolder,
    402         repo_type=repo_type,
    403         revision=revision,
    404         cache_dir=cache_dir,
    405         user_agent=user_agent,
    406         force_download=force_download,
    407         proxies=proxies,
    408         resume_download=resume_download,
    409         token=token,
    410         local_files_only=local_files_only,
    411     )
    412 except GatedRepoError as e:

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:110, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
    109 if arg_name in ["repo_id", "from_id", "to_id"]:
--> 110     validate_repo_id(arg_value)
    112 elif arg_name == "token" and arg_value is not None:

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:164, in validate_repo_id(repo_id)
    163 if not REPO_ID_REGEX.match(repo_id):
--> 164     raise HFValidationError(
    165         "Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are"
    166         " forbidden, '-' and '.' cannot start or end the name, max length is 96:"
    167         f" '{repo_id}'."
    168     )
    170 if "--" in repo_id or ".." in repo_id:

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: ''.

The above exception was the direct cause of the following exception:

OSError                                   Traceback (most recent call last)
Cell In[37], line 1
----> 1 trainer = SFTTrainer(
      2     model,
      3     train_dataset=dataset_dict['train'],
      4     eval_dataset=dataset_dict['validation'],
      5     args=training_args,
      6 )
      8 trainer.train()

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:238, in __init__(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics, peft_config, dataset_text_field, packing, formatting_func, max_seq_length, infinite, num_of_sequences, chars_per_token, dataset_num_proc, dataset_batch_size, neftune_noise_alpha, model_init_kwargs, dataset_kwargs, eval_packing)
    232         args = dataclasses.replace(args, gradient_checkpointing=False)
    233 elif getattr(args, "gradient_checkpointing", False) and (
    234     "use_reentrant" not in gradient_checkpointing_kwargs
    235     or gradient_checkpointing_kwargs["use_reentrant"]
    236 ):
    237     # For backward compatibility with older versions of transformers
--> 238     if hasattr(model, "enable_input_require_grads"):
    239         model.enable_input_require_grads()
    240     else:

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:804, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
    801     return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    803 # Next, let's try to use the tokenizer_config file to get the tokenizer class.
--> 804 tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
    805 if "_commit_hash" in tokenizer_config:
    806     kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:637, in get_tokenizer_config(pretrained_model_name_or_path, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, **kwargs)
    634     token = use_auth_token
    636 commit_hash = kwargs.get("_commit_hash", None)
--> 637 resolved_config_file = cached_file(
    638     pretrained_model_name_or_path,
    639     TOKENIZER_CONFIG_FILE,
    640     cache_dir=cache_dir,
    641     force_download=force_download,
    642     resume_download=resume_download,
    643     proxies=proxies,
    644     token=token,
    645     revision=revision,
    646     local_files_only=local_files_only,
    647     subfolder=subfolder,
    648     _raise_exceptions_for_gated_repo=False,
    649     _raise_exceptions_for_missing_entries=False,
    650     _raise_exceptions_for_connection_errors=False,
    651     _commit_hash=commit_hash,
    652 )
    653 if resolved_config_file is None:
    654     logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")

File /home/user/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/utils/hub.py:462, in cached_file(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)
    460     raise EnvironmentError(f"There was a specific connection error when trying to load {path_or_repo_id}:\n{err}")
    461 except HFValidationError as e:
--> 462     raise EnvironmentError(
    463         f"Incorrect path_or_model_id: '{path_or_repo_id}'. Please provide either the path to a local folder or the repo_id of a model on the Hub."
    464     ) from e
    465 return resolved_file

OSError: Incorrect path_or_model_id: ''. Please provide either the path to a local folder or the repo_id of a model on the Hub.
ridgerchu commented 3 weeks ago

Hi, it seems you are using TinyLlama for training instead of our model...

# Specify the model name (replace '' with your actual model name)
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
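
For a quick sanity check, loading one of our released checkpoints follows the pattern in the README (illustrative sketch; pick whichever model size you want from the Hub):

import mmfreelm  # importing mmfreelm makes the HGRNBit model classes visible to transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

name = 'ridger/MMfreeLM-2.7B'  # one of the released MatMul-free checkpoints
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name).cuda().half()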
thistleknot commented 2 weeks ago

Yeah, I don't understand. Am I supposed to literally use '', or what is the model name? My understanding is that you show how to specify the model using the config, but not any tokenizer or model name.

The instructions left me guessing that I was supposed to pick a tokenizer myself (similar to Mamba).

Can you provide a complete working example, or tell me what would work in place of '' (or is it just '')?

thistleknot commented 2 weeks ago

I re-reviewed the README and saw a models link: https://huggingface.co/ridger/MMfreeLM-2.7B. Will try those.

thistleknot commented 2 weeks ago

Revised code

import os

import pandas as pd
import transformers
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, TrainingArguments, Trainer
from trl import SFTTrainer

import mmfreelm
from mmfreelm.models import HGRNBitConfig

os.environ["WANDB_MODE"]="offline"

model_name_or_path = "ridger/MMfreeLM-2.7B"

#config = 'ridger/MMfreeLM-2.7B'#HGRNBitConfig()
model = AutoModel.from_pretrained(model_name_or_path)

batch_size=6

training_args = TrainingArguments(
    #eval_strategy='steps',
    output_dir='./results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    eval_steps=1,
    gradient_accumulation_steps=1,
    optim='paged_lion_8bit',
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #0.000003, starts at 6
    #0.00003, #start at 2.6
    learning_rate=0.001,
    warmup_ratio=.1,
    adam_beta2=0.95,
    adam_epsilon=0.00001,
    max_grad_norm=1.0,
    lr_scheduler_type='cosine',
    logging_dir='./logs',
    logging_steps=1,
    do_train=True
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path, model_max_length=2048,
    padding_side="right", use_fast=False, add_eos_token=True, add_bos_token=True)

tokenizer.pad_token = tokenizer.eos_token

# Load the dataset
dataset = load_dataset("Abirate/english_quotes")

# Filter the quotes based on length
filtered_quotes = [q for q in dataset['train'] if 23 < len(q['quote']) < 140]

# Create a new dataset with the filtered quotes
filtered_dataset = Dataset.from_dict({'quote': [q['quote'] for q in filtered_quotes]})
# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples['quote'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = filtered_dataset.map(tokenize_function, batched=True)

# Split the dataset into training and validation
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
dataset_dict = DatasetDict(train=train_test_split['train'], validation=train_test_split['test'])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=1,
    gradient_accumulation_steps=1,
    learning_rate=0.001,
    warmup_ratio=0.1,
    logging_dir='./logs',
    logging_steps=1
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    tokenizer=tokenizer
)

# Train the model
trainer.train()

but the kernel bombs out.
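
For reference, one way to get an actual traceback instead of a dead Jupyter kernel is to rerun the same code as a plain script with synchronous CUDA launches (standard PyTorch debugging, not specific to this repo; illustrative sketch):

# debug_run.py -- run with `python debug_run.py` outside the notebook
import os

# Force synchronous kernel launches so the failing kernel is reported at its call site.
# Must be set before torch initializes CUDA.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from mmfreelm.models import HGRNBitForCausalLM

model = HGRNBitForCausalLM.from_pretrained("ridger/MMfreeLM-2.7B").cuda()
# ... rest of the training/inference code from above ...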

ridgerchu commented 2 weeks ago

Hi, does it work for inference instead of training? And do you use an NVIDIA GPU for training?

thistleknot commented 2 weeks ago

That's a great troubleshooting step.

The first error I made was using AutoModel; I changed it to the custom class definition (HGRNBitForCausalLM).

Then I tried inference:

import os

import pandas as pd
import transformers
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, TrainingArguments, Trainer
from trl import SFTTrainer

import mmfreelm
from mmfreelm.models import HGRNBitConfig, HGRNBitForCausalLM

os.environ["WANDB_MODE"]="offline"

model_name_or_path = "ridger/MMfreeLM-2.7B"

#config = 'ridger/MMfreeLM-2.7B'#HGRNBitConfig()
model = HGRNBitForCausalLM.from_pretrained(model_name_or_path)
model.to('cuda')
batch_size=6

training_args = TrainingArguments(
    #eval_strategy='steps',
    output_dir='./results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    eval_steps=1,
    gradient_accumulation_steps=1,
    optim='paged_lion_8bit',
    gradient_checkpointing_kwargs={"use_reentrant": False},
    #0.000003, starts at 6
    #0.00003, #start at 2.6
    learning_rate=0.001,
    warmup_ratio=.1,
    adam_beta2=0.95,
    adam_epsilon=0.00001,
    max_grad_norm=1.0,
    lr_scheduler_type='cosine',
    logging_dir='./logs',
    logging_steps=1,
    do_train=True
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path, model_max_length=2048,
    padding_side="right", use_fast=False, add_eos_token=True, add_bos_token=True)

tokenizer.pad_token = tokenizer.eos_token

# Generate text using the model
input_prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.cuda()
outputs = model.generate(input_ids, max_length=32, do_sample=True, top_p=0.4, temperature=0.6)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

This results in the kernel quitting (no console output).

NVIDIA, yes (CUDA 12.2, Python 3.10).

ridgerchu commented 2 weeks ago

Hmm, have you installed triton==2.2?

thistleknot commented 2 weeks ago

Yes, I updated the first post with my environment.

ridgerchu commented 2 weeks ago

Thanks! I see. I checked your environment and found that your compute capability is 6.0, which means you are on the Pascal architecture. Pascal may not be well supported by Triton, and I guess that may be what leads to this problem... We test our code on A100 and H100, where it works well...
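
If you want to isolate whether Triton itself is the issue on that Pascal card, a minimal kernel along the lines of the official Triton vector-add tutorial should fail (or succeed) much faster than the full model (illustrative sketch):

import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance handles one BLOCK_SIZE-wide slice of the vectors.
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)

x = torch.randn(4096, device="cuda")
y = torch.randn(4096, device="cuda")
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 1024),)
add_kernel[grid](x, y, out, x.numel(), BLOCK_SIZE=1024)
assert torch.allclose(out, x + y), "Triton kernel produced wrong results"
print("Triton vector add OK on", torch.cuda.get_device_name(0))

If even this crashes the process, the problem is Triton-on-Pascal rather than anything specific to this repo.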