huggingface / peft

🤗 PEFT: State-of-the-art Parameter-Efficient Fine-Tuning.
https://huggingface.co/docs/peft
Apache License 2.0

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn #1980

Closed jjzhu0579 closed 1 month ago

jjzhu0579 commented 3 months ago

System Info

Python 3.10, transformers 4.43

Who can help?

No response

Information

Tasks

Reproduction

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PromptEncoderConfig, TaskType, get_peft_model, PromptEncoderReparameterizationType
import numpy as np
import matplotlib.pyplot as plt
import os
from transformers import AutoModel
from transformers.modeling_outputs import CausalLMOutputWithPast
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss


def custom_forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    return_last_logit: Optional[bool] = False,
):
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    transformer_outputs = self.transformer(
        input_ids=input_ids,
        position_ids=position_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    hidden_states = transformer_outputs[0]
    if return_last_logit:
        hidden_states = hidden_states[:, -1:]
    lm_logits = self.transformer.output_layer(hidden_states)

    loss = None
    if labels is not None:
        lm_logits = lm_logits.to(torch.float32)

        # Shift so that tokens < n predict n
        shift_logits = lm_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Flatten the tokens
        loss_fct = CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        lm_logits = lm_logits.to(hidden_states.dtype)
        loss = loss.to(hidden_states.dtype)

    if not return_dict:
        output = (lm_logits,) + transformer_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=lm_logits,
        past_key_values=transformer_outputs.past_key_values,
        hidden_states=transformer_outputs.hidden_states,
        attentions=transformer_outputs.attentions,
    )

# Enable offline mode

os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Read the training data

with open('./final_train.txt', 'r') as file:
    train_data = file.readlines()

train_texts = []
train_labels = []
invalid_lines_count = 0

for line in train_data:
    if line.strip():
        parts = line.strip().split("\t")
        if len(parts) == 2:
            word, label = parts
            if len(word) == 1 and not word.isalnum():
                train_texts.append(word)
                train_labels.append("O")
            else:
                train_texts.append(word)
                train_labels.append(label)
        else:
            invalid_lines_count += 1

print(f"Number of invalid lines: {invalid_lines_count}")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = AutoTokenizer.from_pretrained(
    '/data/aim_nuist/aim_zhujj/xinjian/glm4_lora/ZhipuAI/glm-4-9b-chat',
    trust_remote_code=True,
)
base_model = AutoModelForCausalLM.from_pretrained(
    '/data/aim_nuist/aim_zhujj/xinjian/glm4_lora/ZhipuAI/glm-4-9b-chat',
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).to(device).eval()

config = PromptEncoderConfig(
    task_type=TaskType.TOKEN_CLS,
    num_virtual_tokens=10,
    encoder_reparameterization_type=PromptEncoderReparameterizationType.MLP,
    encoder_dropout=0.1,
    encoder_num_layers=4,
    encoder_hidden_size=4096,
)
model = get_peft_model(base_model, config)
model.forward = custom_forward.__get__(model)

# Build the fine-tuning task

train_texts = ['''Generate BIO tags for each word in the given paragraph. The BIO format uses the following labels:
• B: Beginning of an entity
• I: Inside of an entity
• O: Outside of an entity
Please extract all chemicals, genes, and diseases mentioned in the paragraph. Provide the output in the format <word> - <tag>, where each word is followed by its corresponding BIO tag.
''' + text for text in train_texts]

train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt", max_length=256)
train_labels_encodings = tokenizer(train_labels, truncation=True, padding=True, return_tensors="pt", max_length=256)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels_encodings):
        self.encodings = encodings
        self.labels_encodings = labels_encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels_encodings['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = Dataset(train_encodings, train_labels_encodings)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
model.to(device)

epochs = 1
train_losses = []
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        max_seq_length = input_ids.shape[1]
        padded_labels = torch.nn.functional.pad(labels, (0, max_seq_length - labels.shape[1]), value=-100).to(device)

        # Debugging outputs
        print(f"Batch size: {input_ids.size(0)}")
        print(f"Max sequence length: {max_seq_length}")
        print(f"Input IDs: {input_ids}")
        print(f"Attention Mask: {attention_mask}")
        print(f"Padded Labels: {padded_labels}")

        # Check for None
        if input_ids is None:
            raise ValueError("input_ids is None")
        if attention_mask is None:
            raise ValueError("attention_mask is None")
        if padded_labels is None:
            raise ValueError("padded_labels is None")

        # Check types
        print(f"Type of input_ids: {type(input_ids)}")
        print(f"Type of attention_mask: {type(attention_mask)}")
        print(f"Type of padded_labels: {type(padded_labels)}")

        # Check shapes
        print(f"Shape of input_ids: {input_ids.shape}")
        print(f"Shape of attention_mask: {attention_mask.shape}")
        print(f"Shape of padded_labels: {padded_labels.shape}")

        # Forward pass
        try:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=padded_labels)
            loss = outputs.loss
            print(f"Loss: {loss.item()}")
        except Exception as e:
            print(f"Error during model forward pass: {e}")
            print(f"input_ids: {input_ids}")
            print(f"attention_mask: {attention_mask}")
            print(f"padded_labels: {padded_labels}")

            # Inspect model layers
            for name, param in model.named_parameters():
                if param is None:
                    print(f"Layer {name} has None as its parameter.")

            # Inspect outputs
            if 'outputs' in locals():
                print(f"Outputs: {outputs}")
                if outputs is not None:
                    print(f"Outputs type: {type(outputs)}")
                    if hasattr(outputs, 'loss'):
                        print(f"Outputs.loss shape: {outputs.loss.shape}")
            else:
                print("Outputs are not defined")

            raise

        for param in model.parameters():
            param.requires_grad = True

        if loss.requires_grad:
            loss.backward()
        else:
            print("Loss does not require gradients")
        optimizer.step()
        epoch_loss += loss.item()
    train_losses.append(epoch_loss / len(train_loader))

plt.plot(np.arange(1, epochs + 1), train_losses, label="Training Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.legend()
plt.savefig("training_loss_curve.png")

torch.save(model.state_dict(), "/data/aim_nuist/aim_zhujj/xinjian/glm_bc2_pt_model.pt")

Expected behavior

Traceback (most recent call last):
  File "/share/home/aim/aim_zhujj/bc2/glm_bc2_pt.py", line 210, in <module>
    loss.backward()
  File "/data/aim_nuist/aim_zhujj/.conda/envs/pytorch/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
    torch.autograd.backward(
  File "/data/aim_nuist/aim_zhujj/.conda/envs/pytorch/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
    _engine_run_backward(
  File "/data/aim_nuist/aim_zhujj/.conda/envs/pytorch/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
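
The traceback says that loss has no grad_fn, i.e. nothing in the graph that produced it requires gradients. In the script above, a likely contributor is that model.forward is rebound to custom_forward, which calls self.transformer(...) directly and so bypasses the PEFT wrapper's own forward, the place where the trainable prompt-encoder embeddings are injected; since the base model's parameters are frozen by the PEFT wrapper, the loss then depends only on frozen weights. A minimal way to check what the wrapper actually trains and whether the loss is connected to it (an illustrative sketch, not part of the original report):

# Diagnostic sketch: assumes `model` is the object returned by get_peft_model(...)
# and `loss` comes from a forward pass that received `labels`.
model.print_trainable_parameters()       # should report the prompt-encoder parameters as trainable

trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(trainable[:5])                     # expect names containing "prompt_encoder"

print(loss.requires_grad, loss.grad_fn)  # both must be non-trivial before calling loss.backward()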

BenjaminBossan commented 3 months ago

See #1974.
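
For reference, the usual p-tuning setup leaves the PEFT model's own forward in place (it is what prepends the trainable virtual tokens) and uses the task type that matches the base model. A rough sketch under those assumptions; the checkpoint path and hyperparameters are placeholders, and this is not necessarily the resolution discussed in #1974:

# Hedged sketch of a standard PromptEncoder (p-tuning) setup for a causal LM.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PromptEncoderConfig, TaskType, get_peft_model

model_path = "/path/to/glm-4-9b-chat"    # placeholder path
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

peft_config = PromptEncoderConfig(
    task_type=TaskType.CAUSAL_LM,        # matches AutoModelForCausalLM (the report used TOKEN_CLS)
    num_virtual_tokens=10,
    encoder_hidden_size=4096,
)
model = get_peft_model(base_model, peft_config)   # keep the wrapper's forward; do not rebind it
model.print_trainable_parameters()
model.train()

optimizer = torch.optim.SGD((p for p in model.parameters() if p.requires_grad), lr=0.01)

# One illustrative step: the wrapper's forward injects the virtual tokens and
# computes the LM loss from `labels`, so gradients reach the prompt encoder.
batch = tokenizer(["example input"], return_tensors="pt")
outputs = model(input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["input_ids"])
outputs.loss.backward()
optimizer.step()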

github-actions[bot] commented 1 month ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.