vgel / repeng

A library for making RepE control vectors
https://vgel.me/posts/representation-engineering/
MIT License

RuntimeError: "addmm_impl_cpu_" not implemented for 'Half' #30

Closed. keviddles closed this issue 3 months ago.

keviddles commented 3 months ago

I'm working through the emotion.ipynb notebook, running on the CPU.

At cell

model.reset() # make sure you always reset the model before training a new vector
control_vector = ControlVector.train(
    model,
    tokenizer,
    dataset,
)

I see:

  0%|          | 0/234 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[7], line 2
      1 model.reset() # make sure you always reset the model before training a new vector
----> 2 control_vector = ControlVector.train(
      3     model,
      4     tokenizer,
      5     dataset,
      6 )

File /notebooks/code/repeng/notebooks/../repeng/extract.py:34, in ControlVector.train(cls, model, tokenizer, dataset, **kwargs)
     26 @classmethod
     27 def train(
     28     cls,
   (...)
     32     **kwargs,
     33 ) -> "ControlVector":
---> 34     dirs = read_representations(
     35         model,
     36         tokenizer,
     37         dataset,
     38         **kwargs,
     39     )
     40     return cls(model_type=model.config.model_type, directions=dirs)

File /notebooks/code/repeng/notebooks/../repeng/extract.py:139, in read_representations(model, tokenizer, inputs, hidden_layers, batch_size)
    136 # the order is [positive, negative, positive, negative, ...]
    137 train_strs = [s for ex in inputs for s in (ex.positive, ex.negative)]
--> 139 layer_hiddens = batched_get_hiddens(
    140     model, tokenizer, train_strs, hidden_layers, batch_size
    141 )
    143 # get differences between (positive, negative) pairs
    144 relative_layer_hiddens = {}

File /notebooks/code/repeng/notebooks/../repeng/extract.py:208, in batched_get_hiddens(model, tokenizer, inputs, hidden_layers, batch_size)
    206 with torch.no_grad():
    207     for batch in tqdm.tqdm(batched_inputs):
--> 208         out = model(
    209             **tokenizer(batch, padding=True, return_tensors="pt").to(model.device),
    210             output_hidden_states=True,
    211         )
    212         for layer in hidden_layers:
    213             # if not indexing from end, account for embedding hiddens
    214             hidden_idx = layer + 1 if layer >= 0 else layer

File /notebooks/code/repeng/notebooks/../repeng/control.py:123, in ControlModel.__call__(self, *args, **kwargs)
    122 def __call__(self, *args, **kwargs):
--> 123     return self.model(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
   1190 # If we don't have any hooks, we want to skip the rest of the logic in
   1191 # this function, and just call forward.
   1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194     return forward_call(*input, **kwargs)
   1195 # Do not call functions when jit is used
   1196 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:1157, in MistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   1154 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
   1156 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1157 outputs = self.model(
   1158     input_ids=input_ids,
   1159     attention_mask=attention_mask,
   1160     position_ids=position_ids,
   1161     past_key_values=past_key_values,
   1162     inputs_embeds=inputs_embeds,
   1163     use_cache=use_cache,
   1164     output_attentions=output_attentions,
   1165     output_hidden_states=output_hidden_states,
   1166     return_dict=return_dict,
   1167 )
   1169 hidden_states = outputs[0]
   1170 logits = self.lm_head(hidden_states)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
   1190 # If we don't have any hooks, we want to skip the rest of the logic in
   1191 # this function, and just call forward.
   1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194     return forward_call(*input, **kwargs)
   1195 # Do not call functions when jit is used
   1196 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:1042, in MistralModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
   1032     layer_outputs = self._gradient_checkpointing_func(
   1033         decoder_layer.__call__,
   1034         hidden_states,
   (...)
   1039         use_cache,
   1040     )
   1041 else:
-> 1042     layer_outputs = decoder_layer(
   1043         hidden_states,
   1044         attention_mask=attention_mask,
   1045         position_ids=position_ids,
   1046         past_key_value=past_key_values,
   1047         output_attentions=output_attentions,
   1048         use_cache=use_cache,
   1049     )
   1051 hidden_states = layer_outputs[0]
   1053 if use_cache:

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
   1190 # If we don't have any hooks, we want to skip the rest of the logic in
   1191 # this function, and just call forward.
   1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194     return forward_call(*input, **kwargs)
   1195 # Do not call functions when jit is used
   1196 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:757, in MistralDecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
    754 hidden_states = self.input_layernorm(hidden_states)
    756 # Self Attention
--> 757 hidden_states, self_attn_weights, present_key_value = self.self_attn(
    758     hidden_states=hidden_states,
    759     attention_mask=attention_mask,
    760     position_ids=position_ids,
    761     past_key_value=past_key_value,
    762     output_attentions=output_attentions,
    763     use_cache=use_cache,
    764 )
    765 hidden_states = residual + hidden_states
    767 # Fully Connected

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
   1190 # If we don't have any hooks, we want to skip the rest of the logic in
   1191 # this function, and just call forward.
   1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194     return forward_call(*input, **kwargs)
   1195 # Do not call functions when jit is used
   1196 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/transformers/models/mistral/modeling_mistral.py:257, in MistralAttention.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
    252     warnings.warn(
    253         "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
    254     )
    255 bsz, q_len, _ = hidden_states.size()
--> 257 query_states = self.q_proj(hidden_states)
    258 key_states = self.k_proj(hidden_states)
    259 value_states = self.v_proj(hidden_states)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
   1190 # If we don't have any hooks, we want to skip the rest of the logic in
   1191 # this function, and just call forward.
   1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194     return forward_call(*input, **kwargs)
   1195 # Do not call functions when jit is used
   1196 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/linear.py:114, in Linear.forward(self, input)
    113 def forward(self, input: Tensor) -> Tensor:
--> 114     return F.linear(input, self.weight, self.bias)

RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'

Some light googling suggests this is related to running on the CPU with float16 weights, but I have no idea where I'd change that.
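
For context, the failing op is the float16 matrix multiply inside the linear layer. A minimal standalone repro (assuming a PyTorch CPU build without Half matmul support; newer builds may not raise) would be something like:

import torch
import torch.nn.functional as F

# CPU tensors in float16 hit the unimplemented addmm kernel
x = torch.randn(1, 8, dtype=torch.float16)
w = torch.randn(4, 8, dtype=torch.float16)
b = torch.randn(4, dtype=torch.float16)
F.linear(x, w, b)  # RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'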

vgel commented 3 months ago

On this line of the notebook:

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

Remove the torch_dtype=torch.float16 part:

model = AutoModelForCausalLM.from_pretrained(model_name)
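
Loading without torch_dtype gives you float32 weights, which the CPU kernels support. Equivalently (a minimal sketch, not taken from this thread), you could request float32 explicitly or cast an already-loaded model:

# ask for float32 up front
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
# or cast a model that was already loaded in half precision
model = model.float()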

However, training on the CPU will most likely be very slow unless you're working with a small model (like GPT-2-small). If you're using a modern model, I'd recommend using a cloud GPU service to train the vector, then exporting it to a .gguf with the export_gguf method and using it locally with a quantized model via llama.cpp (see https://github.com/ggerganov/llama.cpp/pull/5970). Any cloud GPU service like Colab Pro or Runpod will do; I use Runpod personally. Renting a 3090 from them is $0.44/hr, so it shouldn't cost more than a couple of dollars to train as many vectors as you need.
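
For reference, a rough sketch of that workflow (assumptions: the Mistral-7B-Instruct model from the notebook, a dataset built as in the notebook, and the --control-vector flag from the linked llama.cpp PR, which may vary by version):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from repeng import ControlVector, ControlModel

model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # as in the notebook
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0

# float16 is fine here because we're on a GPU
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16
).to("cuda")
model = ControlModel(model, list(range(-5, -18, -1)))

model.reset()
control_vector = ControlVector.train(model, tokenizer, dataset)  # dataset built as in the notebook

# export for local use with llama.cpp
control_vector.export_gguf("emotion_vector.gguf")

# then, locally with a quantized model (flag name from the linked PR):
#   ./main -m mistral-7b-instruct-v0.1.Q4_K_M.gguf \
#          --control-vector emotion_vector.gguf \
#          -p "<s>[INST] What are you feeling? [/INST]"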

keviddles commented 3 months ago

Appreciate the response and the feedback, @vgel! Cheers.