Hi @ujjawal-ti ,
Could you specify which teleprompter you are using to compile, and also provide the full stack trace?
Hey @arnavsinghvi11 ,
I tried BootstrapFewShot and BootstrapFinetune (target model: t5-large).
teleprompter = BootstrapFinetune(metric=metric_EM)
Please find the full stack trace below,
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[20], line 56
54 #teleprompter = BootstrapFewShot(metric=metric_EM, max_bootstrapped_demos=2)
55 teleprompter = BootstrapFinetune(metric=metric_EM)
---> 56 cot_compiled = teleprompter.compile(CoT(), trainset=train_dspy_data)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dspy/teleprompt/finetune.py:77, in BootstrapFinetune.compile(self, student, teacher, trainset, valset, target, bsize, accumsteps, lr, epochs, bf16, int8, peft, path_prefix)
73 finetune_data = {}
75 for teacher in teachers:
76 # Dummy compilation to get bootstraps.
---> 77 compiled = self.teleprompter.compile(student, teacher=teacher, trainset=trainset)
78 multitask = self.multitask
80 # Prepare finetune <prompt, completion> pairs.
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dspy/teleprompt/bootstrap.py:55, in BootstrapFewShot.compile(self, student, teacher, trainset, valset)
53 self._prepare_student_and_teacher(student, teacher)
54 self._prepare_predictor_mappings()
---> 55 self._bootstrap()
57 self.student = self._train()
58 self.student._compiled = True
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dspy/teleprompt/bootstrap.py:112, in BootstrapFewShot._bootstrap(self, max_bootstraps)
109 break
111 if example_idx not in bootstrapped:
--> 112 success = self._bootstrap_one_example(example, round_idx)
114 if success:
115 bootstrapped[example_idx] = True
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dspy/teleprompt/bootstrap.py:167, in BootstrapFewShot._bootstrap_one_example(self, example, round_idx)
165 current_error_count = self.error_count
166 if current_error_count >= self.max_errors:
--> 167 raise e
168 print(f'Failed to run or to evaluate example {example} with {self.metric} due to {e}.')
170 if success:
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dspy/teleprompt/bootstrap.py:146, in BootstrapFewShot._bootstrap_one_example(self, example, round_idx)
143 predictor_cache[name] = predictor.demos
144 predictor.demos = [x for x in predictor.demos if x != example]
--> 146 prediction = teacher(**example.inputs())
147 trace = dsp.settings.trace
149 for name, predictor in teacher.named_predictors():
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dspy/primitives/program.py:29, in Module.__call__(self, *args, **kwargs)
28 def __call__(self, *args, **kwargs):
---> 29 return self.forward(*args, **kwargs)
Cell In[20], line 51, in CoT.forward(self, text)
50 def forward(self, text):
---> 51 return self.generate_answer(text=text)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dspy/predict/predict.py:49, in Predict.__call__(self, **kwargs)
48 def __call__(self, **kwargs):
---> 49 return self.forward(**kwargs)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dspy/predict/chain_of_thought.py:59, in ChainOfThought.forward(self, **kwargs)
57 signature = new_signature
58 # template = dsp.Template(self.signature.instructions, **new_signature)
---> 59 return super().forward(signature=signature, **kwargs)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dspy/predict/predict.py:90, in Predict.forward(self, **kwargs)
87 template = signature_to_template(signature)
89 if self.lm is None:
---> 90 x, C = dsp.generate(template, **config)(x, stage=self.stage)
91 else:
92 # Note: query_only=True means the instructions and examples are not included.
93 # I'm not really sure why we'd want to do that, but it's there.
94 with dsp.settings.context(lm=self.lm, query_only=True):
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dsp/primitives/predict.py:78, in _generate.<locals>.do_generate(example, stage, max_depth, original_example)
76 # Generate and extract the fields.
77 prompt = template(example)
---> 78 completions: list[dict[str, Any]] = generator(prompt, **kwargs)
79 completions: list[Example] = [template.extract(example, p) for p in completions]
81 # Find the completions that are most complete.
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dsp/modules/hf.py:144, in HFModel.__call__(self, prompt, only_completed, return_sorted, **kwargs)
141 if kwargs.get("n", 1) > 1 or kwargs.get("temperature", 0.0) > 0.1:
142 kwargs["do_sample"] = True
--> 144 response = self.request(prompt, **kwargs)
145 return [c["text"] for c in response["choices"]]
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dsp/modules/lm.py:26, in LM.request(self, prompt, **kwargs)
25 def request(self, prompt, **kwargs):
---> 26 return self.basic_request(prompt, **kwargs)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dsp/modules/hf.py:98, in HFModel.basic_request(self, prompt, **kwargs)
96 raw_kwargs = kwargs
97 kwargs = {**self.kwargs, **kwargs}
---> 98 response = self._generate(prompt, **kwargs)
100 history = {
101 "prompt": prompt,
102 "response": response,
103 "kwargs": kwargs,
104 "raw_kwargs": raw_kwargs,
105 }
106 self.history.append(history)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/dsp/modules/hf.py:123, in HFModel._generate(self, prompt, **kwargs)
120 inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
122 # print(kwargs)
--> 123 outputs = self.model.generate(**inputs, **kwargs)
124 if self.drop_prompt_from_output:
125 input_length = inputs.input_ids.shape[1]
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/transformers/generation/utils.py:1549, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1531 result = self.assisted_decoding(
1532 input_ids,
1533 candidate_generator=candidate_generator,
(...)
1545 **model_kwargs,
1546 )
1547 if generation_mode == GenerationMode.GREEDY_SEARCH:
1548 # 11. run greedy search
-> 1549 result = self.greedy_search(
1550 input_ids,
1551 logits_processor=prepared_logits_processor,
1552 stopping_criteria=prepared_stopping_criteria,
1553 pad_token_id=generation_config.pad_token_id,
1554 eos_token_id=generation_config.eos_token_id,
1555 output_scores=generation_config.output_scores,
1556 output_logits=generation_config.output_logits,
1557 return_dict_in_generate=generation_config.return_dict_in_generate,
1558 synced_gpus=synced_gpus,
1559 streamer=streamer,
1560 **model_kwargs,
1561 )
1563 elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH:
1564 if not model_kwargs["use_cache"]:
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/transformers/generation/utils.py:2418, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, output_logits, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2415 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2417 # forward pass to get next token
-> 2418 outputs = self(
2419 **model_inputs,
2420 return_dict=True,
2421 output_attentions=output_attentions,
2422 output_hidden_states=output_hidden_states,
2423 )
2425 if synced_gpus and this_peer_finished:
2426 continue # don't waste resources running the code we don't need
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
163 output = module._old_forward(*args, **kwargs)
164 else:
--> 165 output = module._old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:1157, in MistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1154 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1156 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1157 outputs = self.model(
1158 input_ids=input_ids,
1159 attention_mask=attention_mask,
1160 position_ids=position_ids,
1161 past_key_values=past_key_values,
1162 inputs_embeds=inputs_embeds,
1163 use_cache=use_cache,
1164 output_attentions=output_attentions,
1165 output_hidden_states=output_hidden_states,
1166 return_dict=return_dict,
1167 )
1169 hidden_states = outputs[0]
1170 logits = self.lm_head(hidden_states)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:1042, in MistralModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
1032 layer_outputs = self._gradient_checkpointing_func(
1033 decoder_layer.__call__,
1034 hidden_states,
(...)
1039 use_cache,
1040 )
1041 else:
-> 1042 layer_outputs = decoder_layer(
1043 hidden_states,
1044 attention_mask=attention_mask,
1045 position_ids=position_ids,
1046 past_key_value=past_key_values,
1047 output_attentions=output_attentions,
1048 use_cache=use_cache,
1049 )
1051 hidden_states = layer_outputs[0]
1053 if use_cache:
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
163 output = module._old_forward(*args, **kwargs)
164 else:
--> 165 output = module._old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:757, in MistralDecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
754 hidden_states = self.input_layernorm(hidden_states)
756 # Self Attention
--> 757 hidden_states, self_attn_weights, present_key_value = self.self_attn(
758 hidden_states=hidden_states,
759 attention_mask=attention_mask,
760 position_ids=position_ids,
761 past_key_value=past_key_value,
762 output_attentions=output_attentions,
763 use_cache=use_cache,
764 )
765 hidden_states = residual + hidden_states
767 # Fully Connected
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
163 output = module._old_forward(*args, **kwargs)
164 else:
--> 165 output = module._old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:257, in MistralAttention.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, **kwargs)
252 warnings.warn(
253 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
254 )
255 bsz, q_len, _ = hidden_states.size()
--> 257 query_states = self.q_proj(hidden_states)
258 key_states = self.k_proj(hidden_states)
259 value_states = self.v_proj(hidden_states)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
163 output = module._old_forward(*args, **kwargs)
164 else:
--> 165 output = module._old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)
File /home/shared/.miniconda3/envs/py10cuda117/lib/python3.10/site-packages/torch/nn/modules/linear.py:114, in Linear.forward(self, input)
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`
Please suggest what I should try in my case.
This seems unrelated to DSPy and more likely stems from a HuggingFace model-loading incompatibility, but I would point you toward TGI, since HFModel is less user-friendly for debugging these errors.
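For reference, a minimal sketch of pointing DSPy at a TGI endpoint instead of HFModel; the model name, host, and port below are placeholders, not values from this thread:

# Hypothetical sketch: connect DSPy to a running text-generation-inference (TGI) server.
# Model name, host, and port are placeholders.
import dspy

tgi_lm = dspy.HFClientTGI(model="mistralai/Mistral-7B-Instruct-v0.2", port=8080, url="http://localhost")
dspy.settings.configure(lm=tgi_lm)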
Okay @arnavsinghvi11, I'm skipping HFModel for now. I'll use an LM served from a vLLM server instead.
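A minimal sketch of what that client-side setup can look like with DSPy's vLLM client, assuming a vLLM server is already running locally; the model name, host, and port are placeholders:

# Hypothetical sketch: point DSPy at a local vLLM server instead of HFModel.
# Model name, host, and port are placeholders.
import dspy

vllm_lm = dspy.HFClientVLLM(model="mistralai/Mistral-7B-Instruct-v0.2", port=8000, url="http://localhost")
dspy.settings.configure(lm=vllm_lm)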
Hey, I'm trying to compile a dsp program where large text samples are used in the trainset. The trainset contains 8-10 examples per category, and the text samples range from roughly 50 to 2,500 words. I used the line of code below to compile the model, but I'm getting CUDA_OUT_OF_MEMORY.
cot_compiled = teleprompter.compile(CoT(), trainset=train_dspy_data)
Any idea how to resolve this? Any suggestion on how to use multiple GPUs to get around it?
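For what it's worth, vLLM itself can shard a served model across GPUs via tensor parallelism, which is one common way to relieve out-of-memory errors when serving long prompts; a hedged sketch of the server launch, with the model name, GPU count, and port as placeholders rather than values from this thread:

# Hypothetical launch: shard the served model across 2 GPUs with tensor parallelism.
python -m vllm.entrypoints.api_server --model mistralai/Mistral-7B-Instruct-v0.2 --tensor-parallel-size 2 --port 8000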