Traceback (most recent call last):
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/pred.py", line 327, in <module>
preds = get_pred(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/pred.py", line 260, in get_pred
output = searcher.generate(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/utils/greedy_search.py", line 32, in generate
result = self._decode(input_ids, **kwargs)
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/utils/greedy_search.py", line 54, in _decode
out = self.model(
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 1162, in forward
outputs = self.model(
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/utils/patch.py", line 102, in model_forward
layer_outputs = decoder_layer(
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 757, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/utils/patch.py", line 16, in hf_forward
ret = forward(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/inf_llm.py", line 64, in forward
o = past_key_value.append(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 774, in append
chunk_o, local_score = self._append(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 520, in _append
global_h_k, global_h_v, global_sliding_window, global_block_map, global_block_num = self.get_global_hidden_and_mask(local_h_q.size(-2), block_topk)
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 419, in get_global_hidden_and_mask
self.global_blocks[u][b_idx].load((global_h_k[u, :, st:ed, :], global_h_v[u, :, st:ed, :]))
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 76, in load
gpu_data, gpu_data_id = self.cache.alloc()
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 19, in alloc
assert len(self.idle_set) > 0
AssertionError
Traceback (most recent call last):
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/pred.py", line 327, in <module>
preds = get_pred(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/pred.py", line 260, in get_pred
output = searcher.generate(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/utils/greedy_search.py", line 32, in generate
result = self._decode(input_ids, **kwargs)
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/utils/greedy_search.py", line 54, in _decode
out = self.model(
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 1162, in forward
outputs = self.model(
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/utils/patch.py", line 102, in model_forward
layer_outputs = decoder_layer(
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 757, in forward
hidden_states, self_attn_weights, present_key_value = self.self_attn(
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/llm/miniconda3/envs/llm_infer/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/utils/patch.py", line 16, in hf_forward
ret = forward(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/inf_llm.py", line 64, in forward
o = past_key_value.append(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 774, in append
chunk_o, local_score = self._append(
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 520, in _append
global_h_k, global_h_v, global_sliding_window, global_block_map, global_block_num = self.get_global_hidden_and_mask(local_h_q.size(-2), block_topk)
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 419, in get_global_hidden_and_mask
self.global_blocks[u][b_idx].load((global_h_k[u, :, st:ed, :], global_h_v[u, :, st:ed, :]))
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 76, in load
gpu_data, gpu_data_id = self.cache.alloc()
File "/dataset-vlm/jingyaoli/LLMInfer/InfLLM/benchmark/inf_llm/attention/context_manager.py", line 19, in alloc
assert len(self.idle_set) > 0
AssertionError