Open Blacksuan19 opened 4 months ago
Hi, I see the issue is from awq:
File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 48, in reshape_for_broadcast
assert freqs_cis.shape == (x.shape[1], x.shape[-1])
It's likely a bug in awq, perhaps when combined with attention sinks, flash attention, or model compilation. While we expose those options from transformers, I cannot be sure arbitrary combinations work.
If I run:
python generate.py \
--enable_heap_analytics=False \
--document_choice_in_sidebar=True \
--actions_in_sidebar=True \
--openai_server=False \
--use_gpu_id=False \
--score_model=None \
--prompt_type=open_chat \
--base_model=TheBloke/openchat_3.5-16k-AWQ \
--compile_model=True \
--use_cache=True \
--use_flash_attention_2=True \
--attention_sinks=True \
--sink_dict="{'num_sink_tokens': 4, 'window_length': 16384 }" \
--use_llm_if_no_docs=True \
--max_seq_len=16384 \
--enable_ocr=True
I don't have a generic issue running. I removed things that shouldn't be relevant to the awq issue.
However, when I upload some text and then ask a question, I get the same issue:
/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The function `__call__` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.
warn_deprecated(
thread exception: Traceback (most recent call last):
File "/home/jon/h2ogpt/src/utils.py", line 502, in run
self._return = self._target(*self._args, **self._kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/_api/deprecation.py", line 145, in warning_emitting_wrapper
return wrapped(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 378, in __call__
return self.invoke(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 163, in invoke
raise e
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 153, in invoke
self._call(inputs, run_manager=run_manager)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/combine_documents/base.py", line 137, in _call
output, extra_return_dict = self.combine_docs(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/combine_documents/stuff.py", line 244, in combine_docs
return self.llm_chain.predict(callbacks=callbacks, **inputs), {}
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/llm.py", line 293, in predict
return self(kwargs, callbacks=callbacks)[self.output_key]
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/_api/deprecation.py", line 145, in warning_emitting_wrapper
return wrapped(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 378, in __call__
return self.invoke(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 163, in invoke
raise e
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 153, in invoke
self._call(inputs, run_manager=run_manager)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/llm.py", line 103, in _call
response = self.generate([inputs], run_manager=run_manager)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/llm.py", line 115, in generate
return self.llm.generate_prompt(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 633, in generate_prompt
return self.generate(prompt_strings, stop=stop, callbacks=callbacks, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 803, in generate
output = self._generate_helper(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 670, in _generate_helper
raise e
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 657, in _generate_helper
self._generate(
File "/home/jon/h2ogpt/src/gpt_langchain.py", line 2339, in _generate
rets = super()._generate(prompts, stop=stop, run_manager=run_manager, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/langchain_community/llms/huggingface_pipeline.py", line 267, in _generate
responses = self.pipeline(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/text_generation.py", line 240, in __call__
return super().__call__(text_inputs, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1223, in __call__
outputs = list(final_iterator)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/pt_utils.py", line 124, in __next__
item = next(self.iterator)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/pt_utils.py", line 125, in __next__
processed = self.infer(item, **self.params)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1149, in forward
model_outputs = self._forward(model_inputs, **forward_params)
File "/home/jon/h2ogpt/src/h2oai_pipeline.py", line 271, in _forward
return self.__forward(model_inputs, **generate_kwargs)
File "/home/jon/h2ogpt/src/h2oai_pipeline.py", line 309, in __forward
generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/transformers/generation/utils.py", line 1576, in generate
result = self._greedy_search(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/transformers/generation/utils.py", line 2494, in _greedy_search
outputs = self(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 1158, in forward
outputs = self.model(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/model.py", line 119, in forward
h, _, past_key_value = layer(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/block.py", line 113, in forward
attn_output, _, past_key_value = self.attn.forward(
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 210, in forward
xq, xk = self.rope.forward(xq, xk, self.start_pos, seqlen)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 62, in forward
freqs_cis = self.reshape_for_broadcast(freqs_cis, xq_).to(xq_.device)
File "/home/jon/miniconda3/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 50, in reshape_for_broadcast
assert freqs_cis.shape == (x.shape[1], x.shape[-1])
AssertionError
This does the same thing:
python generate.py \
--enable_heap_analytics=False \
--document_choice_in_sidebar=True \
--actions_in_sidebar=True \
--openai_server=False \
--use_gpu_id=False \
--score_model=None \
--prompt_type=open_chat \
--base_model=TheBloke/openchat_3.5-16k-AWQ \
--attention_sinks=True \
--sink_dict="{'num_sink_tokens': 4, 'window_length': 16384 }" \
--use_llm_if_no_docs=True \
--max_seq_len=16384 \
--enable_ocr=True
As does this:
python generate.py \
--enable_heap_analytics=False \
--document_choice_in_sidebar=True \
--actions_in_sidebar=True \
--openai_server=False \
--use_gpu_id=False \
--score_model=None \
--prompt_type=open_chat \
--base_model=TheBloke/openchat_3.5-16k-AWQ \
--use_llm_if_no_docs=True \
--max_seq_len=16384 \
--enable_ocr=True
So it seems to be a pure awq issue.
The latest 0.2.5 does the same thing. Reducing to (say) 15000 does the same thing.
A small script does the same thing, so it's not related to h2oGPT itself.
I'm getting a similar error with LLAMA-3 GGUF as well (same model mentioned in the FAQ), it only includes the evaluate_nochat exception from the log above.
It should fail and require you to pass
--max_seq_len=8192
as well.
e.g.
python generate.py --openai_server=False --score_model=None --base_model=llama --model_path_llama=https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf?download=true --tokenizer_base_model=meta-llama/Meta-Llama-3-8B-Instruct --use_llm_if_no_docs=True --max_seq_len=8192
gives:
I don't see the error you see. And when I debug the code with the above command on latest h2oGPT, I see that chat_template is True.
Perhaps you are using older docker image or older h2oGPT or something?
there was a missing slash in the command after --model_path_llama
, fixed now, however I'm still unable to run LLAMA-3 with the docker image due to it being a gated model, I tried passing --use_auth_token=hf_xxx
and setting environment variables HUGGING_FACE_HUB_TOKEN
and HF_TOKEN
and HUGGINGFACE_TOKEN
but still cannot access.
This occurs on both the latest and previous docker images, with tags c25144e9
and 4059a2c9
Can you share a stack trace of where it's failing?
here is the full stack trace, I'm authenticated on the huggingface-cli, have exported HUGGING_FACE_HUB_TOKEN
in the environment and passing --use_auth_token
to docker run
The error is not raised while starting the container; it is only raised after sending a query.
I see. But if you pass the env HUGGING_FACE_HUB_TOKEN
through it should still work here.
e.g. the docker line would add:
-e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
But, I made some changes for that particular piece of code.
I can confirm passing the environment variable to the docker image with -e
or passing --use_auth_token
also works on the latest image.
When using the docker image, I randomly get assertion errors when making a request from the gradio UI — sometimes it works and sometimes it does not. Here is the raised error.
this occurs with the latest two docker images tagged
4059a2c9
and 7297519c.
Full error
```python thread exception: Traceback (most recent call last): File "/workspace/src/utils.py", line 502, in run self._return = self._target(*self._args, **self._kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/_api/deprecation.py", line 145, in warning_emitting_wrapper return wrapped(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 378, in __call__ return self.invoke( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 163, in invoke raise e File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 153, in invoke self._call(inputs, run_manager=run_manager) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/combine_documents/base.py", line 137, in _call output, extra_return_dict = self.combine_docs( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/combine_documents/stuff.py", line 244, in combine_docs return self.llm_chain.predict(callbacks=callbacks, **inputs), {} File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/llm.py", line 293, in predict return self(kwargs, callbacks=callbacks)[self.output_key] File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/_api/deprecation.py", line 145, in warning_emitting_wrapper return wrapped(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 378, in __call__ return self.invoke( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 163, in invoke raise e File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 153, in invoke self._call(inputs, run_manager=run_manager) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/llm.py", line 103, in _call response = self.generate([inputs], 
run_manager=run_manager) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/llm.py", line 115, in generate return self.llm.generate_prompt( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 633, in generate_prompt return self.generate(prompt_strings, stop=stop, callbacks=callbacks, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 803, in generate output = self._generate_helper( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 670, in _generate_helper raise e File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 657, in _generate_helper self._generate( File "/workspace/src/gpt_langchain.py", line 2339, in _generate rets = super()._generate(prompts, stop=stop, run_manager=run_manager, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_community/llms/huggingface_pipeline.py", line 267, in _generate responses = self.pipeline( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/text_generation.py", line 240, in __call__ return super().__call__(text_inputs, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1223, in __call__ outputs = list(final_iterator) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/pt_utils.py", line 124, in __next__ item = next(self.iterator) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/pt_utils.py", line 125, in __next__ processed = self.infer(item, **self.params) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1149, in forward model_outputs = self._forward(model_inputs, **forward_params) File "/workspace/src/h2oai_pipeline.py", line 271, in 
_forward return self.__forward(model_inputs, **generate_kwargs) File "/workspace/src/h2oai_pipeline.py", line 309, in __forward generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/generation/utils.py", line 1576, in generate result = self._greedy_search( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/generation/utils.py", line 2494, in _greedy_search outputs = self( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 1158, in forward outputs = self.model( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/model.py", line 127, in forward h, _, _ = layer( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 
1520, in _call_impl return forward_call(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/block.py", line 123, in forward attn_output, _, past_key_value = self.attn.forward( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 235, in forward xq, xk = self.rope.forward(xq, xk, self.start_pos, seqlen) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 60, in forward freqs_cis = self.reshape_for_broadcast(freqs_cis, xq_).to(xq_.device) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 48, in reshape_for_broadcast assert freqs_cis.shape == (x.shape[1], x.shape[-1]) AssertionError make stop: Traceback (most recent call last): File "/workspace/src/utils.py", line 502, in run self._return = self._target(*self._args, **self._kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/_api/deprecation.py", line 145, in warning_emitting_wrapper return wrapped(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 378, in __call__ return self.invoke( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 163, in invoke raise e File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 153, in invoke self._call(inputs, run_manager=run_manager) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/combine_documents/base.py", line 137, in _call output, extra_return_dict = self.combine_docs( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/combine_documents/stuff.py", line 244, in combine_docs return self.llm_chain.predict(callbacks=callbacks, **inputs), {} File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/llm.py", line 293, in predict return self(kwargs, 
callbacks=callbacks)[self.output_key] File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/_api/deprecation.py", line 145, in warning_emitting_wrapper return wrapped(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 378, in __call__ return self.invoke( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 163, in invoke raise e File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/base.py", line 153, in invoke self._call(inputs, run_manager=run_manager) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/llm.py", line 103, in _call response = self.generate([inputs], run_manager=run_manager) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain/chains/llm.py", line 115, in generate return self.llm.generate_prompt( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 633, in generate_prompt return self.generate(prompt_strings, stop=stop, callbacks=callbacks, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 803, in generate output = self._generate_helper( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 670, in _generate_helper raise e File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_core/language_models/llms.py", line 657, in _generate_helper self._generate( File "/workspace/src/gpt_langchain.py", line 2339, in _generate rets = super()._generate(prompts, stop=stop, run_manager=run_manager, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/langchain_community/llms/huggingface_pipeline.py", line 267, in _generate responses = self.pipeline( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/text_generation.py", line 240, in __call__ 
return super().__call__(text_inputs, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1223, in __call__ outputs = list(final_iterator) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/pt_utils.py", line 124, in __next__ item = next(self.iterator) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/pt_utils.py", line 125, in __next__ processed = self.infer(item, **self.params) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1149, in forward model_outputs = self._forward(model_inputs, **forward_params) File "/workspace/src/h2oai_pipeline.py", line 271, in _forward return self.__forward(model_inputs, **generate_kwargs) File "/workspace/src/h2oai_pipeline.py", line 309, in __forward generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/generation/utils.py", line 1576, in generate result = self._greedy_search( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/generation/utils.py", line 2494, in _greedy_search outputs = self( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py", line 1158, in forward outputs = self.model( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in 
_wrapped_call_impl return self._call_impl(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/model.py", line 127, in forward h, _, _ = layer( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl return forward_call(*args, **kwargs) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/block.py", line 123, in forward attn_output, _, past_key_value = self.attn.forward( File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 235, in forward xq, xk = self.rope.forward(xq, xk, self.start_pos, seqlen) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 60, in forward freqs_cis = self.reshape_for_broadcast(freqs_cis, xq_).to(xq_.device) File "/h2ogpt_conda/envs/h2ogpt/lib/python3.10/site-packages/awq/modules/fused/attn.py", line 48, in reshape_for_broadcast assert freqs_cis.shape == (x.shape[1], x.shape[-1]) AssertionError hit stop evaluate_nochat exception: : ('', '', '', True, 'open_chat', "{ 'PreInput': None,\n 'PreInstruct': 'GPT4 User: ',\n 'PreResponse': 'GPT4 Assistant:',\n 'botstr': 'GPT4 Assistant:',\n 'can_handle_system_prompt': False,\n 'chat_sep': '<|end_of_turn|>',\n 'chat_turn_sep': '<|end_of_turn|>',\n 'generates_leading_space': False,\n 'humanstr': 'GPT4 User: ',\n 'promptA': '',\n 'promptB': '',\n 'system_prompt': '',\n 'terminate_response ': ['GPT4 Assistant:', '<|end_of_turn|>']}", 0, 1, 1, 
0, 1, 1024, 0, False, 600, 1.07, 1, False, 0.0, True, '', '', 'UserData', True, 'Query', [], 10, True, 512, 'Relevant', ['/workspace/user_path/9b999f43-2ade-4148-97cf-d2448125168c/r es/e6a9ce98_user_upload_protocols.pdf'], [], [], [], [], 'Pay attention and remember the information below, which will help to answer the question or imperative after the context ends.', 'According to only the information in the docume nt sources provided within the context above, write an insightful and well-structured response to: ', 'In order to write a concise single-paragraph or bulleted list summary, pay attention to the following text.', 'Using only the inform ation in the document sources above, write a condensed and concise summary of key results (preferably as about 10 bullet points).', 'Answer this question with vibrant details in order for some NLP embedding model to use that answer as better query than original question: ', 'Who are you and what do you do?', 'Ensure your entire response is outputted as a single piece of strict valid JSON text.', 'Ensure your response is strictly valid JSON text.', 'Ensure your entir e response is outputted as strict valid JSON text inside a Markdown code block with the json language identifier. 
Ensure all JSON keys are less than 64 characters, and ensure JSON key names are made of only alphanumerics, underscores , or hyphens.', 'Ensure you follow this JSON schema:\n```json\n{properties_schema}\n```', 'auto', ['OCR', 'DocTR', 'Caption', 'ASR'], ['PyPDF'], ['Unstructured'], '.[]', 10, 'auto', [], [], '', False, '[]', '[]', 'best_near_prompt', 51 2, -1.0, -1.0, 'split_or_merge', '\n\n', 0, 'auto', False, False, '[]', 'None', None, [], 1.0, None, None, 'text', '', '', '', '', {'model': 'model', 'tokenizer': 'tokenizer', 'device': 'cuda', 'base_model': 'TheBloke/openchat_3.5-16k- AWQ', 'tokenizer_base_model': '', 'lora_weights': '[]', 'inference_server': '[]', 'prompt_type': 'open_chat', 'prompt_dict': {'promptA': '', 'promptB': '', 'PreInstruct': 'GPT4 User: ', 'PreInput': None, 'PreResponse': 'GPT4 Assistant: ', 'terminate_response': ['GPT4 Assistant:', '<|end_of_turn|>'], 'chat_sep': '<|end_of_turn|>', 'chat_turn_sep': '<|end_of_turn|>', 'humanstr': 'GPT4 User: ', 'botstr': 'GPT4 Assistant:', 'generates_leading_space': False, 'system_promp t': '', 'can_handle_system_prompt': False}, 'visible_models': 0, 'h2ogpt_key': None}, {'MyData': [None, '90711427-650d-458e-ac69-bc1629b452be', 'test']}, {'langchain_modes': ['Disabled', 'LLM', 'UserData'], 'langchain_mode_paths': {'Us erData': '/workspace/user_path/'}, 'langchain_mode_types': {'UserData': 'shared', 'github h2oGPT': 'shared', 'DriverlessAI docs': 'shared', 'wiki': 'shared', 'wiki_full': '', 'LLM': 'personal', 'Disabled': 'personal'}}, {'headers': '', 'host': '0.0.0.0:7850', 'username': 'test', 'connection': 'Upgrade', 'pragma': 'no-cache', 'cache-control': 'no-cache', 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/ 537.36 Edg/124.0.0.0', 'upgrade': 'websocket', 'origin': 'http://0.0.0.0:7850', 'sec-websocket-version': '13', 'accept-encoding': 'gzip, deflate', 'accept-language': 'en-US,en;q=0.9,ar;q=0.8', 'cookie': 'access-token-unsecure-hhN8p 
y5JLVRfL-0OTPND8TGcb3qhs2GvSJQ8qV1LI50=vrLRNuXKqoKCZDSCqo1OHg; access-token-unsecure-s-dRx26Pws-xf2TfvaYIjqwWsGjiH9960S06PrlT6tg=AnrezJi1hR1NjfFx29n_bg; access-token-unsecure-SF0CZ7POfi6Imk0jDfN44qO9W9VB0hu3nUcGevVPMYw=SU1SQYZL79hpAN43 hEDgIQ; access-token-unsecure-9LIDZewsE4If1yY7ixHa-yOZJO20M-PQVSDjJtfYQYA=o8YMAhHGtoLQDjMVZVITsQ; access-token-unsecure-qS0zsQdPdQYJsrMX4RXh3HQwEDeknaNz0RppngdPvGY=AGmuVQm8_KVKkMg8HdQtqg; access-token-unsecure--qfFGcbj-JQc0O0MamjIfNGlf gUrb6t7xyB3hRUL1I8=NVbKjP5O7Q3xJxHYvaiUfw; access-token-unsecure-YeY4iDfE2-hlA1izGtL7vBNbLbCosRLpSAJFo-j6_e0=xkWJTIiCTZGbhG1H60OTBg; access-token-unsecure-BwVTmtTwIzOYqtTpvsZkHQvnjr8N60WJaX_V6njwUAw=8uPW51j557W7S8ZO_e5iSQ', 'sec-websoc ket-key': 'CS4lXFJi7AM2jwkdWyhKyQ==', 'sec-websocket-extensions': 'permessage-deflate; client_max_window_bits', 'host2': '14.1.206.49', 'picture': 'None'}, {}, [['summarize the given document', '']]) ```Docker command
used command to run h2ogpt ```bash export CONTEXT_LENGTH=16384 export IMAGE_TAG=7297519c docker run \ --init \ --gpus all \ --runtime=nvidia \ --shm-size=2g \ -p 7850:7860 \ -v /etc/passwd:/etc/passwd:ro \ -v /etc/group:/etc/group:ro \ -u $(id -u):$(id -g) \ gcr.io/vorvan/h2oai/h2ogpt-runtime:$IMAGE_TAG /workspace/generate.py \ --page_title="GenNet AI" \ --favicon_path="/workspace/assets/gennet_logo.svg" \ --height=700 \ --gradio_size="medium" \ --enable_heap_analytics=False \ --document_choice_in_sidebar=True \ --actions_in_sidebar=True \ --openai_server=False \ --use_gpu_id=False \ --score_model=None \ --prompt_type=open_chat \ --base_model=TheBloke/openchat_3.5-16k-AWQ \ --compile_model=True \ --use_cache=True \ --use_flash_attention_2=True \ --attention_sinks=True \ --sink_dict="{'num_sink_tokens': 4, 'window_length': $CONTEXT_LENGTH }" \ --save_dir='/workspace/save/' \ --user_path='/workspace/user_path/' \ --langchain_mode="UserData" \ --langchain_modes="['UserData', 'LLM']" \ --visible_langchain_actions="['Query']" \ --visible_langchain_agents="[]" \ --use_llm_if_no_docs=True \ --max_seq_len=$CONTEXT_LENGTH \ --enable_ocr=True \ --enable_tts=False \ --enable_stt=False ```