Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/gradio/routes.py", line 437, in run_predict
output = await app.get_blocks().process_api(
File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1352, in process_api
result = await self.call_function(
File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1093, in call_function
prediction = await utils.async_iteration(iterator)
File "/usr/local/lib/python3.10/dist-packages/gradio/utils.py", line 341, in async_iteration
return await iterator.__anext__()
File "/usr/local/lib/python3.10/dist-packages/gradio/utils.py", line 334, in __anext__
return await anyio.to_thread.run_sync(
File "/home/kula/.local/lib/python3.10/site-packages/anyio/to_thread.py", line 33, in run_sync
return await get_asynclib().run_sync_in_worker_thread(
File "/home/kula/.local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
return await future
File "/home/kula/.local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 807, in run
result = context.run(func, *args)
File "/usr/local/lib/python3.10/dist-packages/gradio/utils.py", line 317, in run_sync_iterator_async
return next(iterator)
File "/srv/chatglm/WebGLM/web_demo.py", line 49, in query
for resp in webglm.stream_query(query):
File "/srv/chatglm/WebGLM/model/modeling_webglm.py", line 35, in stream_query
refs = self.ref_retriever.query(question)
File "/srv/chatglm/WebGLM/model/retriever/__init__.py", line 50, in query
return self.filter.produce_references(question, data_list, 5)
File "/srv/chatglm/WebGLM/model/retriever/filtering/contriver.py", line 82, in produce_references
topk = self.scorer.select_topk(query, texts, topk)
File "/srv/chatglm/WebGLM/model/retriever/filtering/contriver.py", line 69, in select_topk
scores.append(self.score_documents_on_query(query, documents[self.max_batch_size*i:self.max_batch_size*(i+1)]).to('cpu'))
File "/srv/chatglm/WebGLM/model/retriever/filtering/contriver.py", line 58, in score_documents_on_query
query_embedding = self.get_query_embeddings([query])[0]
File "/srv/chatglm/WebGLM/model/retriever/filtering/contriver.py", line 29, in get_query_embeddings
outputs = self.query_encoder(**inputs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_bert.py", line 993, in forward
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 893, in get_extended_attention_mask
extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility
RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
I ran web_demo.py and got this error.
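
For context: `no kernel image is available for execution on the device` usually means the installed PyTorch build does not ship compiled kernels for this GPU's compute capability (for example, a wheel built for older sm_XX architectures running on a newer card). A minimal diagnostic sketch, assuming `torch` is importable in the same environment that runs web_demo.py:

```python
import torch

print("torch:", torch.__version__, "built with CUDA:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    # Compute capability of the GPU, e.g. (8, 6) for sm_86.
    print("device:", torch.cuda.get_device_name(0))
    print("capability:", torch.cuda.get_device_capability(0))
    # Architectures this PyTorch build includes kernels for,
    # e.g. ['sm_37', 'sm_50', 'sm_60', 'sm_70', ...]. If the
    # device's sm_XY is missing from this list, any kernel
    # launch fails with "no kernel image is available".
    print("compiled for:", torch.cuda.get_arch_list())

    # A trivial kernel launch reproduces the error immediately;
    # run with CUDA_LAUNCH_BLOCKING=1 for a synchronous report.
    x = torch.ones(4, device="cuda")
    print((x + x).cpu())
```

If the device's capability is not in the arch list, reinstalling a PyTorch wheel built for a matching CUDA version (or building from source with `TORCH_CUDA_ARCH_LIST` set to the device's architecture) typically resolves it. Running `CUDA_LAUNCH_BLOCKING=1 python web_demo.py` makes the failing call report synchronously, as the error message suggests.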