No compiled kernel found.

Is there an existing issue for this?

[X] I have searched the existing issues

Current Behavior

D:\Anaconda\envs\langchain\python.exe E:/langchain-ChatGLM-master/cli_demo.py D:\Anaconda\envs\langchain\lib\site-packages\numpy_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs: D:\Anaconda\envs\langchain\lib\site-packages\numpy.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll D:\Anaconda\envs\langchain\lib\site-packages\numpy.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll warnings.warn("loaded more than 1 DLL from .libs:" INFO 2023-07-04 20:10:16,665-1d: loading model config llm device: cuda embedding device: cuda dir: E:\langchain-ChatGLM-master flagging username: 3f1fea213a014dfeab250a87ae629172

INFO 2023-07-04 20:10:17,661-1d: Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. INFO 2023-07-04 20:10:17,661-1d: NumExpr defaulting to 8 threads. D:\Anaconda\envs\langchain\lib\site-packages\pkg_resources__init.py:121: DeprecationWarning: pkg_resources is deprecated as an API warnings.warn("pkg_resources is deprecated as an API", DeprecationWarning) D:\Anaconda\envs\langchain\lib\site-packages\pkg_resources__init__.py:2870: DeprecationWarning: Deprecated call to pkg_resources.declare_namespace('mpl_toolkits'). Implementing implicit namespace packages (as specified in PEP 420) is preferred to pkg_resources.declare_namespace. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages declare_namespace(pkg) D:\Anaconda\envs\langchain\lib\site-packages\pkg_resources\init.py:2870: DeprecationWarning: Deprecated call to pkg_resources.declare_namespace('google'). Implementing implicit namespace packages (as specified in PEP 420) is preferred to pkg_resources.declare_namespace. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages declare_namespace(pkg) D:\Anaconda\envs\langchain\lib\site-packages\pkg_resources\init__.py:2870: DeprecationWarning: Deprecated call to pkg_resources.declare_namespace('zope'). Implementing implicit namespace packages (as specified in PEP 420) is preferred to pkg_resources.declare_namespace. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages declare_namespace(pkg)

Expected Behavior

No response

Steps To Reproduce

Loading E:\langchain-ChatGLM-master\GLM-int4... num_gpus 1 No compiled kernel found. Compiling kernels : C:\Users\11025.cache\huggingface\modules\transformers_modules\GLM-int4\quantization_kernels_parallel.c Compiling gcc -O3 -fPIC -pthread -fopenmp -std=c99 C:\Users\11025.cache\huggingface\modules\transformers_modules\GLM-int4\quantization_kernels_parallel.c -shared -o C:\Users\11025.cache\huggingface\modules\transformers_modules\GLM-int4\quantization_kernels_parallel.so --- Logging error --- Traceback (most recent call last): File "C:\Users\11025/.cache\huggingface\modules\transformers_modules\GLM-int4\quantization.py", line 19, in from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up File "D:\Anaconda\envs\langchain\lib\site-packages\cpm_kernels__init.py", line 1, in from . import library File "D:\Anaconda\envs\langchain\lib\site-packages\cpm_kernels\library__init.py", line 1, in from . import nvrtc File "D:\Anaconda\envs\langchain\lib\site-packages\cpm_kernels\library\nvrtc.py", line 5, in nvrtc = Lib("nvrtc") File "D:\Anaconda\envs\langchain\lib\site-packages\cpm_kernels\library\base.py", line 45, in init__ lib_path = windows_find_lib(self.name) File "D:\Anaconda\envs\langchain\lib\site-packages\cpm_kernels\library\base.py", line 39, in windows_find_lib return lookup_dll(lib_name) File "D:\Anaconda\envs\langchain\lib\site-packages\cpm_kernels\library\base.py", line 16, in lookup_dll for name in os.listdir(path): NotADirectoryError: [WinError 267] 目录名称无效。: 'D:\tomcat8\lib\servlet-api.jar'

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "D:\Anaconda\envs\langchain\lib\logging__init.py", line 1085, in emit msg = self.format(record) File "D:\Anaconda\envs\langchain\lib\logging__init.py", line 929, in format return fmt.format(record) File "D:\Anaconda\envs\langchain\lib\logging\init.py", line 668, in format record.message = record.getMessage() File "D:\Anaconda\envs\langchain\lib\logging\init.py", line 373, in getMessage msg = msg % self.args TypeError: not all arguments converted during string formatting Call stack: File "E:/langchain-ChatGLM-master/cli_demo.py", line 66, in main() File "E:/langchain-ChatGLM-master/cli_demo.py", line 16, in main llm_model_ins = shared.loaderLLM() File "E:\langchain-ChatGLM-master\models\shared.py", line 40, in loaderLLM loaderCheckPoint.reload_model() File "E:\langchain-ChatGLM-master\models\loader\loader.py", line 431, in reload_model self.model, self.tokenizer = self._load_model(self.model_name) File "E:\langchain-ChatGLM-master\models\loader\loader.py", line 123, in _load_model LoaderClass.from_pretrained(checkpoint, File "D:\Anaconda\envs\langchain\lib\site-packages\transformers\models\auto\auto_factory.py", line 462, in from_pretrained return model_class.from_pretrained( File "D:\Anaconda\envs\langchain\lib\site-packages\transformers\modeling_utils.py", line 2611, in from_pretrained model = cls(config, *model_args, **model_kwargs) File "C:\Users\11025/.cache\huggingface\modules\transformers_modules\GLM-int4\modeling_chatglm.py", line 1061, in init__ self.quantize(self.config.quantization_bit, self.config.quantization_embeddings, use_quantization_cache=True, empty_init=True) File "C:\Users\11025/.cache\huggingface\modules\transformers_modules\GLM-int4\modeling_chatglm.py", line 1424, in quantize from .quantization import quantize, QuantizedEmbedding, QuantizedLinear, load_cpu_kernel File "", line 991, in _find_and_load File "", line 975, in _find_and_load_unlocked File "", line 671, in _load_unlocked File "", line 843, in exec_module File "", line 219, in _call_with_frames_removed File "C:\Users\11025/.cache\huggingface\modules\transformers_modules\GLM-int4\quantization.py", line 46, in logger.warning("Failed to load cpm_kernels:", exception) Message: 'Failed to load cpm_kernels:' Arguments: (NotADirectoryError(20, '目录名称无效。'),) Compile default cpu kernel failed, using default cpu kernel code. Compiling gcc -O3 -fPIC -std=c99 C:\Users\11025.cache\huggingface\modules\transformers_modules\GLM-int4\quantization_kernels.c -shared -o C:\Users\11025.cache\huggingface\modules\transformers_modules\GLM-int4\quantization_kernels.so Compile default cpu kernel failed. Failed to load kernel. Cannot load cpu or cuda kernel, quantization failed: 'gcc' ��ڲ��ⲿ��Ҳ��ǿ��еĳ�� ļ�� 'gcc' ��ڲ��ⲿ��Ҳ��ǿ��еĳ�� ļ�� ┌───────────────────── Traceback (most recent call last) ─────────────────────┐ │ E:/langchain-ChatGLM-master/cli_demo.py:66 in │ │ │ │ 63 │ args = parser.parse_args() │ │ 64 │ args_dict = vars(args) │ │ 65 │ shared.loaderCheckPoint = LoaderCheckPoint(args_dict) │ │ > 66 │ main() │ │ 67 │ │ │ │ E:/langchain-ChatGLM-master/cli_demo.py:16 in main │ │ │ │ 13 │ │ 14 def main(): │ │ 15 │ │ │ > 16 │ llm_model_ins = shared.loaderLLM() │ │ 17 │ llm_model_ins.history_len = LLM_HISTORY_LEN │ │ 18 │ │ │ 19 │ local_doc_qa = LocalDocQA() │ │ │ │ E:\langchain-ChatGLM-master\models\shared.py:40 in loaderLLM │ │ │ │ 37 │ if 'FastChatOpenAILLM' in llm_model_info["provides"]: │ │ 38 │ │ loaderCheckPoint.unload_model() │ │ 39 │ else: │ │ > 40 │ │ loaderCheckPoint.reload_model() │ │ 41 │ │ │ 42 │ provides_class = getattr(sys.modules['models'], llm_model_info['pr │ │ 43 │ modelInsLLM = provides_class(checkPoint=loaderCheckPoint) │ │ │ │ E:\langchain-ChatGLM-master\models\loader\loader.py:431 in reload_model │ │ │ │ 428 │ │ │ except Exception as e: │ │ 429 │ │ │ │ print("加载PrefixEncoder config.json失败") │ │ 430 │ │ │ │ > 431 │ │ self.model, self.tokenizer = self._load_model(self.model_name │ │ 432 │ │ │ │ 433 │ │ if self.lora: │ │ 434 │ │ │ self._add_lora_to_model([self.lora]) │ │ │ │ E:\langchain-ChatGLM-master\models\loader\loader.py:123 in _load_model │ │ │ │ 120 │ │ │ │ print("num_gpus",num_gpus) │ │ 121 │ │ │ │ if num_gpus < 2 and self.device_map is None: │ │ 122 │ │ │ │ │ model = ( │ │ > 123 │ │ │ │ │ │ LoaderClass.from_pretrained(checkpoint, │ │ 124 │ │ │ │ │ │ │ │ │ │ │ │ │ config=self.model │ │ 125 │ │ │ │ │ │ │ │ │ │ │ │ │ torch_dtype=torch │ │ 126 │ │ │ │ │ │ │ │ │ │ │ │ │ trust_remotecode │ │ │ │ D:\Anaconda\envs\langchain\lib\site-packages\transformers\models\auto\auto │ │ factory.py:462 in from_pretrained │ │ │ │ 459 │ │ │ model_class = get_class_from_dynamic_module( │ │ 460 │ │ │ │ class_ref, pretrained_model_name_or_path, hub_kwarg │ │ 461 │ │ │ ) │ │ > 462 │ │ │ return model_class.from_pretrained( │ │ 463 │ │ │ │ pretrained_model_name_or_path, model_args, config=co │ │ 464 │ │ │ ) │ │ 465 │ │ elif type(config) in cls._model_mapping.keys(): │ │ │ │ D:\Anaconda\envs\langchain\lib\site-packages\transformers\modeling_utils.py │ │ :2611 in from_pretrained │ │ │ │ 2608 │ │ │ init_contexts.append(init_empty_weights()) │ │ 2609 │ │ │ │ 2610 │ │ with ContextManagers(init_contexts): │ │ > 2611 │ │ │ model = cls(config, model_args, model_kwargs) │ │ 2612 │ │ │ │ 2613 │ │ # Check first if we are from_pt │ │ 2614 │ │ if use_keep_in_fp32_modules: │ │ │ │ C:\Users\11025/.cache\huggingface\modules\transformers_modules\GLM-int4\mod │ │ eling_chatglm.py:1061 in init__ │ │ │ │ 1058 │ │ self.quantized = False │ │ 1059 │ │ │ │ 1060 │ │ if self.config.quantization_bit: │ │ > 1061 │ │ │ self.quantize(self.config.quantization_bit, self.config. │ │ 1062 │ │ │ 1063 │ def get_output_embeddings(self): │ │ 1064 │ │ return self.lm_head │ │ │ │ C:\Users\11025/.cache\huggingface\modules\transformers_modules\GLM-int4\mod │ │ eling_chatglm.py:1439 in quantize │ │ │ │ 1436 │ │ self.config.quantization_bit = bits │ │ 1437 │ │ self.config.quantization_embeddings = quantize_embeddings │ │ 1438 │ │ │ │ > 1439 │ │ self.transformer = quantize(self.transformer, bits, use_quan │ │ 1440 │ │ │ │ 1441 │ │ if self.device == torch.device("cpu"): │ │ 1442 │ │ │ dtype = torch.float32 │ │ │ │ C:\Users\11025/.cache\huggingface\modules\transformers_modules\GLM-int4\qua │ │ ntization.py:464 in quantize │ │ │ │ 461 │ if not cpu_kernels.load: │ │ 462 │ │ if kernels is None: # CUDA kernels failed │ │ 463 │ │ │ print("Cannot load cpu or cuda kernel, quantization faile │ │ > 464 │ │ │ assert kernels is not None │ │ 465 │ │ print("Cannot load cpu kernel, don't use quantized model on c │ │ 466 │ │ │ 467 │ current_device = model.device │ └─────────────────────────────────────────────────────────────────────────────┘ AssertionError

Process finished with exit code 1

Environment

- OS:win11
- Python:3.8
- Transformers:4.29
- PyTorch:2.0
- CUDA Support (`python -c "import torch; print(torch.cuda.is_available())"`) :True

Anything else?

No response

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\hei/.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\quantization.py", line 34, in <module>
    kernels = CPUKernel(
              ^^^^^^^^^
NameError: name 'CPUKernel' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\anaconda3\envs\chatglm\Lib\logging\__init__.py", line 1110, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\chatglm\Lib\logging\__init__.py", line 953, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\chatglm\Lib\logging\__init__.py", line 687, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\chatglm\Lib\logging\__init__.py", line 377, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "E:\ChatGLM-6B\cli_demo.py", line 8, in <module>
    model = AutoModel.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True).float()
  File "D:\anaconda3\envs\chatglm\Lib\site-packages\transformers\models\auto\auto_factory.py", line 466, in from_pretrained
    return model_class.from_pretrained(
  File "D:\anaconda3\envs\chatglm\Lib\site-packages\transformers\modeling_utils.py", line 2498, in from_pretrained
    model = cls(config, *model_args, **model_kwargs)
  File "C:\Users\hei/.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\modeling_chatglm.py", line 1061, in __init__
    self.quantize(self.config.quantization_bit, self.config.quantization_embeddings, use_quantization_cache=True, empty_init=True)
  File "C:\Users\hei/.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\modeling_chatglm.py", line 1424, in quantize
    from .quantization import quantize, QuantizedEmbedding, QuantizedLinear, load_cpu_kernel
  File "<frozen importlib._bootstrap>", line 1176, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1147, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 690, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 940, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "C:\Users\hei/.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\quantization.py", line 46, in <module>
    logger.warning("Failed to load cpm_kernels:", exception)
Message: 'Failed to load cpm_kernels:'
Arguments: (NameError("name 'CPUKernel' is not defined"),)
No compiled kernel found.
Compiling kernels : C:\Users\hei\.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\quantization_kernels_parallel.c
Compiling gcc -O3 -fPIC -pthread -fopenmp -std=c99 C:\Users\hei\.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\quantization_kernels_parallel.c -shared -o C:\Users\hei\.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\quantization_kernels_parallel.so
Compile default cpu kernel failed, using default cpu kernel code.
Compiling gcc -O3 -fPIC -std=c99 C:\Users\hei\.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\quantization_kernels.c -shared -o C:\Users\hei\.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\quantization_kernels.so
'gcc' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���
'gcc' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���
Traceback (most recent call last):
  File "E:\ChatGLM-6B\cli_demo.py", line 8, in <module>
    model = AutoModel.from_pretrained("THUDM/chatglm-6b-int4", trust_remote_code=True).float()
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\chatglm\Lib\site-packages\transformers\models\auto\auto_factory.py", line 466, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\envs\chatglm\Lib\site-packages\transformers\modeling_utils.py", line 2498, in from_pretrained
    model = cls(config, *model_args, **model_kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hei/.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\modeling_chatglm.py", line 1061, in __init__
    self.quantize(self.config.quantization_bit, self.config.quantization_embeddings, use_quantization_cache=True, empty_init=True)
  File "C:\Users\hei/.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\modeling_chatglm.py", line 1439, in quantize
    self.transformer = quantize(self.transformer, bits, use_quantization_cache=use_quantization_cache, empty_init=empty_init, **kwargs)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hei/.cache\huggingface\modules\transformers_modules\THUDM\chatglm-6b-int4\6c5205c47d0d2f7ea2e44715d279e537cae0911f\quantization.py", line 464, in quantize
    assert kernels is not None
           ^^^^^^^^^^^^^^^^^^^
AssertionError
Compile default cpu kernel failed.
Failed to load kernel.
Cannot load cpu or cuda kernel, quantization failed:

Environment

`- OS:win11

Python:3.11.5
Transformers:4.27.1
PyTorch:pip3 install torch torchvision torchaudio
cpu运行`

THUDM / ChatGLM-6B