InternLM / InternLM-XComposer

InternLM-XComposer-2.5: A Versatile Large Vision Language Model Supporting Long-Contextual Input and Output
2.06k stars 128 forks source link

4bit版本使用modelscope拉取,inference时一直显示获取模型版本失败 #192

Open rexainn opened 4 months ago

rexainn commented 4 months ago

bug:

Traceback (most recent call last):
  File "/DATA/DATA1/renxiaoyu/202403_mllm/code/internlm-xcomposer_inference.py", line 33, in <module>
    model = InternLMXComposer2QForCausalLM.from_quantized(model_dir, trust_remote_code=True, device="cuda:0").eval()
  File "/home/huiyu/.conda/envs/llava/lib/python3.10/site-packages/auto_gptq/modeling/_base.py", line 1028, in from_quantized
    model = AutoModelForCausalLM.from_config(
  File "/home/huiyu/.conda/envs/llava/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 438, in from_config
    return model_class._from_config(config, **kwargs)
  File "/home/huiyu/.conda/envs/llava/lib/python3.10/site-packages/transformers/modeling_utils.py", line 1334, in _from_config
    model = cls(config, **kwargs)
  File "/home/huiyu/.cache/huggingface/modules/transformers_modules/internlm-xcomposer2-vl-7b-4bit/modeling_internlm_xcomposer2.py", line 67, in __init__
    self.vit = build_vision_tower()
  File "/home/huiyu/.cache/huggingface/modules/transformers_modules/internlm-xcomposer2-vl-7b-4bit/build_mlp.py", line 11, in build_vision_tower
    return CLIPVisionTower(vision_tower)
  File "/home/huiyu/.cache/huggingface/modules/transformers_modules/internlm-xcomposer2-vl-7b-4bit/build_mlp.py", line 58, in __init__
    self.load_model()
  File "/home/huiyu/.cache/huggingface/modules/transformers_modules/internlm-xcomposer2-vl-7b-4bit/build_mlp.py", line 62, in load_model
    self.vision_tower = CLIPVisionModel.from_pretrained(
  File "/home/huiyu/.conda/envs/llava/lib/python3.10/site-packages/modelscope/utils/hf_util.py", line 68, in from_pretrained
    model_dir = snapshot_download(
  File "/home/huiyu/.conda/envs/llava/lib/python3.10/site-packages/modelscope/hub/snapshot_download.py", line 117, in snapshot_download
    revision_detail = _api.get_valid_revision_detail(
  File "/home/huiyu/.conda/envs/llava/lib/python3.10/site-packages/modelscope/hub/api.py", line 494, in get_valid_revision_detail
    all_branches_detail, all_tags_detail = self.get_model_branches_and_tags_details(
  File "/home/huiyu/.conda/envs/llava/lib/python3.10/site-packages/modelscope/hub/api.py", line 574, in get_model_branches_and_tags_details
    handle_http_response(r, logger, cookies, model_id)
  File "/home/huiyu/.conda/envs/llava/lib/python3.10/site-packages/modelscope/hub/errors.py", line 98, in handle_http_response
    raise HTTPError('Response details: %s, Request id: %s' %
requests.exceptions.HTTPError: Response details: {'Code': 10010205001, 'Message': '获取模型版本失败,信息:record not found', 'RequestId': 'e8ad3691-6698-4bbc-aa14-823212d6dcfe', 'Success': False}, Request id: 78d60112e7f54592839c6c94e8f560c0
myownskyW7 commented 4 months ago

@rexainn 麻烦把推理代码也贴一下,我们看看问题

rexainn commented 4 months ago

@rexainn 麻烦把推理代码也贴一下,我们看看问题

以下是我的推理代码:

import torch, auto_gptq
from modelscope import snapshot_download, AutoModel, AutoTokenizer
from auto_gptq.modeling._base import BaseGPTQForCausalLM
import json

# Saliency-map image paths to classify and the JSON file where the model's
# responses will be stored.
heatmap = []
results = []
top = 5
result_file = "/DATA/DATA1/renxiaoyu/202403_mllm/results/asd/internlm-xcomposer_" + str(top) + ".json"
with open('/DATA/DATA1/renxiaoyu/202403_mllm/data/heatmap_asd_list.txt', 'r', encoding='utf-8') as f:
    # Iterate the file object directly instead of calling readlines():
    # same behavior, without materializing the whole file in memory.
    for ann in f:
        heatmap.append(ann.strip('\n'))  # remove the trailing newline from each path

# auto_gptq does not know the "internlm" architecture out of the box;
# register it so from_quantized() accepts this model type.
auto_gptq.modeling._base.SUPPORTED_MODELS = ["internlm"]
torch.set_grad_enabled(False)  # inference only: disable autograd globally

class InternLMXComposer2QForCausalLM(BaseGPTQForCausalLM):
    """GPTQ configuration for the InternLM-XComposer2 model.

    Tells auto_gptq where the transformer blocks live, which modules stay
    unquantized, and which linear layers inside each block hold the
    quantized weights.
    """

    # Dotted attribute path to the list of transformer blocks in the model.
    layers_block_name = "model.layers"
    # Modules outside the transformer blocks that are left unquantized
    # (vision tower, vision projector, token embeddings, final norm, LM head).
    outside_layer_modules = [
        'vit', 'vision_proj', 'model.tok_embeddings', 'model.norm', 'output', 
    ]
    # Quantized linear layers inside each transformer block, grouped in
    # execution order: attention QKV, attention output, then feed-forward.
    inside_layer_modules = [
        ["attention.wqkv.linear"],
        ["attention.wo.linear"],
        ["feed_forward.w1.linear", "feed_forward.w3.linear"],
        ["feed_forward.w2.linear"],
    ]

# init model and tokenizer from the ModelScope snapshot of the 4-bit model
model_dir = snapshot_download('Shanghai_AI_Laboratory/internlm-xcomposer2-vl-7b-4bit')
model = InternLMXComposer2QForCausalLM.from_quantized(model_dir, trust_remote_code=True, device="cuda:0").eval()
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

text = '<ImageHere> You are a classifier, and the two classes are autistic and normal. Based on this saliency map, which is generated through real eyemovements, you should judge whether the gaze there is more like an autistic person or more like a normal person. The output format is like a dict:{class: autistic or normal, reasons: why}.'

with torch.cuda.amp.autocast():
    # Slice instead of indexing by range(top): avoids an IndexError when the
    # heatmap list holds fewer than `top` entries.
    for image in heatmap[:top]:
        response, _ = model.chat(tokenizer, query=text, image=image, history=[], do_sample=False)
        print(response)
        # Bug fix: responses were printed but never collected, leaving
        # `results`/`result_file` unused and no output file written.
        results.append({'image': image, 'response': response})

# Persist all responses to the declared JSON result file.
with open(result_file, 'w', encoding='utf-8') as out:
    json.dump(results, out, ensure_ascii=False, indent=2)