eosphoros-ai / DB-GPT

AI Native Data App Development framework with AWEL(Agentic Workflow Expression Language) and Agents
http://docs.dbgpt.cn
MIT License
13.08k stars 1.72k forks

[Feature]: support for baichuan-7b #228

Closed LBYPatrick closed 1 year ago

LBYPatrick commented 1 year ago

baichuan-7B is an open-source large-scale pre-trained model developed by Baichuan Intelligent Technology. Based on the Transformer architecture, it has 7 billion parameters and was trained on approximately 1.2 trillion tokens. It supports both Chinese and English, with a context window of 4096 tokens, and achieves the best performance for its size on authoritative Chinese and English benchmarks (C-Eval/MMLU).

Here is a link to its model card on HuggingFace.
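
For a quick sense of what loading it involves, the model card uses the standard transformers Auto classes with `trust_remote_code=True`; here is a minimal sketch along those lines (the fp16/`.cuda()` choice and the prompt are just my assumptions):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "baichuan-inc/Baichuan-7B"  # Hugging Face repo id

# Baichuan-7B ships its own modeling code, so trust_remote_code is required
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = (
    AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
    .half()
    .cuda()
)

# quick smoke test (prompt is arbitrary)
inputs = tokenizer("Hamlet was written by", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```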

Would you mind putting this on the roadmap for support? Additionally, support for guanaco-33b-merged is currently broken: when we try to load it, llmserver.py gives us something like this:

playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently.
2023-06-15 18:21:57,013 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2023-06-15 18:21:57,014 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-06-15 18:21:57,015 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2023-06-15 18:21:57,015 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-06-15 18:21:57,016 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2023-06-15 18:21:57,016 INFO sqlalchemy.engine.Engine [raw sql] {}
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /root/autodl-tmp/db-gpt-2/pilot/server/llmserver.py:108 in <module>          │
│                                                                              │
│   105                                                                        │
│   106                                                                        │
│   107 model_path = LLM_MODEL_CONFIG[CFG.LLM_MODEL]                           │
│ ❱ 108 worker = ModelWorker(                                                  │
│   109 │   model_path=model_path, model_name=CFG.LLM_MODEL, device=DEVICE, nu │
│   110 )                                                                      │
│   111                                                                        │
│                                                                              │
│ /root/autodl-tmp/db-gpt-2/pilot/server/llmserver.py:38 in __init__           │
│                                                                              │
│    35 │   │   self.device = device                                           │
│    36 │   │                                                                  │
│    37 │   │   self.ml = ModelLoader(model_path=model_path)                   │
│ ❱  38 │   │   self.model, self.tokenizer = self.ml.loader(                   │
│    39 │   │   │   num_gpus, load_8bit=ISLOAD_8BIT, debug=ISDEBUG             │
│    40 │   │   )                                                              │
│    41                                                                        │
│                                                                              │
│ /root/autodl-tmp/db-gpt-2/pilot/model/loader.py:101 in loader                │
│                                                                              │
│    98 │   │   # TODO when cpu loading,  need use quantization config         │
│    99 │   │                                                                  │
│   100 │   │   llm_adapter = get_llm_model_adapter(self.model_path)           │
│ ❱ 101 │   │   model, tokenizer = llm_adapter.loader(self.model_path, kwargs) │
│   102 │   │                                                                  │
│   103 │   │   if load_8bit and tokenizer:                                    │
│   104 │   │   │   if num_gpus != 1:                                          │
│                                                                              │
│ /root/autodl-tmp/db-gpt-2/pilot/model/adapter.py:110 in loader               │
│                                                                              │
│   107 │                                                                      │
│   108 │   def loader(self, model_path: str, from_pretrained_kwargs: dict):   │
│   109 │   │   tokenizer = LlamaTokenizer.from_pretrained(model_path)         │
│ ❱ 110 │   │   model = AutoModelForCausalLM.from_pretrained(                  │
│   111 │   │   │   model_path, load_in_4bit=True, **from_pretrained_kwargs    │
│   112 │   │   )                                                              │
│   113 │   │   return model, tokenizer                                        │
│                                                                              │
│ /root/miniconda3/envs/dbgpt_env/lib/python3.10/site-packages/transformers/mo │
│ dels/auto/auto_factory.py:471 in from_pretrained                             │
│                                                                              │
│   468 │   │   │   )                                                          │
│   469 │   │   elif type(config) in cls._model_mapping.keys():                │
│   470 │   │   │   model_class = _get_model_class(config, cls._model_mapping) │
│ ❱ 471 │   │   │   return model_class.from_pretrained(                        │
│   472 │   │   │   │   pretrained_model_name_or_path, *model_args, config=con │
│   473 │   │   │   )                                                          │
│   474 │   │   raise ValueError(                                              │
│                                                                              │
│ /root/miniconda3/envs/dbgpt_env/lib/python3.10/site-packages/transformers/mo │
│ deling_utils.py:2629 in from_pretrained                                      │
│                                                                              │
│   2626 │   │   │   init_contexts.append(init_empty_weights())                │
│   2627 │   │                                                                 │
│   2628 │   │   with ContextManagers(init_contexts):                          │
│ ❱ 2629 │   │   │   model = cls(config, *model_args, **model_kwargs)          │
│   2630 │   │                                                                 │
│   2631 │   │   # Check first if we are `from_pt`                             │
│   2632 │   │   if use_keep_in_fp32_modules:                                  │
╰──────────────────────────────────────────────────────────────────────────────╯
TypeError: LlamaForCausalLM.__init__() got an unexpected keyword argument 
'load_in_4bit'
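
From the trace, my guess is that the installed transformers is too old for `load_in_4bit` (4-bit loading landed around transformers 4.30, together with bitsandbytes support), so the kwarg falls through to `LlamaForCausalLM.__init__`. On newer versions the 4-bit path is usually expressed via `BitsAndBytesConfig`; a rough sketch, where the version bound and config values are assumptions on my part:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_path = "/path/to/guanaco-33b-merged"  # hypothetical local path

# BitsAndBytesConfig with load_in_4bit needs a recent transformers (~4.30+)
# plus bitsandbytes; on older releases from_pretrained does not recognize the
# kwarg and forwards it to the model constructor, producing the TypeError above.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=quant_config,
    device_map="auto",
)
```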

Here's our system configuration, as reported by conda info:

              platform : linux-64
             user-agent : conda/4.10.3 requests/2.25.1 CPython/3.8.10 Linux/5.4.0-100-generic ubuntu/20.04.4 glibc/2.31
                UID:GID : 0:0
             netrc file : None
           offline mode : False

and here's our GPU:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.57       Driver Version: 515.57       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  On   | 00000000:D5:00.0 Off |                  N/A |
| 45%   22C    P8    25W / 350W |      2MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

Any effort on this would be greatly appreciated.

csunny commented 1 year ago

In our DB-GPT framework it is very simple to add support for a new model; you can do so by following the tutorial below.

1. Implement a model adapter following the existing template. Add the code in pilot/model/adapter.py; here is the ChatGLM adapter as a demo:

```python
class ChatGLMAdapater(BaseLLMAdaper):
    """LLM Adapter for THUDM/chatglm-6b"""

    def match(self, model_path: str):
        return "chatglm" in model_path

    def loader(self, model_path: str, from_pretrained_kwargs: dict):
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        if DEVICE != "cuda":
            model = AutoModel.from_pretrained(
                model_path, trust_remote_code=True, **from_pretrained_kwargs
            ).float()
            return model, tokenizer
        else:
            model = (
                AutoModel.from_pretrained(
                    model_path, trust_remote_code=True, **from_pretrained_kwargs
                )
                .half()
                .cuda()
            )
            return model, tokenizer
```
2. Implement a chat adapter in pilot/server/chat_adapter.py, again using ChatGLM as a demo:
```python
class ChatGLMChatAdapter(BaseChatAdpter):
    """Model chat Adapter for ChatGLM"""

    def match(self, model_path: str):
        return "chatglm" in model_path

    def get_generate_stream_func(self):
        from pilot.model.llm_out.chatglm_llm import chatglm_generate_stream

        return chatglm_generate_stream
```

3. Implement the model generate function in pilot/model/llm_out/chatglm_llm.py:

```python
import copy

import torch


@torch.inference_mode()
def chatglm_generate_stream(
    model, tokenizer, params, device, context_len=2048, stream_interval=2
):
    """Generate text using chatglm model's chat api"""
    prompt = params["prompt"]
    temperature = float(params.get("temperature", 1.0))
    top_p = float(params.get("top_p", 1.0))
    stop = params.get("stop", "###")
    echo = params.get("echo", False)

    generate_kwargs = {
        "do_sample": True if temperature > 1e-5 else False,
        "top_p": top_p,
        "repetition_penalty": 1.0,
        "logits_processor": None,
    }

    if temperature > 1e-5:
        generate_kwargs["temperature"] = temperature

    # TODO, Fix this
    print(prompt)
    messages = prompt.split(stop)

    # Add history conversation
    hist = []
    once_conversation = []
    for message in messages[:-2]:
        if len(message) <= 0:
            continue

        if "human:" in message:
            once_conversation.append(message.split("human:")[1])
        # elif "system:" in message:
        #     once_conversation.append(f"""###system:{message.split("system:")[1]} """)
        elif "ai:" in message:
            once_conversation.append(message.split("ai:")[1])
            last_conversation = copy.deepcopy(once_conversation)
            hist.append(last_conversation)
            once_conversation = []
        # else:
        #     once_conversation.append(f"""###system:{message} """)

    try:
        query = messages[-2].split("human:")[1]
    except IndexError:
        query = messages[-3].split("human:")[1]
    print("Query Message: ", query)

    output = ""
    for i, (response, new_hist) in enumerate(
        model.stream_chat(tokenizer, query, hist, **generate_kwargs)
    ):
        if echo:
            output = query + " " + response
        else:
            output = response

        yield output

    yield output
```


Just do this and you can use whatever LLM you want. Please give it a try; we really appreciate your contributions.
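
For baichuan-7B specifically, an adapter following the template above might look roughly like the sketch below. This is untested: the class name BaichuanAdapter is just a placeholder, the match string assumes the model directory name contains "baichuan", and it relies on AutoTokenizer, AutoModelForCausalLM, and torch already being imported in pilot/model/adapter.py.

```python
class BaichuanAdapter(BaseLLMAdaper):
    """LLM Adapter for baichuan-inc/Baichuan-7B (untested sketch)"""

    def match(self, model_path: str):
        # Assumes the local model directory name contains "baichuan"
        return "baichuan" in model_path.lower()

    def loader(self, model_path: str, from_pretrained_kwargs: dict):
        # Baichuan-7B ships custom modeling code, so trust_remote_code is required
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer
```

A matching chat adapter in pilot/server/chat_adapter.py would then only need a match on the same string plus a suitable generate-stream function, following step 2 and step 3 above.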