microsoft / autogen

A programming framework for agentic AI šŸ¤–
https://microsoft.github.io/autogen/

[Issue]: how to deploy the local model correctly and run the `agentchat_custom_model.ipynb`? #3264

Open lambda7xx opened 1 month ago

lambda7xx commented 1 month ago

Describe the issue

I used the Local-LLMs/ guide to deploy my local model, but the output from the LLM is garbled.

Steps to reproduce

Launch the local model:

In terminal 1: `python -m fastchat.serve.controller`
In terminal 2: `python -m fastchat.serve.model_worker --model-path chatglm2-6b`
In terminal 3: `python -m fastchat.serve.openai_api_server --host localhost --port 8000`
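
For reference, the FastChat endpoint can be sanity-checked directly with the `openai` package before involving AutoGen; a minimal sketch, assuming the server above is reachable on localhost:8000 and `openai>=1.0` is installed:

# Sanity-check sketch: query the FastChat OpenAI-compatible server started above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="NULL")
reply = client.chat.completions.create(
    model="chatglm2-6b",
    messages=[{"role": "user", "content": "Write python code to print Hello World!"}],
)
print(reply.choices[0].message.content)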

Code

My script is:

from types import SimpleNamespace

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

import autogen
from autogen import AssistantAgent, UserProxyAgent

# custom client with custom model loader

class CustomModelClient:
    def __init__(self, config, **kwargs):
        print(f"CustomModelClient config: {config}")
        self.device = config.get("device", "cpu")
        self.model = AutoModelForCausalLM.from_pretrained(config["model"], trust_remote_code=True).to(self.device)
        self.model_name = config["model"]
        self.tokenizer = AutoTokenizer.from_pretrained(config["model"], use_fast=False)
        # self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        # params are set by the user and consumed by the user since they are providing a custom model
        # so anything can be done here
        gen_config_params = config.get("params", {})
        self.max_length = gen_config_params.get("max_length", 256)

        print(f"Loaded model {config['model']} to {self.device}")

    def create(self, params):
        if params.get("stream", False) and "messages" in params:
            raise NotImplementedError("Local models do not support streaming.")
        else:
            num_of_responses = params.get("n", 1)

            # can create my own data response class
            # here using SimpleNamespace for simplicity
            # as long as it adheres to the ClientResponseProtocol

            response = SimpleNamespace()

            inputs = self.tokenizer.apply_chat_template(
                params["messages"], return_tensors="pt", add_generation_prompt=True
            ).to(self.device)
            inputs_length = inputs.shape[-1]

            # add inputs_length to max_length
            max_length = self.max_length + inputs_length
            generation_config = GenerationConfig(
                max_length=max_length,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            response.choices = []
            response.model = self.model_name

            for _ in range(num_of_responses):
                outputs = self.model.generate(inputs, generation_config=generation_config)
                # Decode only the newly generated text, excluding the prompt
                text = self.tokenizer.decode(outputs[0, inputs_length:])
                choice = SimpleNamespace()
                choice.message = SimpleNamespace()
                choice.message.content = text
                choice.message.function_call = None
                response.choices.append(choice)

            return response

    def message_retrieval(self, response):
        """Retrieve the messages from the response."""
        choices = response.choices
        return [choice.message.content for choice in choices]

    def cost(self, response) -> float:
        """Calculate the cost of the response."""
        response.cost = 0
        return 0

    @staticmethod
    def get_usage(response):
        # returns a dict of prompt_tokens, completion_tokens, total_tokens, cost, model
        # if usage needs to be tracked, else None
        return {}

config_list_custom = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={"model_client_cls": ["CustomModelClient"]},
)

assistant = AssistantAgent("assistant", llm_config={"config_list": config_list_custom})
user_proxy = UserProxyAgent(
    "user_proxy",
    code_execution_config={
        "work_dir": "coding",
        "use_docker": False,  # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.
    },
)

assistant.register_model_client(model_client_cls=CustomModelClient)

user_proxy.initiate_chat(assistant, message="Write python code to print Hello World!")

# custom client with custom model loader

# class CustomModelClientWithArguments(CustomModelClient):
#     def __init__(self, config, loaded_model, tokenizer, **kwargs):
#         print(f"CustomModelClientWithArguments config: {config}")

#         self.model_name = config["model"]
#         self.model = loaded_model
#         self.tokenizer = tokenizer

#         self.device = config.get("device", "cpu")

#         gen_config_params = config.get("params", {})
#         self.max_length = gen_config_params.get("max_length", 256)
#         print(f"Loaded model {config['model']} to {self.device}")

# # load model here

# config = config_list_custom[0]
# device = config.get("device", "cpu")
# loaded_model = AutoModelForCausalLM.from_pretrained(config["model"]).to(device)
# tokenizer = AutoTokenizer.from_pretrained(config["model"], use_fast=False)
# tokenizer.pad_token_id = tokenizer.eos_token_id

# config_list_custom = autogen.config_list_from_json(
#     "OAI_CONFIG_LIST",
#     filter_dict={"model_client_cls": ["CustomModelClientWithArguments"]},
# )

# assistant = AssistantAgent("assistant", llm_config={"config_list": config_list_custom})

# assistant.register_model_client(
#     model_client_cls=CustomModelClientWithArguments,
#     loaded_model=loaded_model,
#     tokenizer=tokenizer,
# )

# user_proxy.initiate_chat(assistant, message="Write python code to print Hello World!")

My OAI_CONFIG_LIST is:

[
    {
        "model": "chatglm2-6b",
        "base_url": "http://localhost:8000/v1",
        "api_type": "openai",
        "api_key": "NULL",
        "model_client_cls": "CustomModelClient",
        "device": "cuda"
    }
]
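
Note that with `model_client_cls` set, `CustomModelClient` loads the weights directly through `transformers`, so the `base_url` above (and the FastChat server) is not actually used by this script. If the intent is to route requests through the FastChat OpenAI-compatible endpoint instead, a plain config entry without a custom client should be enough; a minimal sketch (the name `fastchat_config_list` is just for illustration):

# Alternative sketch: send requests to the FastChat OpenAI-compatible server
# instead of loading the model in-process; no custom model client is needed.
import autogen

fastchat_config_list = [
    {
        "model": "chatglm2-6b",
        "base_url": "http://localhost:8000/v1",
        "api_type": "openai",
        "api_key": "NULL",
    }
]

assistant = autogen.AssistantAgent(
    "assistant", llm_config={"config_list": fastchat_config_list}
)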

Screenshots and logs

My log is:

Loaded model chatglm2-6b to cuda
user_proxy (to assistant):

Write python code to print Hello World!

--------------------------------------------------------------------------------
No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
assistant (to user_proxy):

"A_S    A_Savor.
.

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: 

>>>>>>>> NO HUMAN INPUT RECEIVED.

>>>>>>>> USING AUTO REPLY...
user_proxy (to assistant):

--------------------------------------------------------------------------------
assistant (to user_proxy):

A_S     A_Savor.
.

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: 

>>>>>>>> NO HUMAN INPUT RECEIVED.

>>>>>>>> USING AUTO REPLY...
user_proxy (to assistant):

--------------------------------------------------------------------------------
assistant (to user_proxy):

<=D                                                                                                                                                                                             >|

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: Write python code to print Hello World!
user_proxy (to assistant):

Write python code to print Hello World!

--------------------------------------------------------------------------------
assistant (to user_proxy):

=_=_=_=_=

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: I need code
user_proxy (to assistant):

I need code

--------------------------------------------------------------------------------
assistant (to user_proxy):

=                                                                               ~

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: python code
user_proxy (to assistant):

python code

--------------------------------------------------------------------------------
assistant (to user_proxy):

===

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: 

I think the assistant agent should write Python code, but it does not.
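
The "No chat template is set for this tokenizer" warning in the log above looks relevant: `apply_chat_template` falls back to a generic template that chatglm2-6b was not trained with, so the prompt the model actually sees is malformed, which would explain the garbled replies. One possible workaround (a sketch only, assuming the THUDM chatglm2-6b checkpoint loaded with `trust_remote_code=True`, whose remote modeling code exposes `model.chat`) is to bypass `apply_chat_template` inside `CustomModelClient.create`:

# Workaround sketch: let the ChatGLM2 remote code build its own prompt format
# instead of relying on the generic fallback chat template.
def _chatglm_generate(model, tokenizer, messages):
    # Use the last user message as the query; building full (query, response)
    # history pairs from `messages` is omitted here for brevity.
    query = messages[-1]["content"]
    response, _history = model.chat(tokenizer, query, history=[])
    return response

# Inside CustomModelClient.create, this would replace the
# apply_chat_template / generate path:
#   text = _chatglm_generate(self.model, self.tokenizer, params["messages"])

Alternatively, a valid Jinja chat template could be assigned to `self.tokenizer.chat_template` before calling `apply_chat_template`.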

Additional Information

No response

marklysze commented 1 month ago

Hey @lambda7xx, if you're able to run models using Ollama, I'd recommend trying the code in PR #3056, as it's a custom client class specifically for running local models in Ollama.

lambda7xx commented 1 month ago

> #3056

Thanks, let me try it.

lambda7xx commented 1 month ago

Hi @marklysze, here are my steps:

In terminal 1: `ollama run llama3.1:70b`
In terminal 2: run my code:

# THIS TESTS: TWO AGENTS WITH TERMINATION

altmodel_llm_config = {
    "config_list": [
        {
            "api_type": "ollama",
            "model": "llama3.1:70b",
            "client_host": "http://192.168.0.1:11434",
            "seed": 42,
            "api_key": "NULL",
        }
    ]
}

from autogen import ConversableAgent

jack = ConversableAgent(
    "Jack",
    llm_config=altmodel_llm_config,
    system_message="Your name is Jack and you are a comedian in a two-person comedy show.",
    is_termination_msg=lambda x: True if "FINISH" in x["content"] else False
)
emma = ConversableAgent(
    "Emma",
    llm_config=altmodel_llm_config,
    system_message="Your name is Emma and you are a comedian in two-person comedy show. Say the word FINISH ONLY AFTER you've heard 2 of Jack's jokes.",
    is_termination_msg=lambda x: True if "FINISH" in x["content"] else False
)

chat_result = jack.initiate_chat(emma, message="Emma, tell me a joke about goldfish and peanut butter.", max_turns=10)

My log is:

Traceback (most recent call last):
  File "/home/xiao/autogen/ollama_use.py", line 17, in <module>
    jack = ConversableAgent(
  File "/home/xiao/autogen/autogen/agentchat/conversable_agent.py", line 160, in __init__
    self._validate_llm_config(llm_config)
  File "/home/xiao/autogen/autogen/agentchat/conversable_agent.py", line 264, in _validate_llm_config
    self.client = None if self.llm_config is False else OpenAIWrapper(**self.llm_config)
  File "/home/xiao/autogen/autogen/oai/client.py", line 422, in __init__
    self._register_default_client(config, openai_config)  # could modify the config
  File "/home/xiao/autogen/autogen/oai/client.py", line 523, in _register_default_client
    client = OpenAI(**openai_config)
  File "/home/xiao/anaconda3/envs/autogen/lib/python3.10/site-packages/openai/_client.py", line 105, in __init__
    raise OpenAIError(
openai.OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
(autogen) xiao@ptc:~/autogen$ python3 ollama_use.py 
[autogen.oai.client: 08-01 07:20:37] {164} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.
[autogen.oai.client: 08-01 07:20:37] {164} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.
Jack (to Emma):

Emma, tell me a joke about goldfish and peanut butter.

--------------------------------------------------------------------------------

>>>>>>>> USING AUTO REPLY...
Traceback (most recent call last):
  File "/home/xiao/autogen/ollama_use.py", line 31, in <module>
    chat_result = jack.initiate_chat(emma, message="Emma, tell me a joke about goldfish and peanut butter.", max_turns=10)
  File "/home/xiao/autogen/autogen/agentchat/conversable_agent.py", line 1012, in initiate_chat
    self.send(msg2send, recipient, request_reply=True, silent=silent)
  File "/home/xiao/autogen/autogen/agentchat/conversable_agent.py", line 656, in send
    recipient.receive(message, self, request_reply, silent)
  File "/home/xiao/autogen/autogen/agentchat/conversable_agent.py", line 819, in receive
    reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)
  File "/home/xiao/autogen/autogen/agentchat/conversable_agent.py", line 1973, in generate_reply
    final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple["config"])
  File "/home/xiao/autogen/autogen/agentchat/conversable_agent.py", line 1341, in generate_oai_reply
    extracted_response = self._generate_oai_reply_from_client(
  File "/home/xiao/autogen/autogen/agentchat/conversable_agent.py", line 1360, in _generate_oai_reply_from_client
    response = llm_client.create(
  File "/home/xiao/autogen/autogen/oai/client.py", line 732, in create
    response = client.create(params)
  File "/home/xiao/autogen/autogen/oai/client.py", line 320, in create
    response = completions.create(**params)
  File "/home/xiao/anaconda3/envs/autogen/lib/python3.10/site-packages/openai/_utils/_utils.py", line 277, in wrapper
    return func(*args, **kwargs)
TypeError: Completions.create() got an unexpected keyword argument 'client_host'
marklysze commented 1 month ago

Hey @lambda7xx, it doesn't look like the Ollama code files are being used; the files in that branch need to replace your installed ones. See the files here: https://github.com/microsoft/autogen/pull/3056/files

You just need to find your pyautogen install and replace the content of these 5 files with the ones from that link above (note that ollama.py is a new file):

[screenshot of the files to replace]
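
For reference, while those files are not yet in place, Ollama also exposes an OpenAI-compatible endpoint on the same port, so a standard OpenAI-style config entry can serve as an interim workaround (a sketch only; the host mirrors the one in your script):

# Interim sketch until the PR #3056 client class is installed: point a
# standard OpenAI-style config at Ollama's OpenAI-compatible endpoint.
interim_llm_config = {
    "config_list": [
        {
            "model": "llama3.1:70b",
            "base_url": "http://192.168.0.1:11434/v1",
            "api_type": "openai",
            "api_key": "ollama",  # placeholder; Ollama ignores the key
        }
    ]
}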

lambda7xx commented 1 month ago

> Hey @lambda7xx, it doesn't look like the Ollama code files are being used; the files in that branch need to replace your installed ones. See the files here: https://github.com/microsoft/autogen/pull/3056/files
>
> You just need to find your pyautogen install and replace the content of these 5 files with the ones from that link above (note that ollama.py is a new file).

Thank you so much.