dhuynh95 opened 5 months ago
@catyung is currently exploring this. Here are some exchanges from the Discord:
I'm working on this. VLM: Phi-3-Vision, LLM: Codestral, Embedding: BGE-M3. Basically, I'm still not able to run it through agent.run(); running the components separately and editing some prompts is necessary. I tried to use the ONNX version, but no luck: I'm using Colab, where the CUDA version is 12.x, and I'm having a problem loading onnxruntime-genai. I saw someone having the same issue, so hopefully the team will propose a solution, but at this moment I don't see any better way to do it. For the world model, I think the output makes sense, but I had to use StoppingCriteria to stop generation at "-----", and later I also remove the "-----". I think the Navigation Engine works as well: [{'query':'button”Quick Tour”', 'action':'Click on the button “Quick Tour”'}]
Here is the code I use to run fully open-source models as the world model, action engine, and embedding model:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
# quantize to save memory
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
model_id = "mistralai/Codestral-22B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
from transformers import StoppingCriteria, StoppingCriteriaList
stop_list = ['```', "\n```", "```\n"]
device = "cuda"
# Tokenize the stop sequences and move them to the appropriate device
stop_token_ids = [tokenizer(x, return_tensors='pt', add_special_tokens=False)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
# Define the custom stopping criteria class
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            # print(f"Testing {input_ids[0][-len(stop_ids[0])+1:]} against {stop_ids[0][1:]}")
            if input_ids[0, -len(stop_ids[0]):].equal(stop_ids[0]):
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])
llm = HuggingFaceLLM(
    model_name=model_id,
    tokenizer_name=model_id,
    max_new_tokens=1000,
    model_kwargs={"quantization_config": quantization_config},
    generate_kwargs={"temperature": 0.0, "stopping_criteria": stopping_criteria},
    device_map="auto",
    stopping_ids=[tokenizer.eos_token_id],
)
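For a quick sanity check that the stopping criteria actually cut generation at the closing code fence, one could run a one-off completion before wiring the LLM into the agent (the prompt below is just an illustrative example, not from the original code):

# Hypothetical smoke test: generation should stop at a closing ``` fence
check = llm.complete("Write a Python function that adds two numbers, inside a ```python code block:")
print(check.text)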
#### LOAD VLM MODELS ####
import os
import io
import torch
from PIL import Image
import base64
import requests
from typing import List, Sequence
from transformers import AutoProcessor, BitsAndBytesConfig, AutoModelForCausalLM, TextStreamer, StoppingCriteria, StoppingCriteriaList, LogitsProcessorList
from llama_index.core.schema import ImageDocument
class VLM_ResponseWrapper:
    def __init__(self, text):
        self.text = text
class HF_VLM():
    def __init__(self, model="microsoft/Phi-3-vision-128k-instruct"):
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        self.vlm = AutoModelForCausalLM.from_pretrained(
            model,
            device_map="cuda",
            quantization_config=bnb_config,
            trust_remote_code=True,
            torch_dtype="auto",
        )
        self.processor = AutoProcessor.from_pretrained(model, trust_remote_code=True)
    def complete(self, query, image_documents=None):
        ### Set Stopping Criteria ###
        stop_list = ['\n-----', '-----', '.\n-----']
        device = "cuda"
        # Tokenize the stop sequences and move them to the appropriate device
        stop_token_ids = [self.processor.tokenizer(x, return_tensors='pt', add_special_tokens=False)['input_ids'] for x in stop_list]
        stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
        # Define the custom stopping criteria class
        class StopOnTokens(StoppingCriteria):
            def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
                for stop_ids in stop_token_ids:
                    # print(f"Testing {input_ids[0][-len(stop_ids[0])+1:]} against {stop_ids[0][1:]}")
                    if input_ids[0][-len(stop_ids[0])+1:].equal(stop_ids[0][1:]):
                        return True
                return False
        stopping_criteria = StoppingCriteriaList([StopOnTokens()])
        generation_args = {
            "temperature": 0.0,
            "max_new_tokens": 512,
            "repetition_penalty": 1.1,
        }
        # Init chat memory
        messages = []
        images = []
        if isinstance(image_documents, list):
            for image_doc in image_documents:
                if isinstance(image_doc, ImageDocument):
                    # If the document is an ImageDocument
                    if image_doc.image_path:
                        image = Image.open(image_doc.image_path)
                    elif image_doc.image_url:
                        response = requests.get(image_doc.image_url)
                        image = Image.open(io.BytesIO(response.content))
                    elif "file_path" in image_doc.metadata and image_doc.metadata["file_path"]:
                        image = Image.open(image_doc.metadata["file_path"])
                    else:
                        raise ValueError("ImageDocument does not contain a valid image source")
                    images.append(image)
                else:
                    raise TypeError("Expected str (file path), bytes (byte stream), or ImageDocument, got {}".format(type(image_doc)))
        messages.append({"role": "user", "content": f"<|image_1|>\n{query}"})
        prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # print(prompt)
        inputs = self.processor(prompt, [image], return_tensors="pt").to("cuda:0")
        generate_ids = self.vlm.generate(
            **inputs,
            stopping_criteria=stopping_criteria,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args,
        )
        # remove input tokens
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        # messages.append({"role": "assistant", "content": f"{response}"})
        # print(messages)
        return VLM_ResponseWrapper(response)
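As mentioned in the Discord exchange above, the "-----" stop string can still be left at the tail of the generated text; a minimal post-processing sketch (a hypothetical helper, not part of the original code) could be:

def strip_stop_sequence(text: str, stop: str = "-----") -> str:
    # Drop a trailing stop marker left over by the stopping criteria
    text = text.rstrip()
    if text.endswith(stop):
        text = text[:-len(stop)].rstrip()
    return text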
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
from langchain.embeddings import HuggingFaceEmbeddings
model_name = "BAAI/bge-m3"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {
    'normalize_embeddings': True,
    'precision': 'binary',
    'batch_size': 1,
}
embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    show_progress=True,
)
from lavague.core import WorldModel, ActionEngine
from lavague.core.agents import WebAgent
from lavague.core.context import Context
from lavague.drivers.selenium import SeleniumDriver
# Init HF_VLM
vlm = HF_VLM()
# Initialize context with our custom elements
context = Context(llm, vlm, embedding)
# Initialize the Selenium driver
selenium_driver = SeleniumDriver(headless=True)
world_model = WorldModel.from_context(context)
action_engine = ActionEngine.from_context(context, selenium_driver)
# Create your agent
agent = WebAgent(world_model, action_engine)
agent.get("https://huggingface.co/docs")
agent.run("Go on the quicktour of PEFT", display=True)
That's great! I will have a look. How good is the performance?
As discussed previously, the world model needs strong reasoning capability in order to give "instructions" to the action engine.
The major problem is with Phi-3-vision as the world model: it is, of course, not comparable to GPT-4o at understanding complicated prompt instructions and reasoning, so I edited the prompt to help Phi-3-vision understand the task and reason about it.
The following test focuses only on the capability of the world model and does not include the action engine part (per my testing, the Codestral model works quite well).
To run the test, I just took a random screenshot of my LinkedIn feed.
from PIL import Image
def world_model_infer(image, objective, url, action_result_log):
    # Thoughts for the next step, from the description & current image
    instruction_prompt = f"""You are an AI system specialized in high level reasoning. Your goal is to generate instructions for other specialized AIs to perform web actions to reach objectives given by humans.
Output format:
Thoughts: a list of thoughts in bullet points detailing your reasoning.
<<Example 1>>
Objective: Go on the quicktour of PEFT
URL : https://huggingface.co/docs
Thoughts:
- The current screenshot shows the documentation page on Huggingface.
- The objective is to go on the quicktour of PEFT.
- There is a section labeled 'PEFT' on the page.
- The next step is to click on the 'PEFT' section to explore further.
Objective : {objective}
URL : {url}
Thoughts:"""
    instruction_result = vqa_phi3(image, instruction_prompt)

    # suggest the action
    action_prompt = f"""You are now asked to generate the next web action; you should consider the previous action to generate the next action.
Here is the instruction :
{instruction_result}
Previous action that has been done :
{action_result_log[-1]}
Suggest the next ONE relevant web action to interact with, that would achieve the objective : {objective}.
Next Action :"""
    action_result = vqa_phi3(image, action_prompt)
    action_result_log.append(action_result)

    # select the next engine
    select_engine_prompt = f'''Pick your engine to finish the task described in :
{action_result}
Here is the Engine list that you can choose from :
- Python Engine: This engine is used when the task requires doing computing using the current state of the agent.
It does not impact the outside world and does not navigate.
- Navigation Engine: This engine is used when the next step of the task requires further navigation to reach the goal.
For instance it can be used to click on a link or to fill a form on a webpage. This engine is heavy and will do complex processing of the current HTML to decide which element to interact with.
- Navigation Controls: This engine is used to perform simple navigation. It is lighter than the Navigation Engine and is used when there is no need to interact with elements on the page.
Current controls are WAIT (to wait for a certain amount of time), BACK (to go back in the browser history), SCAN (to take screenshots of the whole page) and MAXIMIZE_WINDOW (to maximize the viewport of the driver).
Selected Engine:'''
    next_engine_result = vqa_phi3(image, select_engine_prompt)

    print("## Objective:", objective)
    print("## Instruction:", instruction_result)
    print("## Web Action:", action_result)
    print("## Next engine:", next_engine_result)
The image is the home page of the LinkedIn feed:
image1 = Image.open(f'/content/linkedin1.png')
action_result_log = ['']
objective = "Search for Machine Learning Related Information on LinkedIn"
url = "https://www.linkedin.com/feed/"
world_model_infer(image=image1,objective=objective,url=url,action_result_log=action_result_log)
## Objective: Search for Machine Learning Related Information on LinkedIn
## Instruction: Thoughts:
- The current screenshot shows the LinkedIn homepage.
- The objective is to search for machine learning related information on LinkedIn.
- There is a search bar at the top of the page.
- The next step is to enter keywords related to machine learning in the search bar and click on the search button.
-
## Web Action: Enter keywords related to machine learning in the search bar and click on the search button.
## Next engine: Navigation Engine
On the search page:
image2 = Image.open(f'/content/linkedin2.png')
objective = "Search for Machine Learning Related Information on LinkedIn"
url = "https://www.linkedin.com/search/results/all/?keywords=machine%20learning&origin=GLOBAL_SEARCH_HEADER&sid=kV8"
world_model_infer(image=image2,objective=objective,url=url,action_result_log=action_result_log)
## Objective: Search for Machine Learning Related Information on LinkedIn
## Instruction: Objective: Search for Machine Learning Related Information on LinkedIn
URL: https://www.linkedin.com/search/results/all/?keywords=machine%20learning&origin=GLOBAL_SEARCH_HEADER&sid=kV8
Thoughts:
- The current screenshot shows the LinkedIn search page.
- The objective is to search for machine learning related information.
- The search query contains the keywords 'machine learning'.
-
## Web Action: Browse through the search results and click on the 'People asked' section to find articles and discussions related to machine learning.
## Next engine: Navigation Engine
In general, I think simple prompt engineering can help Phi-3-vision better understand the task and generate a sensible response. This is just a simple test and the results make sense, but I would highly recommend testing with more use cases.
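For example, a simple harness for trying more screenshots and objectives with the same world-model prompts (the file names and the second test case below are hypothetical) could look like:

# Hypothetical test harness reusing world_model_infer from above
test_cases = [
    ("/content/linkedin1.png", "Search for Machine Learning Related Information on LinkedIn", "https://www.linkedin.com/feed/"),
    ("/content/hf_docs.png", "Go on the quicktour of PEFT", "https://huggingface.co/docs"),  # hypothetical screenshot
]
for path, objective, url in test_cases:
    action_result_log = ['']
    world_model_infer(image=Image.open(path), objective=objective, url=url, action_result_log=action_result_log)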
Cheers!
Hi @catyung, could you please share a requirements.txt file to resolve all the dependencies and get this code working on my local machine?
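No requirements file is provided in the thread; based purely on the imports in the code above, a starting point might look like this (unpinned and untested, and the lavague package names in particular are an assumption that may need adjusting):

# requirements.txt (assumed from the imports above, not provided by the author)
torch
transformers
bitsandbytes
accelerate
llama-index
llama-index-llms-huggingface
sentence-transformers
langchain
Pillow
requests
lavague-core
lavague-drivers-selenium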
Lots of people have asked for a working local version that does not rely on OpenAI.
So far, OSS models have not seemed good enough, but Phi-3 might make it.
It could be interesting if someone replaced the mm_llm and llm in both the WorldModel and ActionEngine to see if it is performant enough.