`# imports
import ast # for converting embeddings saved as strings back to arrays
import openai # for calling the OpenAI API
import pandas as pd # for storing text and embeddings data
import tiktoken # for counting tokens
from scipy import spatial # for calculating vector similarities for search
import os
openai.api_key = os.getenv("OPENAI_API_KEY")
the dataframe has two columns: "text" and "embedding"
df
search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    # Embed the query with the same model that produced df["embedding"].
    embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = embedding_response["data"][0]["embedding"]
    # Score every row against the query, then rank best-first.
    ranked = sorted(
        (
            (row["text"], relatedness_fn(query_embedding, row["embedding"]))
            for _, row in df.iterrows()
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    texts, scores = zip(*ranked)
    return texts[:top_n], scores[:top_n]
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    # Pick the tokenizer that matches the chat model so the count is exact.
    return len(tiktoken.encoding_for_model(model).encode(text))
def query_message(
query: str,
df: pd.DataFrame,
model: str,
token_budget: int
) -> str:
"""Return a message for GPT, with relevant source texts pulled from a dataframe."""
strings, relatednesses = strings_ranked_by_relatedness(query, df)
introduction = 'You are helpful AI assistant, If the answer cannot be found in your training data, Use the below articles to answer the subsequent question. "'
question = f"\n\nQuestion: {query}"
message = introduction
for string in strings:
next_article = f'\n\nWikipedia article section:\n"""\n{string}\n"""'
if (
num_tokens(message + next_article + question, model=model)
⚠️ 搜索是否存在类似issue
总结
用户可以结合自己的需求,搜集维基百科中的主题文章并转换成嵌入式文件embedding。用户在微信中提问时,机器人可以结合gpt-3.5-turbo和embedding两种策略回答。 由于我还是个菜鸟,根据OpenAI官方的cookbook,已经做好了嵌入式文件、搜索函数和提问函数,但是不知道如何跟咱们家仓库里的结合(没找到获取用户提问query的py文件),所以想请大神帮着看看。
`# imports import ast # for converting embeddings saved as strings back to arrays import openai # for calling the OpenAI API import pandas as pd # for storing text and embeddings data import tiktoken # for counting tokens from scipy import spatial # for calculating vector similarities for search import os openai.api_key = os.getenv("OPENAI_API_KEY")
# Models: the embedding model must match the one used to build the CSV;
# the chat model answers the user's question.
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
# Download pre-chunked text and pre-computed embeddings.
# This file is ~200 MB, so may take a minute depending on your connection speed.
# Raw string avoids backslash-escape issues in the Windows path
# (e.g. a segment starting with "t" or "n" would silently become a tab/newline).
embeddings_path = r"E:\Fort\AIGC\OpenAI\Embedding_Wikipedia_Articles\FIFA_World_Cup_2022.csv"
df = pd.read_csv(embeddings_path)
# Convert embeddings from CSV str type back to list type
# (literal_eval safely parses the "[0.1, 0.2, ...]" strings without eval()).
df['embedding'] = df['embedding'].apply(ast.literal_eval)
the dataframe has two columns: "text" and "embedding"
df
search function
def strings_ranked_by_relatedness( query: str, df: pd.DataFrame, relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y), top_n: int = 100 ) -> tuple[list[str], list[float]]: """Returns a list of strings and relatednesses, sorted from most related to least.""" query_embedding_response = openai.Embedding.create( model=EMBEDDING_MODEL, input=query, ) query_embedding = query_embedding_response["data"][0]["embedding"] strings_and_relatednesses = [ (row["text"], relatedness_fn(query_embedding, row["embedding"])) for i, row in df.iterrows() ] strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True) strings, relatednesses = zip(*strings_and_relatednesses) return strings[:top_n], relatednesses[:top_n]
def num_tokens(text: str, model: str = GPT_MODEL) -> int: """Return the number of tokens in a string.""" encoding = tiktoken.encoding_for_model(model) return len(encoding.encode(text))
def query_message( query: str, df: pd.DataFrame, model: str, token_budget: int ) -> str: """Return a message for GPT, with relevant source texts pulled from a dataframe.""" strings, relatednesses = strings_ranked_by_relatedness(query, df) introduction = 'You are helpful AI assistant, If the answer cannot be found in your training data, Use the below articles to answer the subsequent question. "' question = f"\n\nQuestion: {query}" message = introduction for string in strings: next_article = f'\n\nWikipedia article section:\n"""\n{string}\n"""' if ( num_tokens(message + next_article + question, model=model)
def ask( query: str, df: pd.DataFrame = df, model: str = GPT_MODEL, token_budget: int = 4096 - 500, print_message: bool = True, ) -> str: """Answers a query using GPT and a dataframe of relevant texts and embeddings.""" message = query_message(query, df, model=model, token_budget=token_budget) if print_message: print(message) messages = [ {"role": "system", "content": "You are helpful, creative, clever, and very friendly AI assistant."}, {"role": "user", "content": message}, ] response = openai.ChatCompletion.create( model=model, messages=messages, temperature=0 ) response_message = response["choices"][0]["message"]["content"] return response_message
set print_message=True to see the source text GPT was working off of
print(ask('2022卡塔尔世界杯冠军'))`
举例
user:2022年卡塔尔世界杯冠军? assistant:The 2022 FIFA World Cup champion is Argentina.
动机
gpt-3.5-turbo的训练数据停留在2021年9月,所以微信机器人在响应用户消息时有一定时效限制。学习了OpenAI官方的资料,看到可以将嵌入式搜索插入prompt,拓展gpt的静态数据模型,扩大知识库,增强机器人的回复性能。所以想尝试给咱们家微信机器人迭代。