Open K0nkere opened 11 months ago
import os
import argparse
from langchain.document_loaders.git import GitLoader
from git import Repo
from langchain.text_splitter import MarkdownTextSplitter
# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
def main(repo_path, repo_url, persist_directory):
    """Clone (or reopen) a git repository, split its markdown files into
    chunks and persist their embeddings into a Chroma vector store.

    Args:
        repo_path: Local path of the repository (cloned here if absent).
        repo_url: URL to clone from when ``repo_path`` does not exist yet.
        persist_directory: Directory where the Chroma DB is persisted.
    """
    if not os.path.exists(repo_path):
        repo = Repo.clone_from(repo_url, to_path=repo_path)
    else:
        repo = Repo(repo_path)

    # Bug fix: the original computed `branch` but then hard-coded
    # branch='master' in GitLoader, which fails on repositories whose
    # default branch is e.g. 'main'. Use the repo's actual HEAD branch.
    branch = repo.head.reference
    loader = GitLoader(
        repo_path=repo_path,
        branch=str(branch),
        file_filter=lambda f: f.endswith(".md"),
    )
    data = loader.load()

    text_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=0)
    all_splits = text_splitter.split_documents(data)

    vectorstore = Chroma.from_documents(
        documents=all_splits,
        embedding=HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-distilroberta-v1"
        ),
        persist_directory=persist_directory,
    )
    vectorstore.persist()
if __name__ == "__main__":
    # Fixed misleading help text: the script stores HuggingFace
    # sentence-transformer embeddings, not OpenAI embeddings.
    parser = argparse.ArgumentParser(description="Parameterized script for processing markdown files from a Github repo and storing embeddings in a chromadb.")
    parser.add_argument("--repo_path", default="/home/kkr/efko/projects/9_llm_with_langchain/data/label-studio-repo/", help="Path to the repository.")
    parser.add_argument("--repo_url", default="https://github.com/HumanSignal/label-studio", help="URL of the repository to clone.")
    parser.add_argument("--persist_directory", default="pd", help="Directory to persist the embeddings.")
    args = parser.parse_args()
    main(args.repo_path, args.repo_url, args.persist_directory)
Бэк на основе класса - генерация ответа по promt + отправка promt+response в LabelStudio
import label_studio_sdk as ls
from langchain.callbacks.base import BaseCallbackHandler
import json
import re
class LabelStudioCallbackHandler(BaseCallbackHandler):
    """LangChain callback that mirrors each (prompt, response) pair into a
    Label Studio project so the answers can be human-reviewed.

    Args:
        api_key: Label Studio API token.
        url: Base URL of the Label Studio server.
        project_id: Numeric id of the target Label Studio project.
    """

    def __init__(self, api_key, url, project_id):
        self.ls_client = ls.Client(url=url, api_key=api_key)
        self.ls_project = self.ls_client.get_project(project_id)
        # Maps run id -> list of prompts; filled in on_llm_start and
        # consumed in on_llm_end.
        self.prompts = {}

    @staticmethod
    def _run_key(kwargs):
        """Return a stable string key for a run, preferring the parent run id."""
        # .get() instead of [...] so a missing key does not raise KeyError.
        return str(kwargs.get("parent_run_id") or kwargs.get("run_id"))

    def on_llm_start(self, serialized, prompts, **kwargs):
        # Remember the prompts so they can be paired with generations later.
        self.prompts[self._run_key(kwargs)] = prompts

    def on_llm_end(self, response, **kwargs):
        run_id = self._run_key(kwargs)
        # pop with a default so a missing on_llm_start (or a duplicate
        # on_llm_end) does not raise KeyError.
        prompts = self.prompts.pop(run_id, None)
        if not prompts:
            return
        tasks = []
        for prompt, generation in zip(prompts, response.generations):
            # The chat prompt is serialized as "Human: [{'content': ...}]";
            # extract the bracketed list that follows "Human: ".
            match = re.search(r'Human: (\[.*?\])', prompt)
            if not match:
                continue
            json_string = match.group(1)
            # replace single quotes with double quotes for valid JSON
            # NOTE(review): this breaks on content containing apostrophes;
            # consider ast.literal_eval instead.
            data = json.loads(json_string.replace('\'', '\"'))
            # Extract the 'content' field from the first dictionary in the list
            content = data[0]["content"]
            tasks.append({'prompt': content, 'response': generation[0].text})
        if tasks:
            self.ls_project.import_tasks(tasks)
Запуск интерфейса чат-бота и обращение к модели для формирования эмбеддингов. Необходимо задать переменные окружения: export LABEL_STUDIO_API_KEY=<...> — токен из интерфейса Label Studio; export OPENAI_API_KEY=<...> — ключ для модели.
import os
import openai
import gradio as gr
import argparse
import logging
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains import RetrievalQA
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chat_models import ChatOpenAI
import label_studio_sdk as ls
from label_studio_callback_handler import LabelStudioCallbackHandler
# Configure root logging once for this script and grab a module logger.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
LOG_DATEFMT = '%d-%b-%y %H:%M:%S'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATEFMT)
logger = logging.getLogger(__name__)
def main(api_key, ls_url, project_id, persist_dir):
    """Launch a Gradio chat UI backed by a RetrievalQA chain.

    Every LLM call is mirrored into Label Studio via the callback handler
    so that responses can be reviewed and labeled.

    Args:
        api_key: Label Studio API token.
        ls_url: Base URL of the Label Studio server.
        project_id: Target Label Studio project id.
        persist_dir: Directory holding the persisted Chroma embeddings.
    """
    labelstudio_callback = LabelStudioCallbackHandler(
        api_key=api_key,
        url=ls_url,
        project_id=project_id,
    )

    # Reopen the vector store built by the indexing script.
    store = Chroma(
        persist_directory=persist_dir,
        embedding_function=HuggingFaceEmbeddings(),  # OpenAIEmbeddings())
    )

    chat_llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=0,
        max_tokens=1000,
        callbacks=[labelstudio_callback],
    )
    qa_chain_with_labelstudio = RetrievalQA.from_chain_type(
        llm=chat_llm,
        chain_type="stuff",
        retriever=store.as_retriever(),
        return_source_documents=True,
    )

    def predict(message, history):
        # NOTE: `history` is ignored — each turn sends only the latest
        # user message to the chain.
        payload = [{"role": "user", "content": message}]
        response = qa_chain_with_labelstudio({"query": str(payload)})
        return response['result']

    gr.ChatInterface(predict).queue().launch(debug=True)
if __name__ == "__main__":
    # Command-line entry point for the chat interface.
    cli = argparse.ArgumentParser(description="Parameterized script for chat interface.")
    cli.add_argument(
        "--api_key",
        default=os.environ.get('LABEL_STUDIO_API_KEY', 'fallback_value'),
        help="Label Studio API key.",
    )
    cli.add_argument("--ls_url", default="http://0.0.0.0:8080", help="Label Studio URL.")
    cli.add_argument("--project_id", type=int, help="Label Studio project ID.")
    cli.add_argument("--persist_dir", default="pd", help="Persist directory for vectorstore.")
    ns = cli.parse_args()
    main(ns.api_key, ns.ls_url, ns.project_id, ns.persist_dir)
Подгрузка размеченных верных вариантов в дополнение к базовым эмбеддингам
import json
import argparse
from langchain.docstore.document import Document
# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
def extract_good_examples_from_jsonl(jsonl_path):
    """Extract examples labeled "Good" from a Label Studio export.

    Accepts either a regular JSON export (one top-level array) or a JSONL
    export (one JSON object per line). The original implementation only
    handled the JSON-array form despite its name and docstring.

    Args:
        jsonl_path (str): Path to the exported dataset (JSON or JSONL).

    Returns:
        list: The ``data`` payloads, e.g. ``[{'prompt': '', 'response': '', ...}]``,
        of every record whose annotations contain a "Good" choice.
    """
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        text = f.read()
    try:
        # Regular JSON export: a single top-level array of records.
        data = json.loads(text)
    except json.JSONDecodeError:
        # JSONL export: one record per non-empty line.
        data = [json.loads(line) for line in text.splitlines() if line.strip()]
    if isinstance(data, dict):
        # A one-line export parses as a single object; normalize to a list.
        data = [data]
    return [
        record['data']
        for record in data
        if any(
            result['value']['choices'][0] == "Good"
            for annotation in record['annotations']
            for result in annotation['result']
        )
    ]
def main(json_path, persist_directory):
    """Main function to extract and store good examples in vectordb.

    Args:
        json_path: Path to the exported annotations file.
        persist_directory: Directory where the Chroma DB is persisted.
    """
    # Same embedding model as the one used to build the base index.
    embeddings = HuggingFaceEmbeddings(
        # model_name="sentence-transformers/all-MiniLM-L6-v2"
        model_name="sentence-transformers/all-distilroberta-v1"
    )  # OpenAIEmbeddings()
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )

    # Turn each approved example into a Document tagged with its source file.
    documents = [
        Document(page_content=str(example), metadata={"source": json_path})
        for example in extract_good_examples_from_jsonl(json_path)
    ]

    # Add documents to the vector store and persist.
    vectorstore.add_documents(documents)
    vectorstore.persist()
if __name__ == "__main__":
    # CLI entry point: feed approved Label Studio examples into the vectordb.
    cli = argparse.ArgumentParser(description="Extract 'good' labeled examples from a JSONL file and add to vectordb.")
    cli.add_argument("--json_path", required=True, help="Path to the JSONL file with labeled examples.")
    cli.add_argument("--persist_dir", required=True, help="Directory where vectordb embeddings are stored.")
    ns = cli.parse_args()
    main(ns.json_path, ns.persist_dir)
curl -X GET http://0.0.0.0:8080/api/projects/1/export?exportType=JSON -H 'Authorization: Token 77a1656801476968548f4402fe54fc7e847e9400' --output 'annotations.json'
curl -X GET http://localhost:8085/api/storages/export/types -H 'Authorization: Token 77a1656801476968548f4402fe54fc7e847e9400'
curl -X GET http://localhost:8080/api/projects/ -H 'Authorization: Token 77a1656801476968548f4402fe54fc7e847e9400'
curl -X GET http://0.0.0.0:8080/api/projects/1/tasks/ -H 'Authorization: Token 77a1656801476968548f4402fe54fc7e847e9400'
curl -H 'Content-Type: application/json' -H 'Authorization: Token 77a1656801476968548f4402fe54fc7e847e9400' \
-X POST 'http://localhost:8080/api/projects/1/import' --data '
{
"id": 18,
"data": {
"prompt": "What is LangFlow?",
"response": "LabelStudio is..."
},
"annotations": [
{
"id": 6,
"created_username": " kkr@efko.ru, 1",
"created_ago": "1 hour, 10 minutes",
"completed_by": {
"id": 1,
"first_name": "",
"last_name": "",
"avatar": null,
"email": "kkr@efko.ru",
"initials": "kk"
},
"result": [
{
"value": {
"choices": [
"Good"
]
},
"id": "DtTq-aYToD",
"from_name": "rating",
"to_name": "response",
"type": "choices",
"origin": "manual"
}
],
"was_cancelled": false,
"ground_truth": false,
"created_at": "2023-08-31T14:23:05.601957Z",
"updated_at": "2023-08-31T14:23:05.601989Z",
"lead_time": 3.025,
"task": 6,
"project": 1,
"parent_prediction": null,
"parent_annotation": null
}
],
"predictions": []
}'
import requests
import json

# Task-import endpoint of Label Studio project 1.
url = 'http://0.0.0.0:8080/api/projects/1/tasks/'
header = {
    'Content-Type': 'application/json',
    'Authorization': 'Token 77a1656801476968548f4402fe54fc7e847e9400',
}

# Fixed: the original encoded JSON booleans/nulls as the strings "false"
# and "null", which the server would receive as (truthy) non-empty strings.
# Use real Python False/None so json.dumps emits proper false/null, matching
# the equivalent curl payload.
record = {
    "id": 18,
    "data": {
        "prompt": "Who is LangFlow?",
        "response": "..."
    },
    "annotations": [
        {
            "id": 6,
            "created_username": " kkr@efko.ru, 1",
            "created_ago": "1 hour, 10 minutes",
            "completed_by": {
                "id": 1,
                "first_name": "",
                "last_name": "",
                "avatar": None,
                "email": "kkr@efko.ru",
                "initials": "kk"
            },
            "result": [
                {
                    "value": {
                        "choices": [
                            "Good"
                        ]
                    },
                    "id": "DtTq-aYToD",
                    "from_name": "rating",
                    "to_name": "response",
                    "type": "choices",
                    "origin": "manual"
                }
            ],
            "was_cancelled": False,
            "ground_truth": False,
            "created_at": "2023-08-31T14:23:05.601957Z",
            "updated_at": "2023-08-31T14:23:05.601989Z",
            "lead_time": 3.025,
            "task": 6,
            "project": 1,
            "parent_prediction": None,
            "parent_annotation": None
        }
    ],
    "predictions": []
}

response = requests.post(
    url=url,
    headers=header,
    data=json.dumps(record)
).json()
Example
Installing requirements.txt
Installing ChromaDB
On ERROR:
ERROR: Could not build wheels for chroma-hnswlib, which is required to install pyproject.toml-based projects
Запуск LabelStudio
Локально
Докер образ
Создание проекта
Логин в UI localhost:8080 > Create Project Выбрать шаблон интерфейса разметки из готовых или создать свой Custom