eunja511005 opened 7 months ago
# %%
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://js.langchain.com/docs/modules/model_io/concepts")
data = loader.load()
data
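# %% [markdown]
# Quick sanity check (an optional sketch, not in the original): the loader returns one
# Document per URL, with the page text in page_content and the source URL in metadata.
# %%
print(len(data))
print(data[0].metadata)
print(data[0].page_content[:300])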
# %%
!pip install lxml
# %% [markdown]
# ### Use HTMLHeaderTextSplitter to get cleaner, more refined data
# - https://python.langchain.com/docs/modules/data_connection/document_transformers/HTML_header_metadata/
# %%
from langchain_text_splitters import HTMLHeaderTextSplitter
html_string = """
<!DOCTYPE html>
<html>
<body>
<div>
<h1>Foo</h1>
<p>Some intro text about Foo.</p>
<div>
<h2>Bar main section</h2>
<p>Some intro text about Bar.</p>
<h3>Bar subsection 1</h3>
<p>Some text about the first subtopic of Bar.</p>
<h3>Bar subsection 2</h3>
<p>Some text about the second subtopic of Bar.</p>
</div>
<div>
<h2>Baz</h2>
<p>Some text about Baz</p>
</div>
<br>
<p>Some concluding text about Foo</p>
</div>
</body>
</html>
"""
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)
html_header_splits
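# %% [markdown]
# A minimal inspection (optional sketch): each split is a Document whose metadata records
# the headers it sits under, e.g. {'Header 1': 'Foo', 'Header 2': 'Bar main section'}.
# %%
for doc in html_header_splits:
    print(doc.metadata, "->", doc.page_content[:60])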
# %%
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import HTMLHeaderTextSplitter
# url = "https://js.langchain.com/docs/modules/model_io/concepts"
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# html_header_splits = html_splitter.split_text_from_url(url)
html_header_splits = html_splitter.split_text_from_file("Concepts_Langchain.html")
print(html_header_splits)
chunk_size = 1000
chunk_overlap = 60
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
splits = text_splitter.split_documents(html_header_splits)
print(len(splits))
splits[0:5]
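# %% [markdown]
# Optional check: confirm each chunk stays under chunk_size and that the header metadata
# from the HTML split survives the character-level split.
# %%
for chunk in splits[:3]:
    print(len(chunk.page_content), chunk.metadata)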
# %% [markdown]
# ### Create the embeddings, vector store, and retriever
# - https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore/
# %%
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
embeddings = OllamaEmbeddings(model="gemma:2b")
db = Chroma.from_documents(splits, embeddings)
query = "What is the difference of LLMs and Chat Models?"
docs = db.similarity_search(query)
print(docs)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 6})
docs = retriever.invoke(query)
print(docs)
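# %% [markdown]
# An alternative retriever (a sketch, not part of the original notebook): MMR re-ranks
# results for diversity instead of pure similarity; fetch_k controls how many candidates
# are fetched before re-ranking.
# %%
retriever_mmr = db.as_retriever(search_type="mmr", search_kwargs={"k": 6, "fetch_k": 20})
print(retriever_mmr.invoke(query))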
# %% [markdown]
# ### Build the RAG chain
# - https://python.langchain.com/docs/use_cases/question_answering/quickstart/
# %% [markdown]
# ### Vector Store reference
# - https://python.langchain.com/docs/modules/data_connection/vectorstores/
#
# ### Persisting the Vector Store DB
# - https://python.langchain.com/docs/integrations/vectorstores/chroma/
# %%
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
llm = Ollama(model="gemma:2b")
# prompt = hub.pull("rlm/rag-prompt")
prompt = ChatPromptTemplate.from_template("""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:""")
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
rag_chain.invoke({"question": query})
# %%
# Save to disk: persist the full set of splits, not just the previously retrieved docs.
db2 = Chroma.from_documents(splits, embeddings, persist_directory="./chroma_db")
docs = db2.similarity_search(query)
# load from disk
db3 = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
docs = db3.similarity_search(query)
print(docs[0].page_content)
# %%
import os
from langchain_chroma import Chroma
# Function to check if the directory exists and contains files
def directory_exists_and_not_empty(path):
    return os.path.exists(path) and os.path.isdir(path) and len(os.listdir(path)) > 0
# Path to the directory where the Chroma DB will be stored
persist_directory = "./chroma_db"
# Check if the directory exists and is not empty
if directory_exists_and_not_empty(persist_directory):
    # Load the database from the existing directory
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    # Create a new database from the document splits, since the directory does not exist or is empty
    db = Chroma.from_documents(splits, embeddings, persist_directory=persist_directory)
# Perform a similarity search
search_results = db.similarity_search(query)
# Print the page content of the first document in the search results
if search_results:
    print(search_results[0].page_content)
else:
    print("No documents found.")
# %% [markdown]
# ### Challenge
# - https://github.com/ollama/ollama/tree/main/examples/langchain-python-rag-privategpt
# %%
import os
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import HTMLHeaderTextSplitter
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# html_header_splits = html_splitter.split_text_from_url(url)
html_header_splits = html_splitter.split_text_from_file("Concepts_Langchain.html")
chunk_size = 1000
chunk_overlap = 60
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
splits = text_splitter.split_documents(html_header_splits)
model_name = "llama2"
embeddings = OllamaEmbeddings(model=model_name)
def directory_exists_and_not_empty(path):
    return os.path.exists(path) and os.path.isdir(path) and len(os.listdir(path)) > 0
persist_directory = "./chroma_db"
if directory_exists_and_not_empty(persist_directory):
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
else:
    db = Chroma.from_documents(splits, embeddings, persist_directory=persist_directory)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 6})
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
llm = Ollama(model=model_name)
# prompt = hub.pull("rlm/rag-prompt")
prompt = ChatPromptTemplate.from_template("""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:""")
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
query = "What is the difference of LLMs and Chat Models?"
rag_chain.invoke({"question": query})
Task overview

ChatGPT prompt:

Okay, let's implement RAG using Python. Follow the steps below to implement the Python program.

[Notes]