eunja511005 / AutoCoding

0 stars 0 forks source link

파이썬 참고 #180

Open ywbestPark opened 2 months ago

ywbestPark commented 2 months ago
# Colab-only setup: read the 'OPENAI_API_KEY' entry through google.colab's
# userdata helper and expose it as the OPENAI_API_KEY environment variable.
import os

from google.colab import userdata
# NOTE(review): os.environ only accepts str values, so this presumably fails
# if the secret is missing and userdata.get returns a non-string — confirm.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

 

# Notebook shell commands: the leading "!" is IPython/Colab syntax, not Python.
!pip install openai # Install the openai library.
!pip install langchain # Install the LangChain library.
!pip install tqdm
!pip install chromadb # Vector store
!pip install tiktoken # For counting tokens
!pip install sentence-transformers

import urllib.request

# Download the sample text from the URL below and save it in the current
# working directory as state_of_the_union.txt.
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt",
    filename="state_of_the_union.txt"
)
# NOTE(review): these import paths may be deprecated or relocated in newer
# LangChain releases — verify against the installed version.
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Load the downloaded file, split it (chunk_size=1000, chunk_overlap=0), and
# build a Chroma store from the pieces using OpenAIEmbeddings.
raw_documents = TextLoader('state_of_the_union.txt').load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
db = Chroma.from_documents(documents, OpenAIEmbeddings())
# Bare expressions are for notebook inspection; a notebook only displays one
# when it is the last statement of its cell.
documents
len(documents)
documents[0:4]
# Search by query text and print the content of the first returned document.
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)
print(docs[0].page_content)
# Same search, but embedding the query explicitly and searching by vector.
embedding_vector = OpenAIEmbeddings().embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)
print(docs[0].page_content)
# Inspect the embedding's length and raw values.
len(embedding_vector)
embedding_vector
from tqdm import tqdm
class SimpleTextLoader:
    """Minimal loader that reads a whole text file into a single string."""

    def __init__(self, file_path):
        # Only the path is remembered here; the file is opened in load().
        self.file_path = file_path

    def load(self):
        """Return the complete contents of ``self.file_path`` decoded as UTF-8."""
        with open(self.file_path, mode='r', encoding='utf-8') as handle:
            return handle.read()
class SimpleCharacterTextSplitter:
    """Split raw text on a literal separator and regroup the pieces into chunks.

    Pieces are merged greedily: a piece is added to the current chunk as long
    as the merged length (including the separator) does not exceed
    ``chunk_size``. A single piece longer than ``chunk_size`` is not cut any
    further and is emitted as one oversized chunk.
    """

    def __init__(self, chunk_size, chunk_overlap, separator_pattern='\n\n'):
        self.chunk_size = chunk_size
        # Stored only; split_documents() does not apply any overlap.
        self.chunk_overlap = chunk_overlap
        # Passed to str.split(), so it is matched literally, not as a regex.
        self.separator_pattern = separator_pattern

    def split_documents(self, documents):
        """Return the list of stripped, non-empty chunks built from ``documents``.

        Args:
            documents: The full text to split, as one string.

        Returns:
            A list of strings. Whitespace-only chunks are dropped so callers
            never receive empty strings (for example, to embed).
        """
        splits = documents.split(self.separator_pattern)
        separator_length = len(self.separator_pattern)

        chunks = []
        current_chunk = splits[0]

        for split in tqdm(splits[1:], desc="splitting..."):
            if len(current_chunk) + len(split) + separator_length > self.chunk_size:
                # Adding this piece would overflow: close the current chunk.
                self._append_chunk(chunks, current_chunk)
                current_chunk = split
            else:
                current_chunk += self.separator_pattern + split

        self._append_chunk(chunks, current_chunk)
        return chunks

    @staticmethod
    def _append_chunk(chunks, text):
        """Append ``text`` stripped of surrounding whitespace, unless it is empty."""
        stripped = text.strip()
        if stripped:
            chunks.append(stripped)
from openai import OpenAI

class SimpleOpenAIEmbeddings:
    """Thin wrapper that turns a text into an embedding via the OpenAI API."""

    def __init__(self, model="text-embedding-ada-002"):
        """Remember which embedding model to request.

        Args:
            model: Name of the embedding model passed to the API. Defaults to
                the model this class has always used.
        """
        self.model = model
        # Created on first use so that constructing this object never needs
        # credentials or network access.
        self._client = None

    def embed_query(self, text):
        """Return the embedding of ``text`` from the first item of the response.

        Args:
            text: The input forwarded to ``client.embeddings.create``.
        """
        if self._client is None:
            # Reuse one client for all calls instead of building one per query.
            self._client = OpenAI()
        response = self._client.embeddings.create(
            input=text,
            model=self.model
        )
        return response.data[0].embedding
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SimpleVectorStore:
    """In-memory vector store that ranks documents by cosine similarity."""

    def __init__(self, docs, embedding):
        """Embed every item of ``docs`` and keep the documents and vectors.

        Args:
            docs: Iterable of documents; each one is passed to
                ``embedding.embed_query``.
            embedding: Object exposing ``embed_query(text)``.
        """
        self.embedding = embedding
        self.documents = []
        self.vectors = []

        for doc in tqdm(docs, desc="embedding..."):
            # Embed first so the two lists only ever grow together.
            vector = self.embedding.embed_query(doc)
            self.documents.append(doc)
            self.vectors.append(vector)

    def similarity_search(self, query, k=4):
        """Return up to ``k`` ``(document, similarity)`` pairs, best match first.

        Args:
            query: Text to embed and compare against the stored vectors.
            k: Maximum number of results.

        Returns:
            A list of ``(document, similarity)`` tuples sorted by descending
            similarity, or an empty list when the store is empty or ``k`` is
            not positive.
        """
        # Check the cheap conditions before embedding, so that an empty store
        # or a non-positive k never triggers an embedding request.
        if not self.vectors or k <= 0:
            return []

        query_vector = self.embedding.embed_query(query)

        # cosine_similarity returns one row per query vector; take the only row.
        similarities = cosine_similarity([query_vector], self.vectors)[0]
        sorted_doc_similarities = sorted(
            zip(self.documents, similarities),
            key=lambda pair: pair[1],
            reverse=True,
        )

        return sorted_doc_similarities[:k]

    def as_retriever(self, k=4):
        """Return a SimpleRetriever bound to this store with the given ``k``."""
        return SimpleRetriever(self, k)
class SimpleRetriever:
    """Adapter that queries a vector store with a fixed result count."""

    def __init__(self, vector_store, k=4):
        # k is forwarded to every similarity_search call made by this object.
        self.k = k
        self.vector_store = vector_store

    def get_relevant_documents(self, query):
        """Return whatever the store's ``similarity_search(query, k)`` returns."""
        store = self.vector_store
        return store.similarity_search(query, self.k)
# Same pipeline as the LangChain version, rebuilt from the Simple* classes:
# load the text, split it into chunks, embed each chunk, then search.
raw_documents = SimpleTextLoader('state_of_the_union.txt').load()
# chunk_overlap is accepted by the constructor but not used by split_documents.
text_splitter = SimpleCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
# Building the store calls embed_query once per chunk.
db = SimpleVectorStore(documents, SimpleOpenAIEmbeddings())
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)
# Each result is a (document, similarity) pair; print the best match's text.
print(docs[0][0])
ywbestPark commented 2 months ago
import urllib.request

# Download the file at the URL below and save it in the current working
# directory as korea_constitution.txt.
# NOTE(review): presumably the text of the Korean constitution, judging by the
# repository and output file names — confirm.
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/puzzlet/constitution-kr/master/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%20%ED%97%8C%EB%B2%95.txt",
    filename="korea_constitution.txt"
)
ywbestPark commented 2 months ago

import urllib.request

# Download the sample text from the URL below and save it in the current
# working directory as state_of_the_union.txt.
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt",
    filename="state_of_the_union.txt"
)