eunja511005 / AutoCoding

0 stars 0 forks source link

허깅 페이스 사용해 보기 #189

Open eunja511005 opened 2 months ago

eunja511005 commented 2 months ago

https://www.linkedin.com/pulse/exploring-googles-gemma-2b-new-era-language-models-rany-lxhlc image

https://limitsinx.tistory.com/67 ④ GPU 환경세팅 : 런타임 -> 런타임 유형 변경 -> T4 GPU 선택 후 저장

eunja511005 commented 2 months ago
# Notebook shell escapes (the leading "!" hands the line to the shell, so this
# only works inside IPython/Jupyter/Colab, not in a plain .py script).
# Installs the packages the code below relies on.
# NOTE(review): the imports below also use `langchain_community`; confirm it is
# pulled in by the installed `langchain` version or install it explicitly.
!pip install langchain
!pip install chromadb
!pip install sentence-transformers

import os
from google.colab import userdata
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings, HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'.
# NOTE(review): hf_token is not referenced again in the visible code — confirm
# whether it still has to be handed to the model download step.
hf_token = userdata.get('HF_TOKEN')

# Source document: a single plain-text file served from the constitution-kr
# repository on raw.githubusercontent.com.
loader = WebBaseLoader(
    web_paths=("https://raw.githubusercontent.com/puzzlet/constitution-kr/master/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%20%ED%97%8C%EB%B2%95.txt",),
)

# One load/split pass is enough: repeating it only fetches the same URL again
# and rebuilds identical chunks.
raw_documents = loader.load()

# Chunking parameters: chunk_size=1000 with no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
splits = text_splitter.split_documents(raw_documents)

# Bare expression: shows the number of chunks when it is the last line of a
# notebook cell; otherwise it has no effect.
len(splits)

# Alternative kept for reference: calling HuggingFaceEmbeddings() with no
# arguments uses the wrapper's default model.
# embeddings = HuggingFaceEmbeddings()
# NOTE(review): "google/gemma-1.1-2b-it" is published as an instruction-tuned
# text-generation model, not a sentence-embedding model — confirm it is really
# the intended embedding backbone and that access to it is authorised.
embeddings = HuggingFaceEmbeddings(model_name="google/gemma-1.1-2b-it")

# Embed every chunk and index it in a Chroma vector store.
db = Chroma.from_documents(splits, embeddings)

# Retriever that returns the 5 nearest chunks per query.
# (The name keeps its original spelling so other cells referring to it still work.)
retriver = db.as_retriever(search_kwargs={"k": 5})

# `invoke` is the current retriever entry point; `get_relevant_documents`
# is deprecated in recent LangChain releases.
relevants = retriver.invoke("대통령 임기는?")

# Print the hits with a 1-based rank; enumerate replaces a hand-kept counter.
for i, relevant in enumerate(relevants, start=1):
    print(f"{i}번째 페이지 : ")
    print("***" * 20)
    print(relevant.page_content)