Open kurtzace opened 1 week ago
from langchain_anthropic import ChatAnthropic
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# Minimal LCEL pipeline: prompt template -> chat model -> plain-string parser.
prompt = ChatPromptTemplate.from_template("tell me a joke about {topic}")
model = ChatAnthropic(model_name="claude-3-haiku-20240307")
chain = prompt | model | StrOutputParser()

# The input dict supplies the {topic} placeholder of the prompt template.
result = chain.invoke({"topic": "bears"})
print(result)
- Router Chain (using a custom function):
Suppose you classify user questions as being about LangChain, Anthropic, or something else. Here’s an example of routing:
def route(info):
    """Select the downstream chain for an already-classified question.

    Args:
        info: Mapping with a ``"topic"`` entry holding the classifier's
            output as a string. Matching is case-insensitive substring
            matching.

    Returns:
        ``anthropic_chain`` when the topic mentions "anthropic",
        ``langchain_chain`` when it mentions "langchain", otherwise
        ``general_chain``. "anthropic" is checked first, so it wins when
        both words appear.
    """
    # Normalise once instead of lower-casing the topic for every comparison.
    topic = info["topic"].lower()
    if "anthropic" in topic:
        return anthropic_chain
    if "langchain" in topic:
        return langchain_chain
    return general_chain
from langchain_core.runnables import RunnableLambda

# One prompt per route; `route` returns one of these based on the topic.
langchain_chain = ChatPromptTemplate.from_template("As Harrison Chase told me, {question}")
anthropic_chain = ChatPromptTemplate.from_template("As Dario Amodei told me, {question}")
general_chain = ChatPromptTemplate.from_template("{question}")

# The dict runs both entries on the same input: "topic" is produced by
# `chain`, "question" is passed through unchanged. The result is fed to `route`.
# NOTE(review): `chain` receives {"question": ...} here, so it must be a
# classifier chain that accepts a "question" input - confirm before running.
full_chain = {"topic": chain, "question": lambda x: x["question"]} | RunnableLambda(route)

result = full_chain.invoke({"question": "how do I use Anthropic?"})
print(result)
- math chain
from langchain.chains import LLMMathChain
from langchain_community.llms import OpenAI

# Build the math chain on top of an OpenAI completion model.
llm_math = LLMMathChain.from_llm(OpenAI())

expression = "3 + 5 * 2"
# LLMMathChain reads its input from the "question" key.
result = llm_math.invoke({"question": expression})
print(f"Result of {expression}:", result)
- Transform Chain (example is sentiment)
from langchain_core.output_parsers import StrOutputParser
raw_text = "This product is amazing! Five stars."
cleaned_text = preprocess_text(raw_text) tokens = tokenize_text(cleaned_text) embeddings = convert_to_embeddings(tokens)
output_parser = StrOutputParser() result = output_parser.invoke({"embeddings": embeddings}) print(result)
or
Let’s create a transform chain that extracts relevant information from a text using regular expressions (regex) and then generates an NDA (Non-Disclosure Agreement) using an LLM (Language Model). We’ll follow these steps:
Extract Relevant Information using Regex: We’ll search for specific patterns (e.g., names, dates, confidential information) in a given text using Python’s re module.
import re from langchain_anthropic import ChatAnthropic
def extract_info(raw_text):
    """Pull the company, employee and effective date out of free text.

    Args:
        raw_text: Text expected to contain ``Company <name>``,
            ``Employee <name>`` and ``effective date is YYYY-MM-DD``.

    Returns:
        A ``(company_name, employee_name, effective_date)`` tuple of strings
        when all three patterns are found, otherwise ``None``.
    """
    company_match = re.search(r"Company (\w+)", raw_text)
    employee_match = re.search(r"Employee (\w+)", raw_text)
    date_match = re.search(r"effective date is (\d{4}-\d{2}-\d{2})", raw_text)

    # All three fields are required; a partial match is treated as a failure.
    if company_match and employee_match and date_match:
        company_name = company_match.group(1)
        employee_name = employee_match.group(1)
        effective_date = date_match.group(1)
        return company_name, employee_name, effective_date
    else:
        return None
raw_text = "This agreement is between Company X and Employee Y. The effective date is 2023-07-15."
info = extract_info(raw_text)
if info:
    company_name, employee_name, effective_date = info
    # Step 2: Generate NDA using LLM
    # The template body stays flush-left so no indentation leaks into the text.
    nda_template = f"""
Non-Disclosure Agreement (NDA)
This Agreement ("Agreement") is entered into by and between {company_name} ("Company") and {employee_name} ("Employee") as of {effective_date}.
1. Purpose:
The purpose of this Agreement is to protect confidential information exchanged between Company and Employee during the course of their engagement.
2. Confidential Information:
Employee agrees not to disclose any confidential information obtained from Company, including but not limited to trade secrets, customer lists, and proprietary data.
3. Duration:
This Agreement shall remain in effect for a period of two years from the effective date.
Signed:
{company_name} (Company)
{employee_name} (Employee)
"""
    # Use an LLM to fill in any missing details (e.g., boilerplate language)
    # ChatAnthropic needs an Anthropic model id; "gpt-3.5-turbo" is an OpenAI model.
    model = ChatAnthropic(model_name="claude-3-haiku-20240307")
    # Chat models take a prompt string (or messages), not a {"text": ...} dict;
    # the reply is a message object whose text is in `.content`.
    nda_text = model.invoke(nda_template).content
    print(nda_text)
else:
    print("Unable to extract relevant information from the raw text.")
## memory
- Conversation Buffer Memory
from langchain.chains import LLMChain from langchain.memory import ConversationBufferMemory from langchain_core.prompts import PromptTemplate from langchain_openai import OpenAI
template = """ You are a chatbot having a conversation with a human. {chat_history} Human: {human_input} Chatbot: """ prompt = PromptTemplate(input_variables=["chat_history", "human_input"], template=template) memory = ConversationBufferMemory(memory_key="chat_history") llm = OpenAI() llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True, memory=memory) llm_chain.predict(human_input="Hi there my friend")
- Chat Model with Memory
you can inject memory into the chat prompt.
from langchain_core.messages import SystemMessage from langchain_core.prompts import ChatPromptTemplate from langchain_openai import ChatOpenAI
prompt = ChatPromptTemplate.from_messages([ SystemMessage(content="You are a chatbot having a conversation with a human."),
])
- Conversation Summary Buffer Memory in the context of LLMChain. This memory type combines two ideas: it keeps a buffer of recent interactions, but instead of completely flushing old interactions, it compiles them into a summary and uses both. [Unlike previous implementations, it uses token length rather than the number of interactions to determine when to flush interactions](https://python.langchain.com/v0.1/docs/modules/memory/types/summary_buffer/)
from langchain.memory import ConversationSummaryBufferMemory
from langchain.llms import OpenAI

llm = OpenAI()
# max_token_limit is the token budget for the buffer (see the note above:
# flushing is decided by token length, not by number of interactions).
memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=2000)

# Record two human/AI exchanges.
memory.save_context({"input": "Hi"}, {"output": "What's up?"})
memory.save_context({"input": "Not much, you?"}, {"output": "Not much"})

memory.load_memory_variables({})

# Ask the memory's LLM for a summary of the stored messages.
messages = memory.chat_memory.messages
previous_summary = ""  # Initialize with an empty summary
new_summary = memory.predict_new_summary(messages, previous_summary)
<img width="463" alt="image" src="https://github.com/kurtzace/diary-2024/assets/2136211/aad61a8c-b29d-4cbb-8f0d-341dac078860">
<img width="844" alt="image" src="https://github.com/kurtzace/diary-2024/assets/2136211/684af46f-b4bb-4a84-a809-1eb62927fee9">
- Gradio gives chat interface
<img width="488" alt="image" src="https://github.com/kurtzace/diary-2024/assets/2136211/cbd7794d-6e9c-4870-8420-037f6e7bcf11">
## Agents
input->Agent-> agentFinish/AgentAction-> observation
Agents can access Google and Wikipedia, recover from errors, and do preprocessing.
select right tool: fine tuning/few shot learning
ReAct: synergizing reasoning and acting in language models. Zero-shot ReAct: solving math, generating creative content
[babyagi](https://github.com/yoheinakajima/babyagi) - Chroma vector DB / creates, prioritizes, and executes tasks
![image](https://user-images.githubusercontent.com/21254008/235015461-543a897f-70cc-4b63-941a-2ae3c9172b11.png)
may get into loops -
see paper "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models" - separates the planner from the executor (may result in more LLM calls)
function calling - from openAI
Other agents: Self-ask with search; ReAct docstore: an alternative to a vector store; CAMEL: complex games
<img width="231" alt="image" src="https://github.com/kurtzace/diary-2024/assets/2136211/6c74748d-f1e0-42a1-82ae-b6f3462602bd">
![image](https://github.com/kurtzace/diary-2024/assets/2136211/f9e1d5ae-e89c-48b0-bd0f-16ad604c6af0)
to write and execute Python code
<img width="448" alt="image" src="https://github.com/kurtzace/diary-2024/assets/2136211/4a41aa08-1f2f-47a5-a688-b9aa3fcac4e5">
- Custom tools and conversation agents
own tool
<img width="279" alt="image" src="https://github.com/kurtzace/diary-2024/assets/2136211/21a15c7a-2826-46dd-8dc2-ce5f3ae41d7d">
agent plus memory
<img width="422" alt="image" src="https://github.com/kurtzace/diary-2024/assets/2136211/766ea40b-ab56-4b12-95f0-ad709a3a685c">
## Indexes and vector databases
- Doc loaders
- RAG (retrieval-augmented generation)
- vector pinecone or chroma
- Langsmith - prod readiness
### LangChain Document Loaders
page_content and metadata
TextLoader,WebPageLoader, YouTubeTranscriptLoader
from langchain.document_loaders import CSVLoader, TextLoader

# Plain-text file loaded through TextLoader.
loader = TextLoader("./data/sample.txt")
document = loader.load()

# CSV file loaded through CSVLoader.
loader = CSVLoader("./data/data.csv")
documents = loader.load()
pdf
from langchain_community.document_loaders.pdf import UnstructuredPDFLoader
file_path = ["./example_data/whatsapp_chat.txt", "./example_data/layout-parser-paper.pdf"] loader = UnstructuredPDFLoader(file_path) docs = loader.load() print(docs[0].page_content[:400])
html
from langchain_community.document_loaders import UnstructuredHTMLLoader
file_path = "../../../docs/integrations/document_loaders/example_data/fake-content.html" loader = UnstructuredHTMLLoader(file_path) data = loader.load() print(data)
### pinecone
from langchain_community.document_loaders import TextLoader from langchain_openai import OpenAIEmbeddings from langchain_text_splitters import CharacterTextSplitter from langchain_pinecone import PineconeVectorStore
loader = TextLoader("../../modules/state_of_the_union.txt") documents = loader.load() text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) docs = text_splitter.split_documents(documents) embeddings = OpenAIEmbeddings()
index_name = "langchain-test-index" docsearch = PineconeVectorStore.from_documents(docs, embeddings, index_name=index_name)
query = "What did the president say about Ketanji Brown Jackson" docs = docsearch.similarity_search(query) print(docs[0].page_content)
chroma example
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

# Load the source text and split it into 1000-character, non-overlapping chunks.
loader = TextLoader("../../modules/state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()

# langchain_chroma exposes the store as `Chroma`; the on-disk location is
# given through `persist_directory`.
index_path = "./chroma-index"
docsearch = Chroma.from_documents(docs, embeddings, persist_directory=index_path)

query = "What did the president say about Ketanji Brown Jackson"
docs = docsearch.similarity_search(query)
print(docs[0].page_content)
Weaviate: AI-native vector DB
- comprehensive example
from langchain_community.document_loaders.pdf import PyPDFLoader
file_path = "example_data/layout-parser-paper.pdf" loader = PyPDFLoader(file_path) docs = loader.load_and_split()
from langchain_text_splitters import RecursiveCharacterTextSplitter
text = "What I Worked On ... (excerpt)" text_splitter = RecursiveCharacterTextSplitter(chunk_size=100) chunks = text_splitter.split_text(text)
from langchain_chroma import Chroma

index_path = "./chroma-index"
# Pass the path by keyword: the first positional parameter of Chroma is the
# collection name, not the storage directory.
chroma_store = Chroma(persist_directory=index_path)
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "Deci/DeciCoder-1b" tokenizer = AutoTokenizer.from_pretrained(checkpoint) model = AutoModelForCausalLM.from_pretrained(checkpoint) inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt") outputs = model.generate(inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0]))
Embeddings capture semantic meaning and context. For example, you can create text embeddings using Sentence Transformers.
StarCoderBase
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "bigcode/starcoderbase-1b" device = "cuda" # Use "cuda" for GPU or "cpu" for CPU tokenizer = AutoTokenizer.from_pretrained(checkpoint) model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
prompt = "Given a list of numbers, write a Python function that returns the sum of all even numbers in the list."
inputs = tokenizer.encode(prompt, return_tensors="pt").to(device) outputs = model.generate(inputs)
generated_code = tokenizer.decode(outputs[0]) print("Generated code:") print(generated_code)
LangChain Development
by Tom Taulli
founders: Harrison Chase, Ankush Gola
OPL (open ai, pinecone, langchain)
platform.openai.com/tokenizer
gpt-4: 8k or 32k
or
RNNs had vanishing gradients, LSTMs had limits
Attention is all you need (transformer model) - GPU/all tokens simultaneously, pretrained models
encoder (word embeddings, vectors, positional encoding) and decoder (masked multi-head attention, autoregressive, multi-head attention)
Others
Disadvantages:
Numerous advantages
Models
fine tune
costs per token
e=tiktoken.get_encoding("cl100k_base")# tokens=e.encode(token_text)
openai's training datastructure is
{prompt:"..", completion:".."}
openai cli tool
Huggingfacehub_api_token
another prompt template example
and
Parser class : get_format_instructions(), parse(), parse_with_prompt
example use Csv to prompt and read
pydantic output parser