Open avisheka93 opened 3 months ago
To extract both text and tables from PDFs, chunk them, and send them to a vector store while maintaining data quality, you can use the `SmartPDFLoader` class from the LlamaIndex library. This class uses nested layout information such as sections, paragraphs, lists, and tables to smartly chunk PDFs for optimal usage of LLM context windows.
Here is an example of how you can use the `SmartPDFLoader`:
from llama_index.readers.smart_pdf_loader.base import SmartPDFLoader

# The loader needs the URL of an llmsherpa PDF parser service.
pdf_loader = SmartPDFLoader(
    llmsherpa_api_url="http://your-llmsherpa-api-url",
)

# `pdf_path_or_url` takes either a local PDF path or a URL.
documents = pdf_loader.load_data(
    pdf_path_or_url="path/to/your/pdf",
)

# Placeholder loop: replace `pass` with the code that sends each document
# to your vector store.
for document in documents:
    pass
Additionally, if you need to handle both text and tables effectively, you might want to consider using the `PDFReader` from LlamaHub, which can be integrated with LlamaIndex. Here is an example of how you can use it:
from llama_index import GPTSimpleVectorIndex, LLMPredictor, ServiceContext
from llama_index.readers import download_loader
from pathlib import Path
import os
INDEX_FILE = "index.json"
def ask(file):
    """Return a vector index for the PDF at *file*, building it on first use.

    If ``INDEX_FILE`` already exists the index is loaded from it and the PDF
    is not parsed. Otherwise the PDF is read with LlamaHub's ``PDFReader``,
    indexed, and the new index is saved to ``INDEX_FILE``.

    NOTE(review): the cache is a single fixed file and is not keyed on
    *file*, so an index saved for one PDF is reused for any other PDF until
    ``INDEX_FILE`` is deleted.

    Args:
        file: Path of the PDF to index (anything ``pathlib.Path`` accepts).

    Returns:
        The loaded or newly built ``GPTSimpleVectorIndex``. Earlier versions
        of this snippet returned nothing, which left the index unreachable.
    """
    print("Loading...")
    print("Path: ", Path(file))

    # Reuse a saved index when there is one. The documents are only needed to
    # build a new index, so parsing the PDF first would be wasted work.
    if os.path.exists(INDEX_FILE):
        return GPTSimpleVectorIndex.load_from_disk(INDEX_FILE)

    PDFReader = download_loader("PDFReader")
    documents = PDFReader().load_data(file=Path(file))

    # NOTE(review): `OpenAI` is not imported anywhere in this snippet; the
    # surrounding file must bring it into scope for this branch to run.
    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003"))
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=1024)
    index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)

    # Save the index so the next call can skip the build.
    index.save_to_disk(INDEX_FILE)
    return index
For a fully open-source stack, you can use Sentence Transformers as the embedding model, Postgres as the vector store, and Llama 2 as the LLM. Here is an example setup:
# Install necessary packages
# (`%pip` / `!pip` are IPython magics: run these lines in a notebook cell)
%pip install llama-index-readers-file pymupdf
%pip install llama-index-vector-stores-postgres
%pip install llama-index-embeddings-huggingface
%pip install llama-index-llms-llama-cpp
# Sentence Transformers
# This embedding model is used for both the node texts and the query strings.
# NOTE(review): the Postgres vector store is later created with embed_dim=384;
# that value has to match this model's embedding size - confirm if you swap models.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
# Llama CPP
# Local LLM that the query engine uses to write the final answer.
!pip install llama-cpp-python
from llama_index.llms.llama_cpp import LlamaCPP
# URL of the Llama 2 13B chat GGUF file hosted on Hugging Face.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"
llm = LlamaCPP(model_url=model_url, temperature=0.1, max_new_tokens=256, context_window=3900, model_kwargs={"n_gpu_layers": 1}, verbose=True)
# Initialize Postgres
!pip install psycopg2-binary pgvector asyncpg "sqlalchemy[asyncio]" greenlet
import psycopg2
db_name = "vector_db"
host = "localhost"
password = "password"
port = "5432"
user = "jerry"
conn = psycopg2.connect(dbname="postgres", host=host, password=password, port=port, user=user)
conn.autocommit = True
with conn.cursor() as c:
c.execute(f"DROP DATABASE IF EXISTS {db_name}")
c.execute(f"CREATE DATABASE {db_name}")
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore
vector_store = PGVectorStore.from_params(database=db_name, host=host, password=password, port=port, user=user, table_name="llama2_paper", embed_dim=384)
# Load Data
# Shell commands (notebook only): create ./data and download the PDF from
# arXiv into data/llama2.pdf.
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
loader = PyMuPDFReader()
# Parse the downloaded PDF; the resulting documents feed the text splitter below.
documents = loader.load(file_path="./data/llama2.pdf")
# Use a Text Splitter to Split Documents
from llama_index.core.node_parser import SentenceSplitter

text_parser = SentenceSplitter(chunk_size=1024)

# The two lists stay aligned: text_chunks[i] was cut from documents[doc_idxs[i]].
text_chunks = []
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    pieces = text_parser.split_text(doc.text)
    text_chunks += pieces
    doc_idxs += [doc_idx] * len(pieces)
# Manually Construct Nodes from Text Chunks
from llama_index.core.schema import TextNode

nodes = []
# Walk the chunks together with the index of the document each one came from.
for text_chunk, source_idx in zip(text_chunks, doc_idxs):
    node = TextNode(text=text_chunk)
    # The node gets the source document's metadata dict itself (no copy), so
    # all chunks of one document reference the same dict.
    node.metadata = documents[source_idx].metadata
    nodes.append(node)
# Generate Embeddings for each Node
for node in nodes:
    # Embed the node content as rendered with metadata_mode="all".
    content = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(content)

# Load Nodes into a Vector Store
vector_store.add(nodes)
# Build Retrieval Pipeline
# Embed the question with the same model that embedded the nodes.
query_str = "Can you tell me about the key concepts for safety finetuning"
query_embedding = embed_model.get_query_embedding(query_str)
from llama_index.core.vector_stores import VectorStoreQuery
# Request the 2 most similar entries from the store.
vector_store_query = VectorStoreQuery(query_embedding=query_embedding, similarity_top_k=2, mode="default")
query_result = vector_store.query(vector_store_query)
# Show the content of the first returned node.
print(query_result.nodes[0].get_content())
# Parse Result into a Set of Nodes
from llama_index.core.schema import NodeWithScore

# Pair every returned node with its similarity; the score is None when the
# store returned no similarities at all.
nodes_with_scores = [
    NodeWithScore(
        node=hit,
        score=None if query_result.similarities is None else query_result.similarities[position],
    )
    for position, hit in enumerate(query_result.nodes)
]
# Put into a Retriever
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
class VectorDBRetriever(BaseRetriever):
    """Retriever that embeds the query string and searches a vector store.

    Args:
        vector_store: Store queried with a ``VectorStoreQuery``.
        embed_model: Model whose ``get_query_embedding`` embeds the query.
        query_mode: Value forwarded as ``mode`` to ``VectorStoreQuery``.
        similarity_top_k: Number of results requested per query.
    """

    def __init__(self, vector_store, embed_model, query_mode="default", similarity_top_k=2):
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle):
        """Return the store's matches for ``query_bundle.query_str`` with scores.

        Args:
            query_bundle: Object whose ``query_str`` attribute holds the query.

        Returns:
            A list of ``NodeWithScore``; each score is ``None`` when the store
            returns no similarities.
        """
        # Use the objects passed to __init__, not same-named module globals;
        # otherwise the constructor arguments are silently ignored and the
        # class breaks outside the notebook that defines those globals.
        query_embedding = self._embed_model.get_query_embedding(query_bundle.query_str)
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = self._vector_store.query(vector_store_query)

        similarities = query_result.similarities
        return [
            NodeWithScore(
                node=node,
                score=None if similarities is None else similarities[index],
            )
            for index, node in enumerate(query_result.nodes)
        ]
# Build the retriever over the populated store, asking for 2 matches per query.
retriever = VectorDBRetriever(vector_store, embed_model, query_mode="default", similarity_top_k=2)
# Plug this into our RetrieverQueryEngine to synthesize a response
from llama_index.core.query_engine import RetrieverQueryEngine
# The engine is given the custom retriever and the local `llm`.
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)
query_str = "How does Llama 2 perform compared to other open-source models?"
response = query_engine.query(query_str)
print(str(response))
# Print the content of the first source node attached to the response.
print(response.source_nodes[0].get_content())
This approach ensures that both text and table data are effectively extracted and indexed, maintaining high data quality. If you encounter issues with table extraction, you might need to explore additional tools or custom solutions to handle table data more accurately [1][2][3][4].
To continue talking to Dosu, mention @dosu.
@dosu how about pdftablereader
To use `PDFTableReader` to extract both text and tables from PDFs, maintain data quality, chunk them, and send them to a vector store, you can follow these steps:
1. Use the `PDFTableReader` class to extract tables from the PDF.
2. Convert the extracted tables to `Document` objects.
3. Chunk the `Document` objects.
4. Send the chunks to a vector store.

Here is a sample implementation:
from pathlib import Path
from typing import Any, Dict, List, Optional

import camelot
import pandas as pd
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
class PDFTableReader(BaseReader):
    """Reader that converts each table camelot finds in a PDF into a ``Document``.

    Only tables are extracted; text outside tables is not returned.

    Args:
        row_separator: String placed between table rows in the document text.
        col_separator: String placed between the cells of one row.
    """

    def __init__(self, *args: Any, row_separator: str = "\n", col_separator: str = ", ", **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self._row_separator = row_separator
        self._col_separator = col_separator

    def load_data(self, file: Path, pages: str = "1", extra_info: Optional[Dict] = None) -> List[Document]:
        """Extract the tables on the requested pages, one ``Document`` per table.

        Args:
            file: Path of the PDF to read.
            pages: Page selection passed unchanged to ``camelot.read_pdf``;
                the default reads page 1 only.
            extra_info: Optional metadata attached to every returned document.

        Returns:
            One ``Document`` per table returned by camelot.
        """
        tables = camelot.read_pdf(filepath=str(file), pages=pages)
        return [
            self._dataframe_to_document(df=table.df, extra_info=extra_info)
            for table in tables
        ]

    def _dataframe_to_document(self, df: pd.DataFrame, extra_info: Optional[Dict] = None) -> Document:
        """Serialize *df* to text: cells joined per row, then rows joined."""
        row_texts = df.apply(
            lambda row: self._col_separator.join(row.astype(str).tolist()),
            axis=1,
        ).tolist()
        # Give each document its own metadata dict so that changing one
        # document's metadata cannot leak into the others or into the caller's dict.
        metadata = dict(extra_info) if extra_info else {}
        return Document(text=self._row_separator.join(row_texts), extra_info=metadata)
# Example usage
# Read the tables on page 1 of example.pdf; each table becomes one Document.
pdf_reader = PDFTableReader()
documents = pdf_reader.load_data(file=Path("example.pdf"), pages="1")
# Chunking the data (example function, implement as needed)
def chunk_documents(documents: List[Document], chunk_size: int) -> List[Document]:
    """Split each document's text into consecutive fixed-size character chunks.

    Chunks are cut purely by character count, so a table row can be split
    across two chunks; swap in a row-aware splitter if that matters.

    Args:
        documents: Documents exposing ``text`` and ``extra_info``.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        New documents in input order, each carrying the ``extra_info`` of the
        document it was cut from. A document with empty text yields no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step yields no chunks at all, silently dropping the text.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    chunked_docs: List[Document] = []
    for doc in documents:
        text = doc.text
        chunked_docs.extend(
            Document(text=text[start:start + chunk_size], extra_info=doc.extra_info)
            for start in range(0, len(text), chunk_size)
        )
    return chunked_docs
chunked_documents = chunk_documents(documents, chunk_size=1000)
# Send to vector store (pseudo-code, replace with actual vector store API)
def send_to_vector_store(documents: List[Document]):
    """Pass each document's text and ``extra_info`` to the vector store.

    ``vector_store_api`` is a placeholder that this example does not define;
    replace the call with your vector store client's real insert method.
    """
    for entry in documents:
        vector_store_api.add_document(entry.text, extra_info=entry.extra_info)
send_to_vector_store(chunked_documents)
This code demonstrates how to extract tables from a PDF, convert them to `Document` objects, chunk the data, and send it to a vector store. Adjust the chunking logic and vector store API calls as per your specific requirements [1].
To continue talking to Dosu, mention @dosu.
@dosu kindly explain and show if UnstructuredElementNodeParser can be used
Yes, the `UnstructuredElementNodeParser` can be used for extracting and processing both text and tables from PDFs. Here is an example of how you might use it:
from llama_index.legacy.node_parser.relational.unstructured_element import UnstructuredElementNodeParser
from llama_index.legacy.schema import TextNode
# Initialize the parser
parser = UnstructuredElementNodeParser()

# Example text node containing the content to parse.
# TextNode keeps its body in the `text` field; `content=` is not a TextNode field.
# NOTE(review): this parser appears to run unstructured's HTML partitioner on
# the node text, so the input is presumably expected to be HTML rather than
# raw PDF text - confirm against the installed llama_index version.
text_node = TextNode(text="Your PDF content here")

# Extract nodes from the text node
nodes = parser.get_nodes_from_node(text_node)

# Process the extracted nodes
for node in nodes:
    # Nodes have no `type` attribute; report the node class name instead.
    print(f"Node Type: {type(node).__name__}")
    print(f"Node Content: {node.get_content()}")
    # NOTE(review): table nodes are assumed to carry the serialized table
    # under metadata["table_df"] - verify the key for your version.
    table_df = node.metadata.get("table_df")
    if table_df is not None:
        print(f"Table DataFrame: {table_df}")
This example demonstrates initializing the `UnstructuredElementNodeParser`, creating a `TextNode` with PDF content, extracting nodes from the text node, and processing the extracted nodes. The parser uses the `unstructured` and `lxml` libraries to handle the extraction and processing of text and table elements [1][2].
To use the `UnstructuredElementNodeParser`, you need to install the required dependencies:
pip install unstructured lxml
These packages are necessary for the parser to function correctly [1][2].
The `UnstructuredElementNodeParser` primarily handles text and tables in PDFs. It does not explicitly mention handling other types of embedded objects such as images or charts [2].
To continue talking to Dosu, mention @dosu.
@avisheka93 also, you could try llama-parse as well. It has a JSON response mode that returns individual elements
Hi, my PDF is an NCERT book; it contains text, tables, and images. How can I extract them and store them in a vector store? Can you help me with an example?
Question Validation
Question
My PDFs have both text and tables. I need to extract both separately to maintain data quality, chunk them, and send them to a vector store. What can be used to extract both text and tables and properly vectorize them?