# pip install tika
from tika import parser
from typing import Any, Union, List, Literal
from swarmauri.parsers.base.ParserBase import ParserBase
from swarmauri.core.documents.IDocument import IDocument
from swarmauri.standard.documents.concrete.Document import Document
class TikaPDFParser(ParserBase):
    """
    Parser for reading and extracting text from PDF files using Tika.
    """

    type: Literal['TikaPDFParser'] = 'TikaPDFParser'

    def parse(self, source: str) -> List[IDocument]:
        """
        Extract the text of ``source`` with Tika and wrap it in a Document.

        Args:
            source: Location of the file, passed unchanged to
                ``tika.parser.from_file``.

        Returns:
            A single-element list holding a Document whose content is the
            extracted text, or an empty string when Tika yields no content.
        """
        parsed = parser.from_file(source)
        # Tika reports None for 'content' when no text could be extracted
        # (e.g. image-only PDFs); normalise that, and a missing key, to ''
        # so the Document always receives a string.
        text = parsed.get('content') or ''
        return [Document(content=text)]
Feature Name
swarmauri_community/parsers/concrete/TikaPDFParser.py
Feature Description
Use Apache Tika to extract text from PDF files.
Motivation
To enable parsing of PDF documents.
Potential Solutions
Additional Context (optional)
No response
Affected Areas
None
Priority
Low
Required Files