Closed jfernandogg closed 8 months ago
After researching, I decided to implement an updated version with kor. Here it is:
import enum
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from kor import create_extraction_chain, Object, Text, Number
import pydantic
from typing import List
from kor import from_pydantic
from pydantic import BaseModel, Field
from typing import Optional
from config import set_environment
# Run the project's environment setup (imported from config) before the
# client is built. NOTE(review): presumably this exports the OpenAI API
# key -- confirm in config.py.
set_environment()

# Chat model handed to the extraction chain; temperature is pinned to 0.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)
class Experience(BaseModel):
    # Fields shared by the Study and WorkExperience entries.
    #
    # Every field carries an explicit ``default=None``. Under Pydantic v2 an
    # ``Optional[...]`` annotation without a default is still a *required*
    # field, so validation would fail whenever a key is missing from the
    # extracted data.

    # the title doesn't seem to help at all.
    start_date: Optional[str] = Field(
        default=None, description="When the job or study started."
    )
    end_date: Optional[str] = Field(
        default=None, description="When the job or study ended."
    )
    description: Optional[str] = Field(
        default=None, description="What the job or study entailed."
    )
    country: Optional[str] = Field(
        default=None, description="The country of the institution."
    )
class Study(Experience):
    # An education entry: the Experience fields plus degree details.
    #
    # ``country`` is inherited from Experience with the same type and
    # description, so it is not redeclared here.
    #
    # Explicit ``default=None`` keeps each field truly optional under
    # Pydantic v2, where ``Optional[...]`` alone does not imply a default.
    degree: Optional[str] = Field(
        default=None, description="The degree obtained or expected."
    )
    institution: Optional[str] = Field(
        default=None,
        description="The university, college, or educational institution visited.",
    )
    grade: Optional[str] = Field(
        default=None, description="The grade achieved or expected."
    )
class WorkExperience(Experience):
    # A job entry: the Experience fields plus employer details.

    # The company name is the only mandatory field of a work experience.
    company: str = Field(description="The company name of the work experience.")
    # Explicit default: under Pydantic v2 an Optional annotation without a
    # default would make this field required.
    job_title: Optional[str] = Field(default=None, description="The job title.")
class Resume(BaseModel):
    # Top-level schema passed to kor's from_pydantic for extraction.
    #
    # Every field has an explicit ``default=None``: under Pydantic v2 an
    # ``Optional[...]`` annotation without a default is still required, so
    # validation would reject any result that lacks one of these keys.
    #
    # NOTE(review): skill, study, work_experience and hobby each hold a
    # single value; if several per resume are wanted, consider List[...]
    # fields -- that changes the output shape, so confirm with consumers.
    first_name: Optional[str] = Field(
        default=None, description="The first name of the person."
    )
    last_name: Optional[str] = Field(
        default=None, description="The last name of the person."
    )
    linkedin_url: Optional[str] = Field(
        default=None,
        description="The url of the linkedin profile of the person.",
    )
    email_address: Optional[str] = Field(
        default=None, description="The email address of the person."
    )
    nationality: Optional[str] = Field(
        default=None, description="The nationality of the person."
    )
    skill: Optional[str] = Field(
        default=None, description="A skill listed or mentioned in a description."
    )
    study: Optional[Study] = Field(
        default=None,
        description="A study that the person completed or is in progress of completing.",
    )
    work_experience: Optional[WorkExperience] = Field(
        default=None,
        description="A work experience of the person.",
    )
    hobby: Optional[str] = Field(
        default=None, description="A hobby or recreational activity of the person."
    )
# Extract pdf information
pdf_loader = PyPDFLoader("openresume-resume.pdf")
docs = pdf_loader.load_and_split()

# Build the kor schema object and its validator from the Resume model.
model_json_schema, validator = from_pydantic(Resume)
chain = create_extraction_chain(
    llm,
    model_json_schema,
    encoder_or_encoder_class="json",
    validator=validator,
)

# Only the first loaded document is sent through the chain; the extracted
# payload is read from the "data" entry under the result's "text" key.
first_doc_text = docs[0].page_content
extraction_result = chain.invoke(first_doc_text)
print(extraction_result["text"]["data"])
Dependencies versions (install.txt):
aiohttp==3.9.3
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.3.0
async-timeout==4.0.3
attrs==23.2.0
certifi==2024.2.2
charset-normalizer==3.3.2
dataclasses-json==0.6.4
distro==1.9.0
exceptiongroup==1.2.0
frozenlist==1.4.1
greenlet==3.0.3
h11==0.14.0
httpcore==1.0.4
httpx==0.27.0
idna==3.6
jsonpatch==1.33
jsonpointer==2.4
kor==1.0.1
langchain==0.1.12
langchain-community==0.0.28
langchain-core==0.1.32
langchain-openai==0.0.8
langchain-text-splitters==0.0.1
langsmith==0.1.27
marshmallow==3.21.1
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.26.4
openai==1.14.1
orjson==3.9.15
packaging==23.2
pandas==1.5.3
pydantic==2.6.4
pydantic_core==2.16.3
pypdf==4.1.0
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
regex==2023.12.25
requests==2.31.0
six==1.16.0
sniffio==1.3.1
SQLAlchemy==2.0.28
tenacity==8.2.3
tiktoken==0.6.0
tqdm==4.66.2
typing==3.7.4.3
typing-inspect==0.9.0
typing_extensions==4.10.0
urllib3==2.2.1
yarl==1.9.4
In informationextraction/__init__.py there is always an error triggered because the Resume class wants a validator. It doesn't matter whether arbitrary_types_allowed=True is set or the original code is used. It seems like an incompatibility issue. I tried to fix it with:
But the error is always the same: