benman1 / generative_ai_with_langchain

Build large language model (LLM) apps with Python, ChatGPT and other models. This is the companion repository for the book on generative AI with LangChain.
https://amzn.to/43PuIkQ
MIT License
552 stars 219 forks source link

information_extraction/__init__.py always wants a validator on BaseModel classes #32

Closed jfernandogg closed 4 months ago

jfernandogg commented 4 months ago

In information_extraction/__init__.py an error is always triggered because the Resume class wants a validator. It doesn't matter whether arbitrary_types_allowed=True is set or the original code is used. It seems to be an incompatibility issue. I tried to fix it with:

# Abbreviated model from the attempted workaround; only two fields are shown.
class Resume(BaseModel):
    # Inner Config class; note that this attempt sets the flag to False.
    class Config:
        arbitrary_types_allowed = False
    # Both fields are optional strings that default to None.
    first_name: Optional[str] = Field(None, description="The first name of the person.")
    last_name: Optional[str] = Field(None, description="The last name of the person.")

But the error is always the same:

File "/workspaces/generative_ai_with_langchain/information_extraction/__init__.py", line 72, in <module>
    print(parse_cv(
          ^^^^^^^^^
  File "/workspaces/generative_ai_with_langchain/information_extraction/__init__.py", line 67, in parse_cv
    chain = create_extraction_chain_pydantic(pydantic_schema=Resume, llm=llm)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/langchain/chains/openai_functions/extraction.py", line 101, in create_extraction_chain_pydantic
    class PydanticSchema(BaseModel):
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/main.py", line 197, in __new__
    fields[ann_name] = ModelField.infer(
                       ^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/fields.py", line 504, in infer
    return cls(
           ^^^^
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/fields.py", line 434, in __init__
    self.prepare()
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/fields.py", line 550, in prepare
    self._type_analysis()
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/fields.py", line 756, in _type_analysis
    self.sub_fields = [self._create_sub_type(self.type_, '_' + self.name)]
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/fields.py", line 806, in _create_sub_type
    return self.__class__(
           ^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/fields.py", line 434, in __init__
    self.prepare()
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/fields.py", line 555, in prepare
    self.populate_validators()
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/fields.py", line 829, in populate_validators
    *(get_validators() if get_validators else list(find_validators(self.type_, self.model_config))),
                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/pydantic/v1/validators.py", line 765, in find_validators
    raise RuntimeError(f'no validator found for {type_}, see `arbitrary_types_allowed` in Config')
RuntimeError: no validator found for <class '__main__.Resume'>, see `arbitrary_types_allowed` in Config
jfernandogg commented 4 months ago

After some research, I decided to implement an updated version with kor. Here it is:

import enum
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from kor import create_extraction_chain, Object, Text, Number
import pydantic
from typing import List
from kor import from_pydantic
from pydantic import BaseModel, Field
from typing import Optional
from config import set_environment

# Run the project's config hook before building any client
# (presumably it exports the API keys -- see config.py).
set_environment()

# Chat model used for the extraction; temperature is pinned to 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Shared base for dated résumé entries (jobs and studies).
#
# Every field carries an explicit ``default=None``: under pydantic v2 an
# ``Optional[...]`` annotation without a default is still a *required* field,
# so an entry missing any of these values would fail validation.
class Experience(BaseModel):
    # the title doesn't seem to help at all.
    start_date: Optional[str] = Field(
        default=None, description="When the job or study started."
    )
    end_date: Optional[str] = Field(
        default=None, description="When the job or study ended."
    )
    description: Optional[str] = Field(
        default=None, description="What the job or study entailed."
    )
    country: Optional[str] = Field(
        default=None, description="The country of the institution."
    )

# An education entry; extends Experience with degree-related fields.
#
# Every field carries an explicit ``default=None``: under pydantic v2 an
# ``Optional[...]`` annotation without a default is still a *required* field,
# so a study missing e.g. a grade would fail validation.
class Study(Experience):
    degree: Optional[str] = Field(
        default=None, description="The degree obtained or expected."
    )
    institution: Optional[str] = Field(
        default=None,
        description="The university, college, or educational institution visited.",
    )
    # Re-declares the inherited `country` field with the same description.
    country: Optional[str] = Field(
        default=None, description="The country of the institution."
    )
    grade: Optional[str] = Field(
        default=None, description="The grade achieved or expected."
    )

# A job entry; extends Experience with employer details.
class WorkExperience(Experience):
    # The company name is deliberately left required (plain `str`, no default).
    company: str = Field(description="The company name of the work experience.")
    # Explicit default: under pydantic v2 an Optional[...] annotation without
    # a default is still a required field.
    job_title: Optional[str] = Field(default=None, description="The job title.")

# Top-level extraction schema for a résumé.
#
# Every field carries an explicit ``default=None``: under pydantic v2 an
# ``Optional[...]`` annotation without a default is still a *required* field,
# so a résumé lacking e.g. a hobby or a LinkedIn URL would fail validation.
class Resume(BaseModel):
    first_name: Optional[str] = Field(
        default=None, description="The first name of the person."
    )
    last_name: Optional[str] = Field(
        default=None, description="The last name of the person."
    )
    linkedin_url: Optional[str] = Field(
        default=None,
        description="The url of the linkedin profile of the person.",
    )
    email_address: Optional[str] = Field(
        default=None, description="The email address of the person."
    )
    nationality: Optional[str] = Field(
        default=None, description="The nationality of the person."
    )
    skill: Optional[str] = Field(
        default=None, description="A skill listed or mentioned in a description."
    )
    # Nested models: a single study and a single work experience.
    study: Optional[Study] = Field(
        default=None,
        description="A study that the person completed or is in progress of completing.",
    )
    work_experience: Optional[WorkExperience] = Field(
        default=None,
        description="A work experience of the person.",
    )
    hobby: Optional[str] = Field(
        default=None, description="A hobby or recreational activity of the person."
    )

# Extract pdf information
pdf_loader = PyPDFLoader("openresume-resume.pdf")
docs = pdf_loader.load_and_split()

# Derive the kor schema and its matching validator from the Resume model,
# then build the extraction chain with JSON encoding.
model_json_schema, validator = from_pydantic(Resume)
chain = create_extraction_chain(
    llm,
    model_json_schema,
    encoder_or_encoder_class="json",
    validator=validator,
)

# Only the first document produced by the loader is sent through the chain.
first_page_text = docs[0].page_content
extraction = chain.invoke(first_page_text)
print(extraction["text"]["data"])

Dependency versions (install.txt):

aiohttp==3.9.3
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.3.0
async-timeout==4.0.3
attrs==23.2.0
certifi==2024.2.2
charset-normalizer==3.3.2
dataclasses-json==0.6.4
distro==1.9.0
exceptiongroup==1.2.0
frozenlist==1.4.1
greenlet==3.0.3
h11==0.14.0
httpcore==1.0.4
httpx==0.27.0
idna==3.6
jsonpatch==1.33
jsonpointer==2.4
kor==1.0.1
langchain==0.1.12
langchain-community==0.0.28
langchain-core==0.1.32
langchain-openai==0.0.8
langchain-text-splitters==0.0.1
langsmith==0.1.27
marshmallow==3.21.1
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.26.4
openai==1.14.1
orjson==3.9.15
packaging==23.2
pandas==1.5.3
pydantic==2.6.4
pydantic_core==2.16.3
pypdf==4.1.0
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
regex==2023.12.25
requests==2.31.0
six==1.16.0
sniffio==1.3.1
SQLAlchemy==2.0.28
tenacity==8.2.3
tiktoken==0.6.0
tqdm==4.66.2
typing==3.7.4.3
typing-inspect==0.9.0
typing_extensions==4.10.0
urllib3==2.2.1
yarl==1.9.4