When I run the auto_dataframe.py sample, I get an error like this

cumthxy commented 9 months ago

Traceback (most recent call last): File "/Users/huangxingyu/daily/chatgpt/1.py", line 65, in df = dataframe( File "/Users/huangxingyu/daily/chatgpt/1.py", line 61, in dataframe return Dataframe.from_response(completion) File "/Users/huangxingyu/anaconda3/envs/python3.9/lib/python3.9/site-packages/instructor/function_calls.py", line 138, in from_response return cls.model_validate_json( File "/Users/huangxingyu/anaconda3/envs/python3.9/lib/python3.9/site-packages/pydantic/main.py", line 532, in model_validate_json return cls.__pydantic_validator__.validate_json(json_data, strict=strict, context=context) pydantic_core._pydantic_core.ValidationError: 4 validation errors for Dataframe data.0 Input should be an object [type=model_type, input_value=['John', 25, 'New York', 'basketball'], input_type=list] For further information visit https://errors.pydantic.dev/2.5/v/model_type data.1 Input should be an object [type=model_type, input_value=['Mike', 30, 'San Francisco', 'baseball'], input_type=list] For further information visit https://errors.pydantic.dev/2.5/v/model_type data.2 Input should be an object [type=model_type, input_value=['Sarah', 20, 'Los Angeles', 'tennis'], input_type=list] For further information visit https://errors.pydantic.dev/2.5/v/model_type data.3 Input should be an object [type=model_type, input_value=['Mary', 35, 'Chicago', None], input_type=list] For further information visit https://errors.pydantic.dev/2.5/v/model_type

jxnl commented 9 months ago

Please share the prompt; I can't do anything with this.

cumthxy commented 9 months ago

Please share the prompt; I can't do anything with this.

this code https://github.com/jxnl/instructor/blob/main/examples/automatic_dataframe_extraction/auto_dataframe.py

jxnl commented 9 months ago

Try using this instead: https://github.com/jxnl/instructor/blob/main/examples/vision/run_table.py — I'll delete this example and replace it with the markdown-based one.

jxnl commented 9 months ago

from openai import OpenAI
from io import StringIO
from typing import Annotated, Any, Iterable
from openai import OpenAI
from pydantic import (
    BaseModel,
    BeforeValidator,
    PlainSerializer,
    InstanceOf,
    WithJsonSchema,
)
import pandas as pd
from tomlkit import table
import instructor

# Wrap the OpenAI client with instructor, requesting MD_JSON mode.
# NOTE(review): `client` is rebound further down in this file by a second
# `instructor.patch(OpenAI())` call without a mode, so this instance appears
# to be unused by the request below — confirm which mode is intended.
client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON)

def md_to_df(data: Any) -> Any:
    """Parse a pipe-delimited markdown table into a ``pandas.DataFrame``.

    Non-string input is returned untouched, so a value that is already a
    DataFrame passes straight through.

    String input is read with ``|`` as the field separator and the column at
    position 1 becomes the index (this assumes every row starts and ends with
    a pipe, so position 0 is the empty text before the leading pipe). Columns
    that are entirely empty are dropped, the first data row (the markdown
    ``|---|---|`` separator line) is discarded, and surrounding whitespace is
    stripped from cells, column labels, index labels and the index name.
    Empty cells are preserved as missing values instead of raising.

    Args:
        data: A markdown table as a string, or any other object.

    Returns:
        A DataFrame built from the table when ``data`` is a string;
        otherwise ``data`` itself, unchanged.
    """
    if not isinstance(data, str):
        return data

    def _strip(value: Any) -> Any:
        # Empty cells are parsed as NaN (a float), which has no ``.strip()``;
        # only real strings are trimmed, everything else is passed through.
        return value.strip() if isinstance(value, str) else value

    frame = (
        pd.read_csv(StringIO(data), sep="|", index_col=1)
        # Text outside the outer pipes yields columns with no values at all.
        .dropna(axis=1, how="all")
        # Row 0 is the header/body separator line of the markdown table.
        .iloc[1:]
        # Header and index cells carry the same padding as body cells.
        .rename(index=_strip, columns=_strip)
    )
    frame.index.name = _strip(frame.index.name)

    # ``DataFrame.map`` was added in pandas 2.1; earlier releases expose the
    # same element-wise operation as ``applymap``.
    if hasattr(pd.DataFrame, "map"):
        return frame.map(_strip)
    return frame.applymap(_strip)

# Pydantic field type for a DataFrame that is described to the model as a
# markdown string and converted to/from a pandas DataFrame around validation.
MarkdownDataFrame = Annotated[
    # The validated value must be a pandas DataFrame instance.
    InstanceOf[pd.DataFrame],
    # Runs before the instance check: md_to_df turns a string into a DataFrame.
    BeforeValidator(md_to_df),
    # Serialization renders the DataFrame back to markdown text.
    PlainSerializer(lambda x: x.to_markdown()),
    # JSON schema attached to the field: a plain string with this description.
    WithJsonSchema(
        {
            "type": "string",
            "description": """
                The markdown representation of the table, 
                each one should be tidy, do not try to join tables
                that should be seperate""",
        }
    ),
]

# Response model for a single extracted table.
# (Kept as `#` comments rather than a class docstring so the text is not
# picked up as part of the model's generated schema.)
class Table(BaseModel):
    caption: str  # caption text for the table
    dataframe: MarkdownDataFrame  # table body; see MarkdownDataFrame for conversion

# NOTE(review): this rebinds `client`, replacing the MD_JSON-mode client that
# was created earlier in the file — confirm which mode is intended.
client = instructor.patch(OpenAI())

# Request an iterable of Table objects extracted from the user message.
tables = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Iterable[Table],
    messages=[
        {
            "role": "system",
            "content": "Please extract the tables from the following text, merge as much as possible:",
        },
        {
            "role": "user",
            "content": """
        My name is John and I am 25 years old. I live in 
        New York and I like to play basketball. His name is 
        Mike and he is 30 years old. He lives in San Francisco 
        and he likes to play baseball. Sarah is 20 years old 
        and she lives in Los Angeles. She likes to play tennis.
        Her name is Mary and she is 35 years old. 
        She lives in Chicago.
        """,
        },
    ],
)

# Print the caption and DataFrame of every returned table.
# NOTE: the loop variable `table` shadows the `table` name imported from
# tomlkit at the top of the file.
for table in tables:
    print(table.caption)
    print(table.dataframe)
    print()
    # The bare string below is a no-op expression statement; it appears to be
    # kept as a sample of the printed output.
    """
    People
            Age           City       Hobby 
    Name                                   
    John      25       New York  Basketball
    Mike      30  San Francisco    Baseball
    Sarah     20    Los Angeles      Tennis
    Mary      35        Chicago         N/A
    """

Thanks for finding the regression; I'm going to delete the example and use this one instead.

cumthxy commented 9 months ago

from openai import OpenAI
from io import StringIO
from typing import Annotated, Any, Iterable
from openai import OpenAI
from pydantic import (
    BaseModel,
    BeforeValidator,
    PlainSerializer,
    InstanceOf,
    WithJsonSchema,
)
import pandas as pd
import instructor

# Wrap the OpenAI client with instructor, requesting MD_JSON mode.
# NOTE(review): `client` is rebound further down in this file by a second
# `instructor.patch(OpenAI())` call without a mode, so this instance appears
# to be unused by the request below — confirm which mode is intended.
client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON)

def md_to_df(data: Any) -> Any:
    """Parse a pipe-delimited markdown table into a ``pandas.DataFrame``.

    Non-string input is returned untouched, so a value that is already a
    DataFrame passes straight through.

    String input is read with ``|`` as the field separator and the column at
    position 1 becomes the index (this assumes every row starts and ends with
    a pipe, so position 0 is the empty text before the leading pipe). Columns
    that are entirely empty are dropped, the first data row (the markdown
    ``|---|---|`` separator line) is discarded, and surrounding whitespace is
    stripped from cells, column labels, index labels and the index name.
    Empty cells are preserved as missing values instead of raising.

    Args:
        data: A markdown table as a string, or any other object.

    Returns:
        A DataFrame built from the table when ``data`` is a string;
        otherwise ``data`` itself, unchanged.
    """
    if not isinstance(data, str):
        return data

    def _strip(value: Any) -> Any:
        # Empty cells are parsed as NaN (a float), which has no ``.strip()``;
        # only real strings are trimmed, everything else is passed through.
        return value.strip() if isinstance(value, str) else value

    frame = (
        pd.read_csv(StringIO(data), sep="|", index_col=1)
        # Text outside the outer pipes yields columns with no values at all.
        .dropna(axis=1, how="all")
        # Row 0 is the header/body separator line of the markdown table.
        .iloc[1:]
        # Header and index cells carry the same padding as body cells.
        .rename(index=_strip, columns=_strip)
    )
    frame.index.name = _strip(frame.index.name)

    # ``DataFrame.map`` was added in pandas 2.1; earlier releases expose the
    # same element-wise operation as ``applymap``.
    if hasattr(pd.DataFrame, "map"):
        return frame.map(_strip)
    return frame.applymap(_strip)

# Pydantic field type for a DataFrame that is described to the model as a
# markdown string and converted to/from a pandas DataFrame around validation.
MarkdownDataFrame = Annotated[
    # The validated value must be a pandas DataFrame instance.
    InstanceOf[pd.DataFrame],
    # Runs before the instance check: md_to_df turns a string into a DataFrame.
    BeforeValidator(md_to_df),
    # Serialization renders the DataFrame back to markdown text.
    PlainSerializer(lambda x: x.to_markdown()),
    # JSON schema attached to the field: a plain string with this description.
    WithJsonSchema(
        {
            "type": "string",
            "description": """
                The markdown representation of the table, 
                each one should be tidy, do not try to join tables
                that should be seperate""",
        }
    ),
]

# Response model for a single extracted table.
# (Kept as `#` comments rather than a class docstring so the text is not
# picked up as part of the model's generated schema.)
class Table(BaseModel):
    caption: str  # caption text for the table
    dataframe: MarkdownDataFrame  # table body; see MarkdownDataFrame for conversion

# NOTE(review): this rebinds `client`, replacing the MD_JSON-mode client that
# was created earlier in the file — confirm which mode is intended.
client = instructor.patch(OpenAI())

# Request an iterable of Table objects extracted from the user message.
tables = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Iterable[Table],
    messages=[
        {
            "role": "system",
            "content": "Please extract the tables from the following text, merge as much as possible:",
        },
        {
            "role": "user",
            "content": """
        My name is John and I am 25 years old. I live in 
        New York and I like to play basketball. His name is 
        Mike and he is 30 years old. He lives in San Francisco 
        and he likes to play baseball. Sarah is 20 years old 
        and she lives in Los Angeles. She likes to play tennis.
        Her name is Mary and she is 35 years old. 
        She lives in Chicago.
        """,
        },
    ],
)

# Print the caption and DataFrame of every returned table.
for table in tables:
    print(table.caption)
    print(table.dataframe)
    print()
    # The bare string below is a no-op expression statement; it appears to be
    # kept as a sample of the printed output.
    """
    People
            Age           City       Hobby 
    Name                                   
    John      25       New York  Basketball
    Mike      30  San Francisco    Baseball
    Sarah     20    Los Angeles      Tennis
    Mary      35        Chicago         N/A
    """

Thanks for finding the regression; I'm going to delete the example and use this one instead.

Why do I get an error with the old code?

jxnl commented 9 months ago

Probably the LLM changed.

instructor-ai / instructor

When I run the auto_dataframe.py sample, I get an error like this #324