refuel-ai / autolabel

Label, clean and enrich text datasets with LLMs.
https://docs.refuel.ai/
MIT License
2.1k stars 149 forks source link

[Bug]: pydantic isn't working with main, neither v2 nor v1 #932

Open turian opened 2 weeks ago

turian commented 2 weeks ago

Describe the bug

To Reproduce: a list of steps to reproduce the behavior.

With pydantic 1.10.9:

!pip uninstall -y refuel-autolabel
!git clone https://github.com/refuel-ai/autolabel.git
!cd autolabel && pip install '.[openai,anthropic]'
!pip install     "loguru >= 0.5.0" "numpy == 1.26.4" "requests >= 2.27.0" "datasets >= 2.7.0" "langchain == 0.2.16" "nervaluate >= 0.1.8" "pandas >= 1.3.0" "scikit-learn >= 1.0.0" "tenacity >= 8.2.2" "SQLAlchemy >= 2.0.19" "regex >= 2023.6.3" "rich >= 13.3.5" "scipy >= 1.10.1" "pydantic == 1.10.9" "wget >= 3.2" "ipywidgets == 8.0.6" "jsonschema >= 4.17.3" "tabulate >= 0.9.0" "typer[all] >= 0.9.0" "simple-term-menu >= 1.6.1" "transformers >= 4.25.0" "html2text == 2020.1.16" "pylcs == 0.1.1" "openai == 1.45.0" "tiktoken >= 0.7.0" "anthropic == 0.34.2"
!pip install json5

from autolabel import LabelingAgent

gives:

ImportError                               Traceback (most recent call last)
Cell In[4], [line 1](vscode-notebook-cell:?execution_count=4&line=1)
----> [1](vscode-notebook-cell:?execution_count=4&line=1) from autolabel import LabelingAgent

File /opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:3
      [1](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:1) from importlib import metadata
----> [3](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:3) from .labeler import LabelingAgent
      [4](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:4) from .utils import get_data
      [5](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:5) from .dataset import AutolabelDataset

File /opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:16
     [13](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:13) from tqdm import tqdm
     [14](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:14) from transformers import AutoTokenizer
---> [16](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:16) from autolabel.cache import (
     [17](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:17)     BaseCache,
     [18](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:18)     SQLAlchemyConfidenceCache,
     [19](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:19)     SQLAlchemyGenerationCache,
     [20](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:20)     SQLAlchemyTransformCache,
     [21](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:21) )
     [22](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:22) from autolabel.confidence import ConfidenceCalculator
     [23](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:23) from autolabel.configs import AutolabelConfig

File /opt/homebrew/lib/python3.11/site-packages/autolabel/cache/__init__.py:3
      [1](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/cache/__init__.py:1) from .base import BaseCache
...
---> [12](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/langchain_community/utilities/serpapi.py:12) from pydantic import BaseModel, ConfigDict, Field, model_validator
     [15](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/langchain_community/utilities/serpapi.py:15) class HiddenPrints:
     [16](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/langchain_community/utilities/serpapi.py:16)     """Context manager to hide prints."""

ImportError: cannot import name 'model_validator' from 'pydantic' (/opt/homebrew/lib/python3.11/site-packages/pydantic/__init__.cpython-311-darwin.so)

And with pydantic 2.9:

!pip uninstall -y refuel-autolabel
!git clone https://github.com/refuel-ai/autolabel.git
!cd autolabel && pip install '.[openai,anthropic]'
!pip install     "loguru >= 0.5.0" "numpy == 1.26.4" "requests >= 2.27.0" "datasets >= 2.7.0" "langchain == 0.2.16" "nervaluate >= 0.1.8" "pandas >= 1.3.0" "scikit-learn >= 1.0.0" "tenacity >= 8.2.2" "SQLAlchemy >= 2.0.19" "regex >= 2023.6.3" "rich >= 13.3.5" "scipy >= 1.10.1" "pydantic == 1.10.9" "wget >= 3.2" "ipywidgets == 8.0.6" "jsonschema >= 4.17.3" "tabulate >= 0.9.0" "typer[all] >= 0.9.0" "simple-term-menu >= 1.6.1" "transformers >= 4.25.0" "html2text == 2020.1.16" "pylcs == 0.1.1" "openai == 1.45.0" "tiktoken >= 0.7.0" "anthropic == 0.34.2"
!pip install --upgrade "pydantic>=2.8.2"
!pip install json5

from autolabel import LabelingAgent

gives:

----> [1](vscode-notebook-cell:?execution_count=3&line=1) from autolabel import LabelingAgent

File /opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:3
      [1](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:1) from importlib import metadata
----> [3](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:3) from .labeler import LabelingAgent
      [4](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:4) from .utils import get_data
      [5](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/__init__.py:5) from .dataset import AutolabelDataset

File /opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:16
     [13](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:13) from tqdm import tqdm
     [14](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:14) from transformers import AutoTokenizer
---> [16](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:16) from autolabel.cache import (
     [17](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:17)     BaseCache,
     [18](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:18)     SQLAlchemyConfidenceCache,
     [19](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:19)     SQLAlchemyGenerationCache,
     [20](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:20)     SQLAlchemyTransformCache,
     [21](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:21) )
     [22](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:22) from autolabel.confidence import ConfidenceCalculator
     [23](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:23) from autolabel.configs import AutolabelConfig

File /opt/homebrew/lib/python3.11/site-packages/autolabel/cache/__init__.py:3
      [1](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/cache/__init__.py:1) from .base import BaseCache
...
    [445](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_model_construction.py:445)     ):

PydanticUserError: A non-annotated attribute was detected: `DEFAULT_ORGANIC_RESULTS_KEYS = ['position', 'title', 'link', 'snippet']`. All model fields require a type annotation; if `DEFAULT_ORGANIC_RESULTS_KEYS` is not meant to be a field, you may be able to resolve this error by annotating it as a `ClassVar` or updating `model_config['ignored_types']`.

For further information visit https://errors.pydantic.dev/2.9/u/model-field-missing-annotation

Expected behavior: pyproject.toml should pin a version of pydantic that is known to work.

turian commented 2 weeks ago

I managed to resolve this with:

!pip install scrapingbee langchain_openai "langchain_community<=0.2.16" "langchain_core<=0.2.16"
!pip install 'pydantic==1.10.14'
!pip install json5

However, autolabel main now gives me this error :(

Cell In[5], [line 8](vscode-notebook-cell:?execution_count=5&line=8)
      [5](vscode-notebook-cell:?execution_count=5&line=5) model_name = config["model"]["name"]
      [7](vscode-notebook-cell:?execution_count=5&line=7) # create an agent for labeling
----> [8](vscode-notebook-cell:?execution_count=5&line=8) agent = LabelingAgent(config=config)

File /opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:117, in LabelingAgent.__init__(self, config, cache, example_selector, label_selector_map, console_output, generation_cache, transform_cache, confidence_cache, confidence_tokenizer, confidence_endpoint, use_tqdm)
    [112](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:112) self.use_tqdm = use_tqdm
    [114](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:114) self.config = (
    [115](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:115)     config if isinstance(config, AutolabelConfig) else AutolabelConfig(config)
    [116](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:116) )
--> [117](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:117) self.task = TaskFactory.from_config(self.config)
    [118](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:118) self.llm: BaseModel = ModelFactory.from_config(
    [119](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:119)     self.config, cache=self.generation_cache, tokenizer=confidence_tokenizer
    [120](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:120) )
    [122](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/labeler.py:122) if self.config.confidence_chunk_column():

File /opt/homebrew/lib/python3.11/site-packages/autolabel/tasks/__init__.py:22, in TaskFactory.from_config(config)
     [20](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/tasks/__init__.py:20) try:
     [21](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/tasks/__init__.py:21)     task_type = TaskType(config.task_type())
---> [22](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/tasks/__init__.py:22)     task_cls = TASK_TYPE_TO_IMPLEMENTATION[task_type]
     [23](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/tasks/__init__.py:23)     return task_cls(config)
     [24](https://file+.vscode-resource.vscode-cdn.net/opt/homebrew/lib/python3.11/site-packages/autolabel/tasks/__init__.py:24) except ValueError as _:

KeyError: <TaskType.CLASSIFICATION: 'classification'>