Unstructured-IO / unstructured-api

Apache License 2.0
498 stars 106 forks source link

TypeError: __init__() got an unexpected keyword argument 'detection_class_prob' #237

Closed Jimchoo91 closed 12 months ago

Jimchoo91 commented 1 year ago

I tried uploading a 50-page PDF using the API and received this error:

TypeError: __init__() got an unexpected keyword argument 'detection_class_prob'

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-199-16b05c99d0ba> in <module>
     10 )
     11 
---> 12 docs = loader.load()

/opt/anaconda3/lib/python3.8/site-packages/langchain/document_loaders/unstructured.py in load(self)
     84     def load(self) -> List[Document]:
     85         """Load file."""
---> 86         elements = self._get_elements()
     87         if self.mode == "elements":
     88             docs: List[Document] = list()

/opt/anaconda3/lib/python3.8/site-packages/langchain/document_loaders/unstructured.py in _get_elements(self)
    267 
    268     def _get_elements(self) -> List:
--> 269         return get_elements_from_api(
    270             file_path=self.file_path,
    271             api_key=self.api_key,

/opt/anaconda3/lib/python3.8/site-packages/langchain/document_loaders/unstructured.py in get_elements_from_api(file_path, file, api_url, api_key, **unstructured_kwargs)
    202         from unstructured.partition.api import partition_via_api
    203 
--> 204         return partition_via_api(
    205             filename=file_path,
    206             file=file,

/opt/anaconda3/lib/python3.8/site-packages/unstructured/partition/api.py in partition_via_api(filename, content_type, file, file_filename, api_url, api_key, **request_kwargs)
     86                 "metadata_filename must be specified as well.",
     87             )
---> 88         files = [
     89             ("files", (metadata_filename, file, content_type)),  # type: ignore
     90         ]

/opt/anaconda3/lib/python3.8/site-packages/unstructured/staging/base.py in elements_from_json(filename, text, encoding)
    117     """Loads a list of elements from a JSON file or a string."""
    118     exactly_one(filename=filename, text=text)
--> 119 
    120     if filename:
    121         with open(filename, encoding=encoding) as f:

/opt/anaconda3/lib/python3.8/site-packages/unstructured/staging/base.py in dict_to_elements(element_dict)
    100                     metadata=metadata,
    101                 ),
--> 102             )
    103 
    104     return elements

/opt/anaconda3/lib/python3.8/site-packages/unstructured/staging/base.py in isd_to_elements(isd)
     75 def isd_to_elements(isd: List[Dict[str, Any]]) -> List[Element]:
     76     """Converts an Initial Structured Data (ISD) dictionary to a list of elements."""
---> 77     elements: List[Element] = []
     78 
     79     for item in isd:

/opt/anaconda3/lib/python3.8/site-packages/unstructured/documents/elements.py in from_dict(cls, input_dict)
    171         if isinstance(self.filename, pathlib.Path):
    172             self.filename = str(self.filename)
--> 173 
    174         if self.filename is not None:
    175             file_directory, filename = os.path.split(self.filename)

Anyone seen this before?

awalker4 commented 1 year ago

Hi there, we identified a breaking change in the latest version of the library. We've rolled back the hosted api while we work on a fix.

Jimchoo91 commented 1 year ago

Just to let you know, there is now a new error when using the UnstructuredAPIFileLoader in LangChain which seems related:

TypeError                                 Traceback (most recent call last)
<ipython-input-361-d7f06958231b> in <module>
      9 )
     10 
---> 11 chev_docs_1 = loader.load()

/opt/anaconda3/lib/python3.8/site-packages/langchain/document_loaders/unstructured.py in load(self)
     84     def load(self) -> List[Document]:
     85         """Load file."""
---> 86         elements = self._get_elements()
     87         if self.mode == "elements":
     88             docs: List[Document] = list()

/opt/anaconda3/lib/python3.8/site-packages/langchain/document_loaders/unstructured.py in _get_elements(self)
    267 
    268     def _get_elements(self) -> List:
--> 269         return get_elements_from_api(
    270             file_path=self.file_path,
    271             api_key=self.api_key,

/opt/anaconda3/lib/python3.8/site-packages/langchain/document_loaders/unstructured.py in get_elements_from_api(file_path, file, api_url, api_key, **unstructured_kwargs)
    202         from unstructured.partition.api import partition_via_api
    203 
--> 204         return partition_via_api(
    205             filename=file_path,
    206             file=file,

/opt/anaconda3/lib/python3.8/site-packages/unstructured/partition/api.py in partition_via_api(filename, content_type, file, file_filename, api_url, api_key, **request_kwargs)
     86                 "metadata_filename must be specified as well.",
     87             )
---> 88         files = [
     89             ("files", (metadata_filename, file, content_type)),  # type: ignore
     90         ]

/opt/anaconda3/lib/python3.8/site-packages/unstructured/staging/base.py in elements_from_json(filename, text, encoding)
    117     """Loads a list of elements from a JSON file or a string."""
    118     exactly_one(filename=filename, text=text)
--> 119 
    120     if filename:
    121         with open(filename, encoding=encoding) as f:

/opt/anaconda3/lib/python3.8/site-packages/unstructured/staging/base.py in dict_to_elements(element_dict)
    100                     metadata=metadata,
    101                 ),
--> 102             )
    103 
    104     return elements

/opt/anaconda3/lib/python3.8/site-packages/unstructured/staging/base.py in isd_to_elements(isd)
     75 def isd_to_elements(isd: List[Dict[str, Any]]) -> List[Element]:
     76     """Converts an Initial Structured Data (ISD) dictionary to a list of elements."""
---> 77     elements: List[Element] = []
     78 
     79     for item in isd:

/opt/anaconda3/lib/python3.8/site-packages/unstructured/documents/elements.py in from_dict(cls, input_dict)
    171     # Metadata extracted via regex
    172     regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None
--> 173 
    174     # Detection Model Class Probabilities from Unstructured-Inference Hi-Res
    175     detection_class_prob: Optional[float] = None

TypeError: __init__() got an unexpected keyword argument 'parent_id'

Now it is an issue with 'parent_id'. I updated the Unstructured library as recommended too and this error also occurred there.

awalker4 commented 12 months ago

Thanks for the heads up. Can you double-check that the update command took you to unstructured>=0.10.15? This should fix the problem. I'll make a PR against LangChain to enforce this as the new minimum version.

Jimchoo91 commented 12 months ago

Thanks for replying, the version is 0.10.16.

awalker4 commented 12 months ago

Strange, that version should know about parent_id. In any case, I removed the new field from the api, which should resolve this. Let me know if you're still having issues through Langchain.

Jimchoo91 commented 12 months ago

Thanks @awalker4, I tried running my query again and now receive a new error - should I put this in a separate bug?

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-369-d7f06958231b> in <module>
      9 )
     10 
---> 11 chev_docs_1 = loader.load()

/opt/anaconda3/lib/python3.8/site-packages/langchain/document_loaders/unstructured.py in load(self)
     84     def load(self) -> List[Document]:
     85         """Load file."""
---> 86         elements = self._get_elements()
     87         if self.mode == "elements":
     88             docs: List[Document] = list()

/opt/anaconda3/lib/python3.8/site-packages/langchain/document_loaders/unstructured.py in _get_elements(self)
    267 
    268     def _get_elements(self) -> List:
--> 269         return get_elements_from_api(
    270             file_path=self.file_path,
    271             api_key=self.api_key,

/opt/anaconda3/lib/python3.8/site-packages/langchain/document_loaders/unstructured.py in get_elements_from_api(file_path, file, api_url, api_key, **unstructured_kwargs)
    202         from unstructured.partition.api import partition_via_api
    203 
--> 204         return partition_via_api(
    205             filename=file_path,
    206             file=file,

/opt/anaconda3/lib/python3.8/site-packages/unstructured/partition/api.py in partition_via_api(filename, content_type, file, file_filename, api_url, api_key, **request_kwargs)
     88         files = [
     89             ("files", (metadata_filename, file, content_type)),  # type: ignore
---> 90         ]
     91         response = requests.post(
     92             api_url,

ValueError: Receive unexpected status code 504 from the API.

awalker4 commented 12 months ago

Yes please! That looks like a transient network error, so be sure to retry and create a new issue if it persists.

Jimchoo91 commented 11 months ago

Hi @awalker4, I raised this a couple of weeks ago on the following link:

https://github.com/Unstructured-IO/unstructured-api/issues/255

I still see the problem; any idea what it may be? I have also received a new one: 'ValueError: Receive unexpected status code 502 from the API.'