googleapis / python-documentai-toolbox

Document AI Toolbox is an SDK for Python that provides utility functions for managing, manipulating, and extracting information from the document response. It creates a "wrapped" document object from JSON files in Cloud Storage, local JSON files, or output directly from the Document AI API.
https://cloud.google.com/document-ai/docs/toolbox
Apache License 2.0
32 stars 13 forks source link

fix: Add trailing slash if not present for `gcs_prefix` in `Document.from_gcs()` to cover matching prefixes edge case. #274

Closed — holtskinner closed this 6 months ago

holtskinner commented 6 months ago

Fixes #271 🦕

holtskinner commented 6 months ago

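Error message that appeared in the tests before the `document.py` commit was applied. Because `gcs_prefix` had no trailing slash, the prefix `output/matching-prefixes/1` also matched the files under `output/matching-prefixes/11`, so shards from both output folders were loaded together and the shard-count check failed: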

=================================== FAILURES ===================================
_______ test_quickstart_sample_batch_process_metadata_matching_prefixes ________

capsys = <_pytest.capture.CaptureFixture object at 0x7f3543707c50>

    def test_quickstart_sample_batch_process_metadata_matching_prefixes(
        capsys: pytest.CaptureFixture,
    ) -> None:
        batch_process_metadata = documentai.BatchProcessMetadata(
            state=documentai.BatchProcessMetadata.State.SUCCEEDED,
            individual_process_statuses=[
                documentai.BatchProcessMetadata.IndividualProcessStatus(
                    input_gcs_source="gs://test-directory/documentai/input.pdf",
                    output_gcs_destination="gs://documentai_toolbox_samples/output/matching-prefixes/1",
                ),
                documentai.BatchProcessMetadata.IndividualProcessStatus(
                    input_gcs_source="gs://test-directory/documentai/input.pdf",
                    output_gcs_destination="gs://documentai_toolbox_samples/output/matching-prefixes/11",
                ),
            ],
        )
        wrapped_document = quickstart_sample.quickstart_sample(
>           batch_process_metadata=batch_process_metadata
        )

test_quickstart_sample.py:116: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
quickstart_sample.py:80: in quickstart_sample
    metadata=batch_process_metadata
../../google/cloud/documentai_toolbox/wrappers/document.py:581: in from_batch_process_metadata
    for process in list(metadata.individual_process_statuses)
../../google/cloud/documentai_toolbox/wrappers/document.py:581: in <listcomp>
    for process in list(metadata.individual_process_statuses)
../../google/cloud/documentai_toolbox/wrappers/document.py:507: in from_gcs
    shards = _get_shards(gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

gcs_bucket_name = 'documentai_toolbox_samples'
gcs_prefix = 'output/matching-prefixes/1'

    def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Document]:
        r"""Returns a list of `documentai.Document` shards from a Cloud Storage folder.

        Args:
            gcs_bucket_name (str):
                Required. The name of the gcs bucket.

                Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`.
            gcs_prefix (str):
                Required. The prefix of the json files in the target_folder.

                Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`.
        Returns:
            List[google.cloud.documentai.Document]:
                A list of documentai.Documents.

        """
        file_check = re.match(constants.FILE_CHECK_REGEX, gcs_prefix)
        if file_check is not None:
            raise ValueError("gcs_prefix cannot contain file types")

        byte_array = gcs_utilities.get_bytes(gcs_bucket_name, gcs_prefix)
        shards = [
            documentai.Document.from_json(byte, ignore_unknown_fields=True)
            for byte in byte_array
        ]

        if not shards:
            raise ValueError("Incomplete Document - No JSON files found.")

        total_shards = len(shards)

        if total_shards > 1:
            shards.sort(key=lambda x: int(x.shard_info.shard_index))

            for shard in shards:
                if int(shard.shard_info.shard_count) != total_shards:
                    raise ValueError(
>                       f"Invalid Document - shardInfo.shardCount ({shard.shard_info.shard_count}) does not match number of shards ({total_shards})."
                    )
E                   ValueError: Invalid Document - shardInfo.shardCount (1) does not match number of shards (6).

../../google/cloud/documentai_toolbox/wrappers/document.py:134: ValueError
-------- generated xml file: /workspace/samples/snippets/sponge_log.xml --------