langchain-ai / langchain

🦜🔗 Build context-aware reasoning applications
https://python.langchain.com
MIT License
95.23k stars 15.45k forks source link

pgvector - (psycopg.DataError) PostgreSQL text fields cannot contain NUL (0x00) bytes #26033

Open arindam-b opened 2 months ago

arindam-b commented 2 months ago

Checked other resources

Example Code

The following code failed:


vertexai.init(project=PROJECT_ID, location=REGION)

embedding_length = 768

# split the documents into chunks
text_splitter     = RecursiveCharacterTextSplitter(
    chunk_size    = 500,
    chunk_overlap = 100,
    separators    = ["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)

def get_connection_string() -> str:
    """
    Construct the database connection string
    for the PGVector vector database.

    returns:
        The PGSQL connection string.
    """

    CONNECTION_STRING = PGVector.connection_string_from_db_params(
        driver   = "psycopg",
        host     = PGVECTOR_DB_HOST,
        port     = PGVECTOR_DB_PORT,
        database = PGVECTOR_DATABASE,
        user     = PGVECTOR_DB_USER,
        password = PASSWORD,
    )

    return CONNECTION_STRING

def get_embeddings() -> ce:
    """
    This is an embedding function which is called to reference
    GCP's embedding model. It passes the arguments to run it in
    batch mode with a pause, so that the API will not run into
    an error.

    returns:
        CustomEmbedding instance.
    """

    # Embeddings API integrated with langChain
    EMBEDDING_QPM = 100
    EMBEDDING_NUM_BATCH = 5

    embeddings = ce.CustomVertexAIEmbeddings(
        requests_per_minute=EMBEDDING_QPM,
        num_instances_per_batch=EMBEDDING_NUM_BATCH,
        model_name="text-multilingual-embedding-002"
    )
    return embeddings

def get_pgvector(collection_name: str) -> PGVector:
    """
    The PGVector instance is returned from this function.
    The instance is dependent on the collection name.

    arg:
        collection_name: a string variable, which designates
        a suppplier.

    return:
        PGVector instance for the supplier based on collection_name.
    """

    vector_store = PGVector(
        embeddings = get_embeddings(),
        collection_name = collection_name,
        connection  = get_connection_string(),
        use_jsonb=True,
    )

    return vector_store

def delete_embeddings_from_vectordb(collection_name):
    print(f"Deleting embeddings from collection-{collection_name}")
    logging.info(f"Deleting embeddings from collection-{collection_name}")

    vector_store = .get_pgvector(collection_name)

    # Delete the collection from pgvector
    vector_store.delete_collection()
    logging.info("Embedding deleted.")

def add_embeddings_to_vectordb(document_splits, collection_name):

    print(f"Collection name-{collection_name}")
    logging.info(f"Collection name-{collection_name}")

    vector_store = get_pgvector(collection_name)

    vector_store.add_documents(documents=document_splits)
    print("Embedding added.")
    logging.info("Embedding added.")

def embed_document(collection_name: str, document_uri: str):
    """
    args:
     collection_name: a string represents the supplier name 
       which is stored as a PGVector collection in the database.

     document_uri: a string which is a storage path in a GCS bucket
        where the supplier documents are stored.

     is_full_embedding_needed: the embedding process for entire supplier
        prefix (folder) in GCS to be done or only the selected documents to be 
        embedded.

     document_list: list of individual documents to be embedded, if above flag
        is_full_embedding_needed is False

    """

    logging.info(f"Processing documents from {GCS_BUCKET} in a path {document_uri}/to-be-processed")

    loader = GCSDirectoryLoader(project_name=PROJECT_ID, bucket=GCS_BUCKET, \
                                prefix=f"{document_uri}/to-be-processed")

    documents = loader.load()

    doc_splits = text_splitter.split_documents(documents)

    # Add chunk number to metadata
    for idx, split in enumerate(doc_splits):
        split.metadata["chunk"] = idx
        split.metadata["id"] = idx

    logging.info(f"# of documents after the document split = {len(doc_splits)}")

    if len(doc_splits) > 0:
        add_embeddings_to_vectordb(document_splits=doc_splits, \
                        collection_name=collection_name)

        # Please ignore this, it is to move files from different prefixs in a blob
        move_files_in_gcs(source_folder=f"{document_uri}/to-be-processed", \
                                  destination_folder=f"{document_uri}/processed")
        return True, "OK"
    else:
        return False, "No documents found in the supplier folder"

Error Message and Stack Trace (if applicable)

2024-09-04 14:23:04,307 (psycopg.DataError) PostgreSQL text fields cannot contain NUL (0x00) bytes [SQL: INSERT INTO langchain_pg_embedding (id, collection_id, embedding, document, cmetadata) VALUES (%(id_m0)s::VARCHAR, %(collection_id_m0)s::UUID, %(embedding_m0)s, %(document_m0)s::VARCHAR, %(cmetadata_m0)s::JSONB), (%(id_m1)s::VARCHAR, %(collection_id_m1)s::UUID, %(embedding_m1)s, %(document_m1)s::VARCHAR, %(cmetadata_m1)s::JSONB), (%(id_m2)s::VARCHAR, %(collection_id_m2)s::UUID, %(embedding_m2)s, %(document_m2)s::VARCHAR, %(cmetadata_m2)s::JSONB), (%(id_m3)s::VARCHAR, %(collection_id_m3)s::UUID, %(embedding_m3)s, %(document_m3)s::VARCHAR, %(cmetadata_m3)s::JSONB), (%(id_m4)s::VARCHAR, %(collection_id_m4)s::UUID, %(embedding_m4)s, %(document_m4)s::VARCHAR, %(cmetadata_m4)s::JSONB), (%(id_m5)s::VARCHAR, %(collection_id_m5)s::UUID, %(embedding_m5)s, %(document_m5)s::VARCHAR, %(cmetadata_m5)s::JSONB), (%(id_m6)s::VARCHAR, %(collection_id_m6)s::UUID, %(embedding_m6)s, %(document_m6)s::VARCHAR, %(cmetadata_m6)s::JSONB), (%(id_m7)s::VARCHAR, %(collection_id_m7)s::UUID, %(embedding_m7)s, %(document_m7)s::VARCHAR, %(cmetadata_m7)s::JSONB), (%(id_m8)s::VARCHAR, %(collection_id_m8)s::UUID, %(embedding_m8)s, %(document_m8)s::VARCHAR, %(cmetadata_m8)s::JSONB), (%(id_m9)s::VARCHAR, %(collection_id_m9)s::UUID, %(embedding_m9)s, %(document_m9)s::VARCHAR, %(cmetadata_m9)s::JSONB), (%(id_m10)s::VARCHAR, %(collection_id_m10)s::UUID, %(embedding_m10)s, %(document_m10)s::VARCHAR, %(cmetadata_m10)s::JSONB), (%(id_m11)s::VARCHAR, %(collection_id_m11)s::UUID, %(embedding_m11)s, %(document_m11)s::VARCHAR, %(cmetadata_m11)s::JSONB), (%(id_m12)s::VARCHAR, %(collection_id_m12)s::UUID, %(embedding_m12)s, %(document_m12)s::VARCHAR, %(cmetadata_m12)s::JSONB), (%(id_m13)s::VARCHAR, %(collection_id_m13)s::UUID, %(embedding_m13)s, %(document_m13)s::VARCHAR, %(cmetadata_m13)s::JSONB), (%(id_m14)s::VARCHAR, %(collection_id_m14)s::UUID, %(embedding_m14)s, %(document_m14)s::VARCHAR, %(cmetadata_m14)s::JSONB) ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata] [parameters: {'id_m0': '9c794529-2534-457d-b09e-120564e0203b', 'collection_id_m0': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m0': '[-0.023259006440639496,-0.026278316974639893,0.010832197964191437,0.02110976167023182,0.03138941153883934,0.010138949379324913,0.042534541338682175,0 ... (15922 characters truncated) ... 137787,-0.0432162843644619,0.0278224665671587,0.07601999491453171,-0.02350415289402008,0.01278616115450859,-0.022451436147093773,0.01470776554197073]', 'document_m0': 'Startup School: Gen AI - list of recommended labs and notebooks\n\nClass\n\nLabs covered\n\nNotebooks covered\n\nLabs can be completed in Cloud Skills Boost pla\x00orm, more instructions here', 'cmetadata_m0': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m1': '21c03067-1e96-4e0e-a61e-548b0c3c4c3b', 'collection_id_m1': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m1': '[-0.06288693845272064,-0.017322123050689697,0.00843889731913805,0.012165053747594357,0.059546373784542084,0.04053519293665886,0.0543924979865551,0.04 ... (15926 characters truncated) ... -0.08310684561729431,-0.004060924984514713,0.0043006762862205505,0.004421140532940626,0.03354359790682793,-0.05268661677837372,-0.009564831852912903]', 'document_m1': 'Notebooks can only be run using your own Cloud environment, more instructions here\n\n1 Current state of Generative AI Ge\x00ing Started with the Vertex AI Gemini API and Python SDK\n\nn/a\n\nMultimodality with Gemini', 'cmetadata_m1': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m2': '60f291bb-a829-47d5-b5b8-23029b5926cf', 'collection_id_m2': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m2': '[-0.015607654117047787,0.0012050960212945938,0.04723281413316727,0.02288687787950039,0.08967798203229904,0.04483583942055702,0.047026704996824265,0.0 ... (15890 characters truncated) ... .09412500262260437,-0.010505995713174343,0.043766316026449203,-0.0004982317914254963,0.020516254007816315,-0.01597626507282257,-0.009379896335303783]', 'document_m2': 'n/a\n\nMultimodality with Gemini\n\nApplications of Generative AI for your business\n\nIntroduction to Generative AI Learning Path\n\nn/a\n\n2 Exploring prompt engineering\n\nn/a', 'cmetadata_m2': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m3': 'fd9271fe-9f2d-4416-a050-b01dfcfa7d40', 'collection_id_m3': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m3': '[0.01780903898179531,-0.004708666820079088,0.014872205443680286,-0.04326470196247101,0.10042023658752441,0.02812151610851288,0.06731176376342773,0.01 ... (15939 characters truncated) ... -0.09371116012334824,-0.039024095982313156,0.03582580015063286,-0.027630938217043877,0.016092879697680473,0.0015013794181868434,0.002231260761618614]', 'document_m3': 'n/a\n\n2 Exploring prompt engineering\n\nn/a\n\nNotebook: Intro Gemini Notebook: Chain of Thought & React Notebook: Safety ratings & thresholds\n\nImage generation, editing, custom styling and beyond\n\nn/a', 'cmetadata_m3': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m4': 'e2e529a5-1fae-4388-9c97-fa13dd3098e1', 'collection_id_m4': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m4': '[-0.01722259446978569,0.039151113480329514,0.042904406785964966,-0.00693162064999342,0.01672654040157795,0.056566305458545685,0.06780373305082321,0.0 ... (15941 characters truncated) ... 07887300848960876,-0.0015596754383295774,-0.012516054324805737,-0.003459786996245384,0.0001272123772650957,0.002018331317231059,0.007937485352158546]', 'document_m4': 'n/a\n\nNotebook: Create High Quality Visual Assets with Imagen and Gemini\n\n3 Embeddings, vector databases\n\nn/a\n\nNotebook: Ge\x00ing Started with Text Embeddings + Vertex AI Vector Search\n\nCode generation, completion, chat\n\nn/a', 'cmetadata_m4': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m5': '76bb710f-a128-4ceb-adbb-cc63db6d6df8', 'collection_id_m5': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m5': '[-0.01236398983746767,0.004715809132903814,0.01416818704456091,-0.004159413278102875,0.007086075376719236,0.05418580397963524,0.07686776667833328,0.0 ... (15892 characters truncated) ... ,-0.04131390154361725,0.03247188776731491,-0.007179936859756708,-0.011165671981871128,0.029996506869792938,-0.024639440700411797,0.02816704846918583]', 'document_m5': 'Code generation, completion, chat\n\nn/a\n\nNotebooks: Code Generation\n\n4 Intro to RAG architectures, including Vertex AI Search\n\nIntegrate Search in Applications using Vertex AI Search', 'cmetadata_m5': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m6': '14394728-ac6d-4660-bb6f-6eb49b534411', 'collection_id_m6': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m6': '[-0.027659673243761063,0.0006991660920903087,0.025508370250463486,0.01155843771994114,0.029794376343488693,0.07038003206253052,0.10488440841436386,0. ... (15913 characters truncated) ... -0.05645660310983658,0.03617023304104805,0.006507235113531351,-0.004400421399623156,0.019326118752360344,-0.026365745812654495,0.0001150920579675585]', 'document_m6': 'Notebook: Multimodal Retrieval Augmented Generation (RAG) using Vertex AI Gemini API\n\nBuilding enterprise chat apps using GenAI\n\nn/a', 'cmetadata_m6': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m7': '4bdd26b1-7890-45df-afb4-be77f4a06af2', 'collection_id_m7': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m7': '[-0.010382091626524925,-0.026589123532176018,0.005429613869637251,-0.035534538328647614,-0.04687759280204773,0.05812354385852814,0.023599425330758095 ... (15865 characters truncated) ... 13,-0.030380338430404663,0.0339222326874733,0.021654268726706505,-0.01042112335562706,0.024006597697734833,-0.011356944218277931,0.05818868428468704]', 'document_m7': 'n/a\n\nCodelab: Create a Generative Chat App with Vertex AI Conversation Codelab: Increase intent coverage and handle errors gracefully with generative fallback Codelab: Informed decision making using Dialog\x00ow CX generators and data stores', 'cmetadata_m7': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m8': 'b917689d-0235-4023-8d59-fec37bfc0deb', 'collection_id_m8': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m8': '[-0.016646429896354675,-0.035836346447467804,-0.0057340324856340885,0.028716551139950752,0.025854725390672684,0.033178601413965225,0.0703235790133476 ... (15958 characters truncated) ... -0.09207435697317123,0.0035694115795195103,-0.017135247588157654,0.002087848959490657,0.04181625321507454,-0.04720042273402214,-0.003726722439751029]', 'document_m8': '5 Deploying and hosting apps in\n\nthe cloud\n\nn/a\n\nDemo App GitHub Repository - sample applications\n\nTuning & RLHF\n\nNotebook: Tuning and deploy a foundation model Notebook: Vertex AI LLM Reinforcement Learning from Human Feedback\n\n6 MLOps for Gen AI', 'cmetadata_m8': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m9': 'd85dfdfa-e30a-40f3-9b1f-725616174e1b', 'collection_id_m9': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m9': '[-0.05191882327198982,-0.024967215955257416,0.024289220571517944,0.02554929070174694,-0.0067266556434333324,0.015424377284944057,0.044294703751802444 ... (15864 characters truncated) ... 4,-0.08642246574163437,0.014378088526427746,0.037835948169231415,-0.02229861356317997,0.022061794996261597,-0.03275573253631592,0.010878358036279678]', 'document_m9': '6 MLOps for Gen AI\n\nn/a\n\nBlogpost Notebook: Evaluate LLMs with AutoSxS Model Eval\n\nApplication Development with Duet AI\n\nVisit this doc at goo.gle/GenAI-Labs each week to discover new recommended labs\n\nNotebook Instructions', 'cmetadata_m9': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m10': '7e016f25-1390-4818-9f63-3ef0a0622554', 'collection_id_m10': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m10': '[-0.005956327077001333,-0.03804914280772209,-0.007826329208910465,0.005197994410991669,0.0162068922072649,0.039616815745830536,0.04923781752586365,0. ... (15909 characters truncated) ... -0.055082451552152634,0.004437738563865423,0.026748552918434143,-0.001225739368237555,0.035878125578165054,-0.04693932458758354,0.004400981590151787]', 'document_m10': 'If our speakers cover a Notebook in class, you’ll need to use Google Colab or AI Vertex Workbench to run these, which will require you to use your own Cloud Console', 'cmetadata_m10': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars)), 'id_m11': 'd0fa5c3c-307e-4e97-ab8b-9fd773df2e95', 'collection_id_m11': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m11': '[-0.01988862454891205,-0.028791887685656548,0.007235800847411156,0.022267987951636314,0.04042388126254082,0.045118965208530426,0.04622763767838478,0. ... (15919 characters truncated) ... 7,-0.08161881566047668,-0.03470646217465401,0.05456053465604782,-0.0017264139605686069,0.013938860036432743,-0.03372414410114288,0.03529150411486626]', 'document_m11': '. This may have billable components, however we have a Free Trial with $300 in credits or our Cloud Program for startups which both o\x00er Cloud credits that you can use to run these Notebooks.', 'cmetadata_m11': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars)), 'id_m12': '4ff49daf-cd41-430e-a7d3-7e23f4f93b65', 'collection_id_m12': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m12': '[-0.004883579909801483,-0.028143372386693954,0.010797201655805111,0.04843907803297043,0.09141760319471359,0.012636261060833931,0.051852189004421234,0 ... (15910 characters truncated) ... 595,-0.05188891664147377,-0.01267238799482584,0.03661508113145828,-0.02903173863887787,0.02557116188108921,-0.06374701857566833,0.014439685270190239]', 'document_m12': 'This link should help you set up your \x00rst Google Cloud Project and set up an environment for Notebook.\n\nOur GitHub repository for GenAI notebooks is available here.\n\nLabs Instructions', 'cmetadata_m12': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars)), 'id_m13': '296e44ee-4062-4fa2-806d-76a22600072b', 'collection_id_m13': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m13': '[0.016365613788366318,0.012480733916163445,0.008040445856750011,0.045174866914749146,0.03872095048427582,0.046330638229846954,-0.02483428083360195,0. ... (15916 characters truncated) ... 0.07008633762598038,-0.032697923481464386,0.04701327532529831,0.009774026460945606,-0.010585346259176731,-0.014868056401610374,0.0033125909976661205]', 'document_m13': 'Remember to follow these steps to redeem your credits in Cloud Skills Boost. Paste this link when you are prompted for using a speci\x00c URL (and remember about Incognito Mode): h\x00ps://www.cloudskillsboost', 'cmetadata_m13': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars)), 'id_m14': '8d4144f7-7bf6-49a3-a02b-70b4f5d0a1fc', 'collection_id_m14': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m14': '[0.001917202607728541,-0.03328728675842285,0.04176468402147293,0.022659817710518837,0.00808507390320301,0.01894487254321575,0.0740545243024826,0.0296 ... (15886 characters truncated) ... 0112,-0.046015415340662,0.009823916479945183,0.06310032308101654,0.02141757868230343,0.0055993665009737015,-0.03736981377005577,0.058463599532842636]', 'document_m14': '.cloudskillsboost.google/catalog_lab/1281?qlcampaign=1b-strsc-90', 'cmetadata_m14': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars))}] (Background on this error at: https://sqlalche.me/e/20/9h9h)

Description

System Info

Libraries used:

aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.4.0
attrs==24.2.0
backoff==2.2.1
beautifulsoup4==4.12.3
CacheControl==0.14.0
cachetools==5.5.0
certifi==2024.7.4
cffi==1.17.0
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
contourpy==1.3.0
cryptography==43.0.0
cycler==0.12.1
dataclasses-json==0.6.7
deepdiff==8.0.0
Deprecated==1.2.14
docstring_parser==0.16
effdet==0.4.1
emoji==2.12.1
et-xmlfile==1.1.0
filelock==3.15.4
filetype==1.2.0
firebase-admin==6.5.0
flatbuffers==24.3.25
fonttools==4.53.1
frozenlist==1.4.1
fsspec==2024.6.1
google-api-core==2.19.2
google-api-python-client==2.142.0
google-auth==2.34.0
google-auth-httplib2==0.2.0
google-cloud-aiplatform==1.64.0
google-cloud-bigquery==3.25.0
google-cloud-core==2.4.1
google-cloud-firestore==2.18.0
google-cloud-pubsub==2.23.0
google-cloud-resource-manager==1.12.5
google-cloud-secret-manager==2.20.2
google-cloud-storage==2.18.2
google-cloud-vision==3.7.4
google-crc32c==1.5.0
google-resumable-media==2.7.2
googleapis-common-protos==1.65.0
greenlet==3.0.3
grpc-google-iam-v1==0.13.1
grpcio==1.66.0
grpcio-status==1.66.0
h11==0.14.0
httpcore==1.0.5
httplib2==0.22.0
httpx==0.27.2
httpx-sse==0.4.0
huggingface-hub==0.24.6
humanfriendly==10.0
idna==3.8
iopath==0.1.10
Jinja2==3.1.4
joblib==1.4.2
jsonpatch==1.33
jsonpath-python==1.0.6
jsonpointer==3.0.0
kiwisolver==1.4.5
langchain==0.2.15
langchain-community==0.2.13
langchain-core==0.2.35
langchain-google-community==1.0.8
langchain-google-vertexai==1.0.10
langchain-postgres==0.0.9
langchain-text-splitters==0.2.2
langdetect==1.0.9
langsmith==0.1.106
layoutparser==0.3.4
lxml==5.3.0
MarkupSafe==2.1.5
marshmallow==3.22.0
matplotlib==3.9.2
mpmath==1.3.0
msgpack==1.0.8
multidict==6.0.5
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.3
nltk==3.9.1
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.20
nvidia-nvtx-cu12==12.1.105
omegaconf==2.3.0
onnx==1.16.2
onnxruntime==1.19.0
opencv-contrib-python==4.10.0.84
opencv-python==4.10.0.84
openpyxl==3.1.5
orderly-set==5.2.1
orjson==3.10.7
packaging==24.1
pandas==2.2.2
pdf2image==1.17.0
pdfminer.six==20231228
pdfplumber==0.11.4
pgvector==0.2.5
pi_heif==0.18.0
pikepdf==9.2.0
pillow==10.4.0
poppler-utils==0.1.0
portalocker==2.10.1
proto-plus==1.24.0
protobuf==5.27.4
psutil==6.0.0
psycopg==3.2.1
psycopg-binary==3.2.1
psycopg-pool==3.2.2
pyasn1==0.6.0
pyasn1_modules==0.4.0
pycocotools==2.0.8
pycparser==2.22
pydantic==2.8.2
pydantic_core==2.20.1
PyJWT==2.9.0
pyparsing==3.1.4
pypdf==4.3.1
PyPDF2==3.0.1
pypdfium2==4.30.0
pytesseract==0.3.13
python-dateutil==2.9.0.post0
python-docx==1.1.2
python-iso639==2024.4.27
python-magic==0.4.27
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.2
rapidfuzz==3.9.6
regex==2024.7.24
requests==2.32.3
requests-toolbelt==1.0.0
rsa==4.9
safetensors==0.4.4
scipy==1.14.1
setuptools==74.0.0
shapely==2.0.6
six==1.16.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.32
sympy==1.13.2
tabulate==0.9.0
tenacity==8.3.0
timm==1.0.9
tokenizers==0.19.1
torch==2.4.0
torchvision==0.19.0
tqdm==4.66.5
transformers==4.44.2
triton==3.0.0
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.1
unstructured==0.15.8
unstructured-client==0.25.5
unstructured-inference==0.7.36
unstructured.pytesseract==0.3.13
uritemplate==4.1.1
urllib3==2.2.2
wrapt==1.16.0
yarl==1.9.4

Ubuntu machine following packages installed:


apt-get install python3-opencv
apt-get reinstall pkgconf-bin
apt-get install pkg-config
apt-get install -y poppler-utils
m0han22 commented 3 weeks ago

I am also facing the same issue, any update on this one.