pgvector - (psycopg.DataError) PostgreSQL text fields cannot contain NUL (0x00) bytes

Checked other resources

[X] I added a very descriptive title to this issue.
[X] I searched the LangChain documentation with the integrated search.
[X] I used the GitHub search to find a similar question and didn't find it.
[X] I am sure that this is a bug in LangChain rather than my code.
[X] The bug is not resolved by updating to the latest stable version of LangChain (or the specific integration package).

Example Code

The following code failed:


vertexai.init(project=PROJECT_ID, location=REGION)

embedding_length = 768

# split the documents into chunks
text_splitter     = RecursiveCharacterTextSplitter(
    chunk_size    = 500,
    chunk_overlap = 100,
    separators    = ["\n\n", "\n", ".", "!", "?", ",", " ", ""],
)

def get_connection_string() -> str:
    """
    Construct the database connection string
    for the PGVector vector database.

    returns:
        The PGSQL connection string.
    """

    CONNECTION_STRING = PGVector.connection_string_from_db_params(
        driver   = "psycopg",
        host     = PGVECTOR_DB_HOST,
        port     = PGVECTOR_DB_PORT,
        database = PGVECTOR_DATABASE,
        user     = PGVECTOR_DB_USER,
        password = PASSWORD,
    )

    return CONNECTION_STRING

def get_embeddings() -> ce:
    """
    This is an embedding function which is called to reference
    GCP's embedding model. It passes the arguments to run it in
    batch mode with a pause, so that the API will not run into
    an error.

    returns:
        CustomEmbedding instance.
    """

    # Embeddings API integrated with langChain
    EMBEDDING_QPM = 100
    EMBEDDING_NUM_BATCH = 5

    embeddings = ce.CustomVertexAIEmbeddings(
        requests_per_minute=EMBEDDING_QPM,
        num_instances_per_batch=EMBEDDING_NUM_BATCH,
        model_name="text-multilingual-embedding-002"
    )
    return embeddings

def get_pgvector(collection_name: str) -> PGVector:
    """
    The PGVector instance is returned from this function.
    The instance is dependent on the collection name.

    arg:
        collection_name: a string variable, which designates
        a suppplier.

    return:
        PGVector instance for the supplier based on collection_name.
    """

    vector_store = PGVector(
        embeddings = get_embeddings(),
        collection_name = collection_name,
        connection  = get_connection_string(),
        use_jsonb=True,
    )

    return vector_store

def delete_embeddings_from_vectordb(collection_name):
    print(f"Deleting embeddings from collection-{collection_name}")
    logging.info(f"Deleting embeddings from collection-{collection_name}")

    vector_store = .get_pgvector(collection_name)

    # Delete the collection from pgvector
    vector_store.delete_collection()
    logging.info("Embedding deleted.")

def add_embeddings_to_vectordb(document_splits, collection_name):

    print(f"Collection name-{collection_name}")
    logging.info(f"Collection name-{collection_name}")

    vector_store = get_pgvector(collection_name)

    vector_store.add_documents(documents=document_splits)
    print("Embedding added.")
    logging.info("Embedding added.")

def embed_document(collection_name: str, document_uri: str):
    """
    args:
     collection_name: a string represents the supplier name 
       which is stored as a PGVector collection in the database.

     document_uri: a string which is a storage path in a GCS bucket
        where the supplier documents are stored.

     is_full_embedding_needed: the embedding process for entire supplier
        prefix (folder) in GCS to be done or only the selected documents to be 
        embedded.

     document_list: list of individual documents to be embedded, if above flag
        is_full_embedding_needed is False

    """

    logging.info(f"Processing documents from {GCS_BUCKET} in a path {document_uri}/to-be-processed")

    loader = GCSDirectoryLoader(project_name=PROJECT_ID, bucket=GCS_BUCKET, \
                                prefix=f"{document_uri}/to-be-processed")

    documents = loader.load()

    doc_splits = text_splitter.split_documents(documents)

    # Add chunk number to metadata
    for idx, split in enumerate(doc_splits):
        split.metadata["chunk"] = idx
        split.metadata["id"] = idx

    logging.info(f"# of documents after the document split = {len(doc_splits)}")

    if len(doc_splits) > 0:
        add_embeddings_to_vectordb(document_splits=doc_splits, \
                        collection_name=collection_name)

        # Please ignore this, it is to move files from different prefixs in a blob
        move_files_in_gcs(source_folder=f"{document_uri}/to-be-processed", \
                                  destination_folder=f"{document_uri}/processed")
        return True, "OK"
    else:
        return False, "No documents found in the supplier folder"

Error Message and Stack Trace (if applicable)

2024-09-04 14:23:04,307 (psycopg.DataError) PostgreSQL text fields cannot contain NUL (0x00) bytes [SQL: INSERT INTO langchain_pg_embedding (id, collection_id, embedding, document, cmetadata) VALUES (%(id_m0)s::VARCHAR, %(collection_id_m0)s::UUID, %(embedding_m0)s, %(document_m0)s::VARCHAR, %(cmetadata_m0)s::JSONB), (%(id_m1)s::VARCHAR, %(collection_id_m1)s::UUID, %(embedding_m1)s, %(document_m1)s::VARCHAR, %(cmetadata_m1)s::JSONB), (%(id_m2)s::VARCHAR, %(collection_id_m2)s::UUID, %(embedding_m2)s, %(document_m2)s::VARCHAR, %(cmetadata_m2)s::JSONB), (%(id_m3)s::VARCHAR, %(collection_id_m3)s::UUID, %(embedding_m3)s, %(document_m3)s::VARCHAR, %(cmetadata_m3)s::JSONB), (%(id_m4)s::VARCHAR, %(collection_id_m4)s::UUID, %(embedding_m4)s, %(document_m4)s::VARCHAR, %(cmetadata_m4)s::JSONB), (%(id_m5)s::VARCHAR, %(collection_id_m5)s::UUID, %(embedding_m5)s, %(document_m5)s::VARCHAR, %(cmetadata_m5)s::JSONB), (%(id_m6)s::VARCHAR, %(collection_id_m6)s::UUID, %(embedding_m6)s, %(document_m6)s::VARCHAR, %(cmetadata_m6)s::JSONB), (%(id_m7)s::VARCHAR, %(collection_id_m7)s::UUID, %(embedding_m7)s, %(document_m7)s::VARCHAR, %(cmetadata_m7)s::JSONB), (%(id_m8)s::VARCHAR, %(collection_id_m8)s::UUID, %(embedding_m8)s, %(document_m8)s::VARCHAR, %(cmetadata_m8)s::JSONB), (%(id_m9)s::VARCHAR, %(collection_id_m9)s::UUID, %(embedding_m9)s, %(document_m9)s::VARCHAR, %(cmetadata_m9)s::JSONB), (%(id_m10)s::VARCHAR, %(collection_id_m10)s::UUID, %(embedding_m10)s, %(document_m10)s::VARCHAR, %(cmetadata_m10)s::JSONB), (%(id_m11)s::VARCHAR, %(collection_id_m11)s::UUID, %(embedding_m11)s, %(document_m11)s::VARCHAR, %(cmetadata_m11)s::JSONB), (%(id_m12)s::VARCHAR, %(collection_id_m12)s::UUID, %(embedding_m12)s, %(document_m12)s::VARCHAR, %(cmetadata_m12)s::JSONB), (%(id_m13)s::VARCHAR, %(collection_id_m13)s::UUID, %(embedding_m13)s, %(document_m13)s::VARCHAR, %(cmetadata_m13)s::JSONB), (%(id_m14)s::VARCHAR, %(collection_id_m14)s::UUID, %(embedding_m14)s, %(document_m14)s::VARCHAR, %(cmetadata_m14)s::JSONB) ON CONFLICT (id) DO UPDATE SET embedding = excluded.embedding, document = excluded.document, cmetadata = excluded.cmetadata] [parameters: {'id_m0': '9c794529-2534-457d-b09e-120564e0203b', 'collection_id_m0': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m0': '[-0.023259006440639496,-0.026278316974639893,0.010832197964191437,0.02110976167023182,0.03138941153883934,0.010138949379324913,0.042534541338682175,0 ... (15922 characters truncated) ... 137787,-0.0432162843644619,0.0278224665671587,0.07601999491453171,-0.02350415289402008,0.01278616115450859,-0.022451436147093773,0.01470776554197073]', 'document_m0': 'Startup School: Gen AI - list of recommended labs and notebooks\n\nClass\n\nLabs covered\n\nNotebooks covered\n\nLabs can be completed in Cloud Skills Boost pla\x00orm, more instructions here', 'cmetadata_m0': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m1': '21c03067-1e96-4e0e-a61e-548b0c3c4c3b', 'collection_id_m1': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m1': '[-0.06288693845272064,-0.017322123050689697,0.00843889731913805,0.012165053747594357,0.059546373784542084,0.04053519293665886,0.0543924979865551,0.04 ... (15926 characters truncated) ... -0.08310684561729431,-0.004060924984514713,0.0043006762862205505,0.004421140532940626,0.03354359790682793,-0.05268661677837372,-0.009564831852912903]', 'document_m1': 'Notebooks can only be run using your own Cloud environment, more instructions here\n\n1 Current state of Generative AI Ge\x00ing Started with the Vertex AI Gemini API and Python SDK\n\nn/a\n\nMultimodality with Gemini', 'cmetadata_m1': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m2': '60f291bb-a829-47d5-b5b8-23029b5926cf', 'collection_id_m2': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m2': '[-0.015607654117047787,0.0012050960212945938,0.04723281413316727,0.02288687787950039,0.08967798203229904,0.04483583942055702,0.047026704996824265,0.0 ... (15890 characters truncated) ... .09412500262260437,-0.010505995713174343,0.043766316026449203,-0.0004982317914254963,0.020516254007816315,-0.01597626507282257,-0.009379896335303783]', 'document_m2': 'n/a\n\nMultimodality with Gemini\n\nApplications of Generative AI for your business\n\nIntroduction to Generative AI Learning Path\n\nn/a\n\n2 Exploring prompt engineering\n\nn/a', 'cmetadata_m2': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m3': 'fd9271fe-9f2d-4416-a050-b01dfcfa7d40', 'collection_id_m3': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m3': '[0.01780903898179531,-0.004708666820079088,0.014872205443680286,-0.04326470196247101,0.10042023658752441,0.02812151610851288,0.06731176376342773,0.01 ... (15939 characters truncated) ... -0.09371116012334824,-0.039024095982313156,0.03582580015063286,-0.027630938217043877,0.016092879697680473,0.0015013794181868434,0.002231260761618614]', 'document_m3': 'n/a\n\n2 Exploring prompt engineering\n\nn/a\n\nNotebook: Intro Gemini Notebook: Chain of Thought & React Notebook: Safety ratings & thresholds\n\nImage generation, editing, custom styling and beyond\n\nn/a', 'cmetadata_m3': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m4': 'e2e529a5-1fae-4388-9c97-fa13dd3098e1', 'collection_id_m4': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m4': '[-0.01722259446978569,0.039151113480329514,0.042904406785964966,-0.00693162064999342,0.01672654040157795,0.056566305458545685,0.06780373305082321,0.0 ... (15941 characters truncated) ... 07887300848960876,-0.0015596754383295774,-0.012516054324805737,-0.003459786996245384,0.0001272123772650957,0.002018331317231059,0.007937485352158546]', 'document_m4': 'n/a\n\nNotebook: Create High Quality Visual Assets with Imagen and Gemini\n\n3 Embeddings, vector databases\n\nn/a\n\nNotebook: Ge\x00ing Started with Text Embeddings + Vertex AI Vector Search\n\nCode generation, completion, chat\n\nn/a', 'cmetadata_m4': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m5': '76bb710f-a128-4ceb-adbb-cc63db6d6df8', 'collection_id_m5': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m5': '[-0.01236398983746767,0.004715809132903814,0.01416818704456091,-0.004159413278102875,0.007086075376719236,0.05418580397963524,0.07686776667833328,0.0 ... (15892 characters truncated) ... ,-0.04131390154361725,0.03247188776731491,-0.007179936859756708,-0.011165671981871128,0.029996506869792938,-0.024639440700411797,0.02816704846918583]', 'document_m5': 'Code generation, completion, chat\n\nn/a\n\nNotebooks: Code Generation\n\n4 Intro to RAG architectures, including Vertex AI Search\n\nIntegrate Search in Applications using Vertex AI Search', 'cmetadata_m5': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m6': '14394728-ac6d-4660-bb6f-6eb49b534411', 'collection_id_m6': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m6': '[-0.027659673243761063,0.0006991660920903087,0.025508370250463486,0.01155843771994114,0.029794376343488693,0.07038003206253052,0.10488440841436386,0. ... (15913 characters truncated) ... -0.05645660310983658,0.03617023304104805,0.006507235113531351,-0.004400421399623156,0.019326118752360344,-0.026365745812654495,0.0001150920579675585]', 'document_m6': 'Notebook: Multimodal Retrieval Augmented Generation (RAG) using Vertex AI Gemini API\n\nBuilding enterprise chat apps using GenAI\n\nn/a', 'cmetadata_m6': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m7': '4bdd26b1-7890-45df-afb4-be77f4a06af2', 'collection_id_m7': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m7': '[-0.010382091626524925,-0.026589123532176018,0.005429613869637251,-0.035534538328647614,-0.04687759280204773,0.05812354385852814,0.023599425330758095 ... (15865 characters truncated) ... 13,-0.030380338430404663,0.0339222326874733,0.021654268726706505,-0.01042112335562706,0.024006597697734833,-0.011356944218277931,0.05818868428468704]', 'document_m7': 'n/a\n\nCodelab: Create a Generative Chat App with Vertex AI Conversation Codelab: Increase intent coverage and handle errors gracefully with generative fallback Codelab: Informed decision making using Dialog\x00ow CX generators and data stores', 'cmetadata_m7': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m8': 'b917689d-0235-4023-8d59-fec37bfc0deb', 'collection_id_m8': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m8': '[-0.016646429896354675,-0.035836346447467804,-0.0057340324856340885,0.028716551139950752,0.025854725390672684,0.033178601413965225,0.0703235790133476 ... (15958 characters truncated) ... -0.09207435697317123,0.0035694115795195103,-0.017135247588157654,0.002087848959490657,0.04181625321507454,-0.04720042273402214,-0.003726722439751029]', 'document_m8': '5 Deploying and hosting apps in\n\nthe cloud\n\nn/a\n\nDemo App GitHub Repository - sample applications\n\nTuning & RLHF\n\nNotebook: Tuning and deploy a foundation model Notebook: Vertex AI LLM Reinforcement Learning from Human Feedback\n\n6 MLOps for Gen AI', 'cmetadata_m8': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m9': 'd85dfdfa-e30a-40f3-9b1f-725616174e1b', 'collection_id_m9': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m9': '[-0.05191882327198982,-0.024967215955257416,0.024289220571517944,0.02554929070174694,-0.0067266556434333324,0.015424377284944057,0.044294703751802444 ... (15864 characters truncated) ... 4,-0.08642246574163437,0.014378088526427746,0.037835948169231415,-0.02229861356317997,0.022061794996261597,-0.03275573253631592,0.010878358036279678]', 'document_m9': '6 MLOps for Gen AI\n\nn/a\n\nBlogpost Notebook: Evaluate LLMs with AutoSxS Model Eval\n\nApplication Development with Duet AI\n\nVisit this doc at goo.gle/GenAI-Labs each week to discover new recommended labs\n\nNotebook Instructions', 'cmetadata_m9': Jsonb({'source': 'gs://my-prod-bucket-s ... (135 chars)), 'id_m10': '7e016f25-1390-4818-9f63-3ef0a0622554', 'collection_id_m10': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m10': '[-0.005956327077001333,-0.03804914280772209,-0.007826329208910465,0.005197994410991669,0.0162068922072649,0.039616815745830536,0.04923781752586365,0. ... (15909 characters truncated) ... -0.055082451552152634,0.004437738563865423,0.026748552918434143,-0.001225739368237555,0.035878125578165054,-0.04693932458758354,0.004400981590151787]', 'document_m10': 'If our speakers cover a Notebook in class, you’ll need to use Google Colab or AI Vertex Workbench to run these, which will require you to use your own Cloud Console', 'cmetadata_m10': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars)), 'id_m11': 'd0fa5c3c-307e-4e97-ab8b-9fd773df2e95', 'collection_id_m11': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m11': '[-0.01988862454891205,-0.028791887685656548,0.007235800847411156,0.022267987951636314,0.04042388126254082,0.045118965208530426,0.04622763767838478,0. ... (15919 characters truncated) ... 7,-0.08161881566047668,-0.03470646217465401,0.05456053465604782,-0.0017264139605686069,0.013938860036432743,-0.03372414410114288,0.03529150411486626]', 'document_m11': '. This may have billable components, however we have a Free Trial with $300 in credits or our Cloud Program for startups which both o\x00er Cloud credits that you can use to run these Notebooks.', 'cmetadata_m11': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars)), 'id_m12': '4ff49daf-cd41-430e-a7d3-7e23f4f93b65', 'collection_id_m12': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m12': '[-0.004883579909801483,-0.028143372386693954,0.010797201655805111,0.04843907803297043,0.09141760319471359,0.012636261060833931,0.051852189004421234,0 ... (15910 characters truncated) ... 595,-0.05188891664147377,-0.01267238799482584,0.03661508113145828,-0.02903173863887787,0.02557116188108921,-0.06374701857566833,0.014439685270190239]', 'document_m12': 'This link should help you set up your \x00rst Google Cloud Project and set up an environment for Notebook.\n\nOur GitHub repository for GenAI notebooks is available here.\n\nLabs Instructions', 'cmetadata_m12': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars)), 'id_m13': '296e44ee-4062-4fa2-806d-76a22600072b', 'collection_id_m13': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m13': '[0.016365613788366318,0.012480733916163445,0.008040445856750011,0.045174866914749146,0.03872095048427582,0.046330638229846954,-0.02483428083360195,0. ... (15916 characters truncated) ... 0.07008633762598038,-0.032697923481464386,0.04701327532529831,0.009774026460945606,-0.010585346259176731,-0.014868056401610374,0.0033125909976661205]', 'document_m13': 'Remember to follow these steps to redeem your credits in Cloud Skills Boost. Paste this link when you are prompted for using a speci\x00c URL (and remember about Incognito Mode): h\x00ps://www.cloudskillsboost', 'cmetadata_m13': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars)), 'id_m14': '8d4144f7-7bf6-49a3-a02b-70b4f5d0a1fc', 'collection_id_m14': UUID('e9acec67-6afd-45dd-9999-509381ee1e22'), 'embedding_m14': '[0.001917202607728541,-0.03328728675842285,0.04176468402147293,0.022659817710518837,0.00808507390320301,0.01894487254321575,0.0740545243024826,0.0296 ... (15886 characters truncated) ... 0112,-0.046015415340662,0.009823916479945183,0.06310032308101654,0.02141757868230343,0.0055993665009737015,-0.03736981377005577,0.058463599532842636]', 'document_m14': '.cloudskillsboost.google/catalog_lab/1281?qlcampaign=1b-strsc-90', 'cmetadata_m14': Jsonb({'source': 'gs://my-prod-bucket-s ... (137 chars))}] (Background on this error at: https://sqlalche.me/e/20/9h9h)

Description

it works fine for most of the cases. But for some documents it throws error. As you can see embedding generation happened properly but it failed to insert to the database.

System Info

Libraries used:

aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.4.0
attrs==24.2.0
backoff==2.2.1
beautifulsoup4==4.12.3
CacheControl==0.14.0
cachetools==5.5.0
certifi==2024.7.4
cffi==1.17.0
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
contourpy==1.3.0
cryptography==43.0.0
cycler==0.12.1
dataclasses-json==0.6.7
deepdiff==8.0.0
Deprecated==1.2.14
docstring_parser==0.16
effdet==0.4.1
emoji==2.12.1
et-xmlfile==1.1.0
filelock==3.15.4
filetype==1.2.0
firebase-admin==6.5.0
flatbuffers==24.3.25
fonttools==4.53.1
frozenlist==1.4.1
fsspec==2024.6.1
google-api-core==2.19.2
google-api-python-client==2.142.0
google-auth==2.34.0
google-auth-httplib2==0.2.0
google-cloud-aiplatform==1.64.0
google-cloud-bigquery==3.25.0
google-cloud-core==2.4.1
google-cloud-firestore==2.18.0
google-cloud-pubsub==2.23.0
google-cloud-resource-manager==1.12.5
google-cloud-secret-manager==2.20.2
google-cloud-storage==2.18.2
google-cloud-vision==3.7.4
google-crc32c==1.5.0
google-resumable-media==2.7.2
googleapis-common-protos==1.65.0
greenlet==3.0.3
grpc-google-iam-v1==0.13.1
grpcio==1.66.0
grpcio-status==1.66.0
h11==0.14.0
httpcore==1.0.5
httplib2==0.22.0
httpx==0.27.2
httpx-sse==0.4.0
huggingface-hub==0.24.6
humanfriendly==10.0
idna==3.8
iopath==0.1.10
Jinja2==3.1.4
joblib==1.4.2
jsonpatch==1.33
jsonpath-python==1.0.6
jsonpointer==3.0.0
kiwisolver==1.4.5
langchain==0.2.15
langchain-community==0.2.13
langchain-core==0.2.35
langchain-google-community==1.0.8
langchain-google-vertexai==1.0.10
langchain-postgres==0.0.9
langchain-text-splitters==0.2.2
langdetect==1.0.9
langsmith==0.1.106
layoutparser==0.3.4
lxml==5.3.0
MarkupSafe==2.1.5
marshmallow==3.22.0
matplotlib==3.9.2
mpmath==1.3.0
msgpack==1.0.8
multidict==6.0.5
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.3
nltk==3.9.1
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.20
nvidia-nvtx-cu12==12.1.105
omegaconf==2.3.0
onnx==1.16.2
onnxruntime==1.19.0
opencv-contrib-python==4.10.0.84
opencv-python==4.10.0.84
openpyxl==3.1.5
orderly-set==5.2.1
orjson==3.10.7
packaging==24.1
pandas==2.2.2
pdf2image==1.17.0
pdfminer.six==20231228
pdfplumber==0.11.4
pgvector==0.2.5
pi_heif==0.18.0
pikepdf==9.2.0
pillow==10.4.0
poppler-utils==0.1.0
portalocker==2.10.1
proto-plus==1.24.0
protobuf==5.27.4
psutil==6.0.0
psycopg==3.2.1
psycopg-binary==3.2.1
psycopg-pool==3.2.2
pyasn1==0.6.0
pyasn1_modules==0.4.0
pycocotools==2.0.8
pycparser==2.22
pydantic==2.8.2
pydantic_core==2.20.1
PyJWT==2.9.0
pyparsing==3.1.4
pypdf==4.3.1
PyPDF2==3.0.1
pypdfium2==4.30.0
pytesseract==0.3.13
python-dateutil==2.9.0.post0
python-docx==1.1.2
python-iso639==2024.4.27
python-magic==0.4.27
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.2
rapidfuzz==3.9.6
regex==2024.7.24
requests==2.32.3
requests-toolbelt==1.0.0
rsa==4.9
safetensors==0.4.4
scipy==1.14.1
setuptools==74.0.0
shapely==2.0.6
six==1.16.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.32
sympy==1.13.2
tabulate==0.9.0
tenacity==8.3.0
timm==1.0.9
tokenizers==0.19.1
torch==2.4.0
torchvision==0.19.0
tqdm==4.66.5
transformers==4.44.2
triton==3.0.0
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.1
unstructured==0.15.8
unstructured-client==0.25.5
unstructured-inference==0.7.36
unstructured.pytesseract==0.3.13
uritemplate==4.1.1
urllib3==2.2.2
wrapt==1.16.0
yarl==1.9.4

Ubuntu machine following packages installed:


apt-get install python3-opencv
apt-get reinstall pkgconf-bin
apt-get install pkg-config
apt-get install -y poppler-utils

langchain-ai / langchain