langchain-ai / langchainjs

🦜🔗 Build context-aware reasoning applications 🦜🔗
https://js.langchain.com/docs/
MIT License
11.9k stars 1.99k forks source link

Failing to load dependencies & chromadb error #646

Closed l4b4r4b4b4 closed 1 year ago

l4b4r4b4b4 commented 1 year ago

I am running LangChain with Next.js 13 in a Docker container. I am able to ingest PDFs from the project root outside Docker into a ChromaDB instance running in another Docker container, but the whole process fails when I try to do the same thing in a Next.js API route!

At first it complained that it could not load the required dependencies (d3-dsv, mammoth, epub2, puppeteer, srt-parser-2, cohere-ai, @dqbd/tiktoken and hnswlib-node). Now it actually does ingest the docs, connects to ChromaDB and loads them successfully, but it still breaks off with:

Error in /api/document/upload: TypeError: Cannot read properties of undefined (reading 'data')
    at /usr/src/app/node_modules/.pnpm/chromadb@1.3.1/node_modules/chromadb/dist/main/index.js:136:29
   at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
   at async Collection.add (/usr/src/app/node_modules/.pnpm/chromadb@1.3.1/node_modules/chromadb/dist/main/index.js:124:26)
   at async Chroma.addVectors (webpack-internal:///(api)/./node_modules/.pnpm/langchain@0.0.48_@dqbd+tiktoken@1.0.3_chromadb@1.3.1_cohere-ai@6.2.0_d3-dsv@3.0.1_epub2@3.0.1_tnsy5tx2rwt6ouwh4fpahcqrzy/node_modules/langchain/dist/vectorstores/chroma.js:85:9)
   at async Chroma.addDocuments (webpack-internal:///(api)/./node_modules/.pnpm/langchain@0.0.48_@dqbd+tiktoken@1.0.3_chromadb@1.3.1_cohere-ai@6.2.0_d3-dsv@3.0.1_epub2@3.0.1_tnsy5tx2rwt6ouwh4fpahcqrzy/node_modules/langchain/dist/vectorstores/chroma.js:46:9)
  at async Chroma.fromDocuments (webpack-internal:///(api)/./node_modules/.pnpm/langchain@0.0.48_@dqbd+tiktoken@1.0.3_chromadb@1.3.1_cohere-ai@6.2.0_d3-dsv@3.0.1_epub2@3.0.1_tnsy5tx2rwt6ouwh4fpahcqrzy/node_modules/langchain/dist/vectorstores/chroma.js:127:9)
  at async upload (webpack-internal:///(api)/./pages/api/document/upload.ts:42:29)

General Setup:

  1. pnpm
  2. node v19
  3. NextJS 13:canary-32

My API route:

// pages/api/document/upload.ts (or .js)

import { NextApiRequest, NextApiResponse } from "next";
import formidable, { File } from "formidable";
import { CustomPDFLoader } from "@/lib/langchain/customPDFLoader";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { OpenAIEmbeddings } from "langchain/embeddings";
import { Chroma } from "langchain/vectorstores";

/**
 * API route handler: ingests one uploaded PDF into a Chroma collection.
 *
 * Steps: parse the multipart form, load the first uploaded file with
 * `CustomPDFLoader`, split it into overlapping chunks, embed the chunks with
 * OpenAI embeddings and store them in a Chroma collection named after the
 * file's `newFilename`.
 *
 * Responses:
 * - 405 when the request method is not POST.
 * - 400 when the form contains no uploaded file.
 * - 422 when the file produced no chunks to index.
 * - 200 on success.
 * - 500 when any step throws (the error is logged server-side).
 *
 * @param req Incoming request; the multipart body is read by `parseFormData`.
 * @param res Response used to send the JSON status payload.
 */
const upload = async (req: NextApiRequest, res: NextApiResponse) => {
  if (req.method !== "POST") {
    // Tell the client which method is accepted alongside the 405.
    res.setHeader("Allow", "POST");
    res.status(405).json({ error: "Method not allowed" });
    return;
  }

  try {
    const { files } = await parseFormData(req);

    // Only the first uploaded file is processed. Guard against a form that
    // carried no file at all instead of failing on a property access.
    const file = files[0];
    if (!file) {
      res.status(400).json({ error: "No file uploaded" });
      return;
    }

    const loader = new CustomPDFLoader(file.filepath);
    const rawDoc = await loader.load();

    const textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000,
      chunkOverlap: 200,
    });
    const doc = await textSplitter.splitDocuments(rawDoc);

    // Do not send an empty batch to the vector store.
    // NOTE(review): a PDF without extractable text presumably ends up here;
    // confirm against CustomPDFLoader's behaviour.
    if (doc.length === 0) {
      res.status(422).json({ error: "No text could be extracted from the file" });
      return;
    }

    await Chroma.fromDocuments(
      doc,
      new OpenAIEmbeddings({
        openAIApiKey: process.env.OPENAI_API_KEY!,
      }),
      {
        collectionName: file.newFilename,
        url: "http://chromadb:8000",
      }
    );
    res.status(200).json({ message: "Upload successful" });
  } catch (error) {
    console.error("Error in /api/document/upload:", error);
    res.status(500).json({ error: "Failed to upload files" });
  }
};

/**
 * Parses a multipart/form-data request with formidable and returns the files
 * submitted under the form field named `files`.
 *
 * @param req Incoming request whose body has not been consumed yet.
 * @returns A promise resolving to `{ files }`, where `files` is always an
 *   array: empty when the field is absent, one element for a single upload.
 *   The promise rejects with formidable's error when parsing fails.
 */
const parseFormData = (req: NextApiRequest) => {
  return new Promise<{ files: File[] }>((resolve, reject) => {
    const form = new formidable.IncomingForm();

    form.parse(req, (err, _fields, files) => {
      if (err) {
        reject(err);
        return;
      }

      // The field may be missing, a single file, or an array of files.
      // Normalise all three to an array; a missing field becomes [] rather
      // than [undefined], so callers can rely on the declared File[] type.
      const uploaded = files.files;
      let uploadedFiles: File[];
      if (uploaded == null) {
        uploadedFiles = [];
      } else if (Array.isArray(uploaded)) {
        uploadedFiles = uploaded;
      } else {
        uploadedFiles = [uploaded];
      }
      resolve({ files: uploadedFiles });
    });
  });
};

// The handler is the module's default export.
export default upload;

// Route-level configuration. The built-in body parser is switched off so the
// request body is still unread when parseFormData hands `req` to formidable.
export const config = {
  api: {
    bodyParser: false,
  },
};

package.json

{
  "private": true,
  "type": "module",
  "version": "0.0.1",
  "license": "MIT",
  "author": "Lucas Hänke de Cansino<lhc@next-boss.eu>",
  "engines": {
    "node": ">=18"
  },
  "scripts": {
    "dev": "next dev",
    "build": "next build",
    "lint": "eslint .",
    "lint:fix": "eslint . --fix",
    "typecheck": "tsc --noEmit",
    "prettier": "prettier --write \"./src/**/*.{ts,tsx}\"",
    "prettier:check": "prettier --check \"./src/**/*.{ts,tsx}\"",
    "storybook": "pnpm storybook dev -p 6006",
    "build-storybook": "storybook build",
    "start": "next start",
    "ingest": "tsx -r dotenv/config scripts/ingest-data.ts"
  },
  "dependencies": {
    "@dqbd/tiktoken": "^1.0.3",
    "@formatjs/intl-localematcher": "^0.2.32",
    "@headlessui/react": "^1.7.13",
    "@heroicons/react": "^2.0.17",
    "@hookform/resolvers": "^2.9.11",
    "@next/mdx": "^13.2.4",
    "@react-leaflet/core": "^2.1.0",
    "@stripe/stripe-js": "^1.52.0",
    "@types/mjml": "4.7.0",
    "@types/mjml-react": "2.0.6",
    "@types/nodemailer": "6.4.7",
    "bcryptjs": "^2.4.3",
    "chroma": "^0.0.1",
    "chromadb": "^1.3.1",
    "class-variance-authority": "^0.4.0",
    "clsx": "^1.2.1",
    "cohere-ai": "^6.2.0",
    "cookies-next": "^2.1.1",
    "d3-dsv": "^3.0.1",
    "date-fns": "^2.29.3",
    "dotenv": "^16.0.3",
    "epub2": "^3.0.1",
    "eslint-config-next": "^13.2.1",
    "eventsource-parser": "^0.1.0",
    "formidable": "^2.1.1",
    "framer-motion": "^10.10.0",
    "hnswlib-node": "^1.4.2",
    "html-to-text": "^9.0.5",
    "jsonwebtoken": "^9.0.0",
    "katex": "^0.16.4",
    "langchain": "^0.0.48",
    "ldapjs": "2.3.3",
    "ldapjs-promise": "^2.0.1",
    "leaflet": "^1.9.3",
    "leaflet-defaulticon-compatibility": "^0.1.1",
    "leaflet.markercluster": "^1.5.3",
    "mammoth": "^1.5.1",
    "mjml": "^4.13.0",
    "mjml-react": "^2.0.8",
    "mongodb": "^5.1.0",
    "multer": "^1.4.5-lts.1",
    "negotiator": "^0.6.3",
    "next": "13.2.5-canary.32",
    "next-auth": "^4.21.0",
    "next-connect": "^0.13.0",
    "next-fonts": "^1.5.1",
    "next-themes": "^0.2.1",
    "nodemailer": "^6.9.1",
    "nprogress": "^0.2.0",
    "openai": "^3.1.0",
    "pdf-parse": "^1.1.1",
    "pdfjs-dist": "^3.5.141",
    "prismjs": "^1.29.0",
    "puppeteer": "^19.8.3",
    "react": "^18.2.0",
    "react-cookie": "^4.1.1",
    "react-cookie-consent": "^8.0.1",
    "react-country-flag": "^3.1.0",
    "react-dom": "^18.2.0",
    "react-dropzone": "^14.2.3",
    "react-flip-numbers": "^3.0.7",
    "react-hook-form": "^7.43.9",
    "react-hot-toast": "^2.4.0",
    "react-leaflet": "^4.2.0",
    "react-markdown": "^8.0.6",
    "react-player": "^2.11.2",
    "react-social-icons": "^5.15.0",
    "react-syntax-highlighter": "^15.5.0",
    "react-wrap-balancer": "^0.4.0",
    "rehype": "^12.0.1",
    "rehype-code-title": "^1.0.0",
    "rehype-format": "^4.0.1",
    "rehype-katex": "^6.0.2",
    "rehype-mathjax": "^4.0.2",
    "rehype-prism": "^2.2.2",
    "rehype-prism-plus": "^1.5.1",
    "rehype-raw": "^6.1.1",
    "rehype-sanitize": "^5.0.1",
    "rehype-stringify": "^9.0.3",
    "remark": "^14.0.2",
    "remark-emoji": "^3.1.1",
    "remark-gfm": "^3.0.1",
    "remark-html": "^15.0.2",
    "remark-math": "^5.1.1",
    "remark-parse": "^10.0.1",
    "remark-prism": "^1.3.6",
    "remark-rehype": "^10.1.0",
    "server-only": "^0.0.1",
    "sharp": "^0.31.3",
    "srt-parser-2": "^1.2.2",
    "stripe": "^11.17.0",
    "swr": "^2.1.2",
    "uuid": "^9.0.0",
    "validator": "^13.9.0",
    "validatorjs": "^3.22.1",
    "web-streams-polyfill": "^3.2.1",
    "zod": "^3.20.6"
  },
  "devDependencies": {
    "@storybook/addon-actions": "^6.5.16",
    "@storybook/addon-essentials": "^6.5.16",
    "@storybook/addon-interactions": "^6.5.16",
    "@storybook/addon-links": "^6.5.16",
    "@storybook/blocks": "^7.0.0-alpha.8",
    "@storybook/nextjs": "^7.0.0-alpha.41",
    "@storybook/react": "^6.5.16",
    "@storybook/testing-library": "^0.0.14-next.1",
    "@tailwindcss/aspect-ratio": "^0.4.2",
    "@tailwindcss/forms": "^0.5.3",
    "@tailwindcss/typography": "^0.5.9",
    "@types/aria-query": "5.0.1",
    "@types/babel__core": "7.20.0",
    "@types/babel__generator": "7.6.4",
    "@types/babel__template": "7.4.1",
    "@types/babel__traverse": "7.18.3",
    "@types/bcryptjs": "2.4.2",
    "@types/body-parser": "1.19.2",
    "@types/connect": "3.4.35",
    "@types/eslint": "^8.37.0",
    "@types/eslint-scope": "3.7.4",
    "@types/estree": "0.0.51",
    "@types/express": "4.17.17",
    "@types/express-serve-static-core": "4.17.33",
    "@types/formidable": "^2.0.5",
    "@types/glob": "8.1.0",
    "@types/graceful-fs": "4.1.6",
    "@types/hast": "2.3.4",
    "@types/html-minifier-terser": "5.1.2",
    "@types/html-to-text": "9.0.0",
    "@types/is-function": "1.0.1",
    "@types/istanbul-lib-coverage": "2.0.4",
    "@types/istanbul-lib-report": "3.0.0",
    "@types/istanbul-reports": "3.0.1",
    "@types/json-schema": "7.0.11",
    "@types/json5": "0.0.29",
    "@types/jsonwebtoken": "^9.0.1",
    "@types/ldapjs": "2.2.5",
    "@types/lodash": "^4.14.192",
    "@types/mdast": "^3.0.11",
    "@types/mime": "3.0.1",
    "@types/mime-types": "2.1.1",
    "@types/minimatch": "5.1.2",
    "@types/mjml-core": "4.7.1",
    "@types/multer": "^1.4.7",
    "@types/negotiator": "0.6.1",
    "@types/node": "^18.15.11",
    "@types/node-fetch": "^2.6.3",
    "@types/normalize-package-data": "2.4.1",
    "@types/npmlog": "4.1.4",
    "@types/nprogress": "0.2.0",
    "@types/parse-json": "4.0.0",
    "@types/parse5": "5.0.3",
    "@types/pdf-parse": "^1.1.1",
    "@types/pretty-hrtime": "1.0.1",
    "@types/prop-types": "15.7.5",
    "@types/qs": "6.9.7",
    "@types/range-parser": "1.2.4",
    "@types/react": "^18.0.32",
    "@types/react-dom": "^18.0.11",
    "@types/react-syntax-highlighter": "^15.5.6",
    "@types/remark-prism": "^1.3.4",
    "@types/scheduler": "^0.16.3",
    "@types/semver": "7.3.13",
    "@types/serve-static": "1.15.1",
    "@types/source-list-map": "0.1.2",
    "@types/tapable": "1.0.8",
    "@types/uglify-js": "3.17.1",
    "@types/unist": "^2.0.6",
    "@types/uuid": "^9.0.1",
    "@types/validatorjs": "3.15.0",
    "@types/webidl-conversions": "7.0.0",
    "@types/webpack": "4.41.33",
    "@types/webpack-env": "1.18.0",
    "@types/webpack-sources": "3.2.0",
    "@types/whatwg-url": "11.0.0",
    "@types/yargs": "15.0.15",
    "@types/yargs-parser": "21.0.0",
    "autoprefixer": "^10.4.12",
    "eslint": "^8.37.0",
    "eslint-plugin-import": "^2.27.5",
    "eslint-plugin-jsx-a11y": "^6.7.1",
    "eslint-plugin-react": "^7.32.2",
    "eslint-plugin-react-hooks": "^4.6.0",
    "eslint-plugin-storybook": "^0.6.11",
    "eslint-plugin-unused-imports": "^2.0.0",
    "ignore-loader": "^0.1.2",
    "postcss": "^8.4.18",
    "storybook": "^6.5.16",
    "tailwind-scrollbar": "^2.1.0",
    "tailwindcss": "^3.3.1",
    "tailwindcss-hyphens": "^0.1.0",
    "tsx": "^3.12.6",
    "typescript": "^4.9.5"
  },
  "keywords": [
    "starter",
    "gpt4",
    "pinecone",
    "typescript",
    "nextjs",
    "langchain",
    "law",
    "legal",
    "pdf",
    "openai"
  ]
}

Dockerfile:

# Base image: Node.js 19 on Debian buster.
FROM node:19-buster

# System packages: C/C++ build toolchain, image/graphics development headers and Python.
# NOTE(review): presumably required to compile native npm modules during `pnpm install` — confirm which dependency needs them.
RUN apt-get update && \
    apt-get install -y build-essential libcairo2-dev libpango1.0-dev libjpeg-dev libgif-dev librsvg2-dev python

# Install pnpm globally; it is used below for dependency installation and as the entrypoint.
RUN npm install -g pnpm

# Create the application directory.
RUN mkdir -p /usr/src/app

# Port environment variable; matches the port exposed below.
ENV PORT 3000

# All following RUN, CMD, ENTRYPOINT, COPY and ADD instructions run relative to the app directory.
WORKDIR /usr/src/app

# Copy only the lockfile and manifest first, so the dependency-install layer below
# can be reused from cache as long as these two files are unchanged.
COPY pnpm-lock.yaml /usr/src/app
COPY package.json /usr/src/app

# Install the dependencies declared in package.json.
RUN pnpm install

# Copy the rest of the project into the image (after the install, to keep that layer cacheable).
COPY . /usr/src/app

# Document that the container listens on port 3000 at runtime.
EXPOSE 3000

# Start the app through the package.json "dev" script (`next dev`), i.e. the development server.
ENTRYPOINT ["pnpm", "run", "dev"]
nfcampos commented 1 year ago

That sounds like an issue inside the Chroma SDK, maybe @jeffchuber might be able to help?

l4b4r4b4b4 commented 1 year ago

Hmm, I actually don't think so, since it generally has issues resolving the needed dependencies, and I only fixed that by manually installing the dependencies into the root project. So I think the remaining problem with ChromaDB is still connected to that, because Chroma takes the embeddings just fine now: Bildschirmaufzeichnung vom 06.04.2023, 13:50:47.webm

l4b4r4b4b4 commented 1 year ago

I will try to narrow down the problem by running outside the Docker container and with a stripped-down project. Maybe something will come out of that!

l4b4r4b4b4 commented 1 year ago

All good. There were two problems, both NOT connected to LangChain!

  1. I did not know that 'peer dependencies' have to be installed by hand into the root project.
  2. The second problem is connected to a property of some PDFs. I don't understand why that is the case with some of them, but generally the pipeline works :)