langchain-ai / langchainjs

🦜🔗 Build context-aware reasoning applications 🦜🔗
https://js.langchain.com/docs/
MIT License
12.39k stars 2.09k forks source link

FirecrawlLoader web loader not working. #6893

Open DevDeepakBhattarai opened 3 hours ago

DevDeepakBhattarai commented 3 hours ago

Checked other resources

Example Code

/**
 * Loads an external source, splits it into chunks, and indexes the chunks in
 * the "allweone" Pinecone index under the `${userId}-${dbiId}` namespace.
 *
 * Supported source types are "youtube", "webpage" (loaded through Firecrawl in
 * "scrape" mode) and "sitemap".
 *
 * @param dbiId - Identifier passed to `createFileRecord` and used as the second
 *   half of the Pinecone namespace.
 * @param data - Raw input; validated against `schema` before use.
 * @throws Error "Unauthorized" when there is no session.
 * @throws Error "Invalid data" when `data` fails schema validation.
 * @throws Error when the source type is not one of the supported values.
 * @throws Error "No docs found" when the loader returns no documents.
 */
export async function addKnowledgeFormExternalSource(
  dbiId: string,
  data: z.infer<typeof schema>,
): Promise<void> {
  const session = await auth();
  if (!session) {
    throw new Error("Unauthorized");
  }

  const userId = session.user.id;

  const parsedData = await schema.safeParseAsync(data);

  if (!parsedData.success) {
    throw new Error("Invalid data");
  }

  const { url, type } = parsedData.data;

  let loader: YoutubeLoader | FireCrawlLoader | SitemapLoader;
  switch (type) {
    case "youtube":
      loader = YoutubeLoader.createFromUrl(url, {
        language: "en",
        addVideoInfo: true,
      });
      break;
    case "webpage":
      loader = new FireCrawlLoader({
        url: url,
        apiKey: env.FIRECRAWL_API_KEY,
        mode: "scrape",
      });
      break;
    case "sitemap":
      loader = new SitemapLoader(url);
      break;
    default:
      // Guarantees `loader` is assigned on every path that reaches `load()`.
      throw new Error(`Unsupported source type: ${String(type)}`);
  }
  const docs = await loader.load();

  // Narrow once here instead of sprinkling non-null assertions below.
  const [firstDoc] = docs;
  if (firstDoc === undefined) {
    throw new Error("No docs found");
  }

  const embedding = new OpenAIEmbeddings();

  // Each source type stores its human-readable title under a different
  // metadata key. The key may be absent, so fall back to the URL rather than
  // passing `undefined` to `createFileRecord`.
  const titleKey =
    type === "youtube" ? "title" : type === "webpage" ? "ogTitle" : "source";
  const rawTitle: unknown = firstDoc.metadata?.[titleKey];
  const fileName =
    typeof rawTitle === "string" && rawTitle.length > 0 ? rawTitle : url;

  const fileId = await createFileRecord(fileName, dbiId);
  const pinecone = new Pinecone({
    apiKey: env.PINECONE_API_KEY,
  });
  const pineconeIndex = pinecone.Index("allweone");
  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 1000,
    chunkOverlap: 200,
  });

  const splitDocs = await splitter.splitDocuments(docs);
  // Tag every chunk with the file record so chunks can be traced back to it.
  const splitDocsWithMetadata = splitDocs.map((doc) => ({
    ...doc,
    metadata: {
      ...doc.metadata,
      fileId,
    },
  }));

  // Attach to the existing index; nothing is inserted at this point.
  const store = await PineconeStore.fromExistingIndex(embedding, {
    pineconeIndex,
    namespace: `${userId}-${dbiId}`,
  });

  // Chunk ids are `${fileId}#${index}`, i.e. the file id plus chunk position.
  await store.addDocuments(splitDocsWithMetadata, {
    ids: splitDocsWithMetadata.map((_, index) => `${fileId}#${index}`),
  });
}

Error Message and Stack Trace (if applicable)

⨯ TypeError: Cannot read properties of undefined (reading 'markdown') at Array.map () at async addKnowledgeFormExternalSource (./src/app/_actions/dbi/addKnowledge.tsx:86:18)

Description

I am trying to build a RAG app where I can pull in knowledge from YouTube, websites, and web pages. For the website, I am using Firecrawl, but it gives me this error.

I know the Firecrawl API is working because I can see the activity in the Firecrawl logs.

image

System Info

pnpm 9.4.0 @langchain/community 0.2.33

dosubot[bot] commented 3 hours ago

The TypeError: Cannot read properties of undefined (reading 'markdown') occurs because the firecrawlDocs array contains an element that is undefined or does not have the markdown property. This can happen if the response from the Firecrawl API does not include the expected data structure.

To fix this issue, you should add a check to ensure that each document in firecrawlDocs is defined and has the markdown property before attempting to access it. Here is the modified load method with the necessary checks:

/**
 * Fetches `this.url` through Firecrawl and converts the result into documents.
 *
 * In "scrape" mode a single page is requested; in "crawl" mode the crawl
 * result list is used. Entries that are missing or have no `markdown` are
 * skipped.
 *
 * @returns One document per Firecrawl entry that carries markdown content.
 * @throws Error when the scrape response reports `success` as falsy.
 * @throws Error when the crawl response does not contain a list of documents.
 * @throws Error when `this.mode` is neither "crawl" nor "scrape".
 */
public async load(): Promise<DocumentInterface[]> {
  const params: ConstructorParameters<typeof FirecrawlApp>[0] = {
    apiKey: this.apiKey,
  };
  if (this.apiUrl !== undefined) {
    params.apiUrl = this.apiUrl;
  }
  const app = new FirecrawlApp(params);

  // NOTE(review): the reported "reading 'markdown'" TypeError is consistent
  // with the SDK returning the payload at the top level of the response rather
  // than under `data` — confirm against the installed Firecrawl SDK version.
  // Accept both shapes: prefer a non-null `data` field, else use the response.
  const unwrap = (payload: unknown): unknown =>
    typeof payload === "object" &&
    payload !== null &&
    "data" in payload &&
    payload.data != null
      ? payload.data
      : payload;

  let firecrawlDocs: (FirecrawlDocument | undefined)[];

  if (this.mode === "scrape") {
    const response = await app.scrapeUrl(this.url, this.params);
    if (!response.success) {
      throw new Error(
        `Firecrawl: Failed to scrape URL. Error: ${response.error}`
      );
    }
    firecrawlDocs = [unwrap(response) as FirecrawlDocument];
  } else if (this.mode === "crawl") {
    const response = await app.crawlUrl(this.url, this.params, true);
    const crawled = unwrap(response);
    if (!Array.isArray(crawled)) {
      // Fail with a clear message instead of a TypeError from `.filter`.
      throw new Error(
        "Firecrawl: Crawl response did not contain a list of documents."
      );
    }
    firecrawlDocs = crawled as FirecrawlDocument[];
  } else {
    throw new Error(
      `Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`
    );
  }

  return firecrawlDocs
    .filter(
      (doc): doc is FirecrawlDocument =>
        doc != null && doc.markdown !== undefined
    )
    .map(
      (doc) =>
        new Document({
          pageContent: doc.markdown ?? "",
          metadata: doc.metadata ?? {},
        })
    );
}

This code filters out any undefined documents or documents that do not have the markdown property before mapping them to Document instances [1].

To continue talking to Dosu, mention @dosu.