langchain-ai / langchainjs

🦜🔗 Build context-aware reasoning applications 🦜🔗
https://js.langchain.com/docs/
MIT License
11.9k stars 1.99k forks source link

Request to OpenAI Embeddings too long #1581

Closed codergautam closed 10 months ago

codergautam commented 1 year ago

I have a list of long pdfs (20+) which I want to use in Pinecone DB. I have used some code to convert them into .txt files.

Now here is the code that is supposed to split them into chunks and feed them into vector database.

Data is just an array with text of each pdf. I want docs to be the chunked text which should be sent to embeddings.

const textSplitter = new RecursiveCharacterTextSplitter({
  chunkSize: 2000,
  chunkOverlap: 100,
});

let docs = [];
for (const d of data) {
  const docOutput = textSplitter.splitText(d);
  if(docOutput.length > 1) {
  docs = [...docs, ...docOutput];
  } else {
  docs.push(d);
  }
}

console.log("Initializing Store...");

let store;
if(pineconeIndex) {
  await PineconeStore.fromTexts(
    docs,
    docs.map((doc,i) => {return {id: i+1}}),
    new OpenAIEmbeddings({
      openAIApiKey: process.env.OPENAI_API_KEY
    }),
    {
      pineconeIndex,
    }
  )
}

But I am getting an error on the call to OpenAI embeddings. It seems like the request is being set with way more text than expected. I have logged the docs variable but each doc or chunk is less than 2000 characters only. Then why is this much data being sent?

Error:

  request: <ref *1> ClientRequest {
    _events: [Object: null prototype] {
      abort: [Function (anonymous)],
      aborted: [Function (anonymous)],
      connect: [Function (anonymous)],
      error: [Function (anonymous)],
      socket: [Function (anonymous)],
      timeout: [Function (anonymous)],
      finish: [Function: requestOnFinish]
    },
    _eventsCount: 7,
    _maxListeners: undefined,
    outputData: [],
    outputSize: 0,
    writable: true,
    destroyed: false,
    _last: true,
    chunkedEncoding: false,
    shouldKeepAlive: false,
    maxRequestsOnConnectionReached: false,
    _defaultKeepAlive: true,
    useChunkedEncodingByDefault: true,
    sendDate: false,
    _removedConnection: false,
    _removedContLen: false,
    _removedTE: false,
    strictContentLength: false,
    _contentLength: 6000558,
    _hasBody: true,
    _trailer: '',
    finished: true,
    _headerSent: true,
    _closed: false,
    socket: TLSSocket {
      _tlsOptions: [Object],
      _secureEstablished: true,
      _securePending: false,
      _newSessionPending: false,
      _controlReleased: true,
      secureConnecting: false,
      _SNICallback: null,
      servername: 'api.openai.com',
      alpnProtocol: false,
      authorized: true,
      authorizationError: null,
      encrypted: true,
      _events: [Object: null prototype],
      _eventsCount: 10,
      connecting: false,
      _hadError: false,
      _parent: null,
      _host: 'api.openai.com',
      _closeAfterHandlingError: false,
      _readableState: [ReadableState],
      _maxListeners: undefined,
      _writableState: [WritableState],
      allowHalfOpen: false,
      _sockname: null,
      _pendingData: null,
      _pendingEncoding: '',
      server: undefined,
      _server: null,
      ssl: [TLSWrap],
      _requestCert: true,
      _rejectUnauthorized: true,
      parser: null,
      _httpMessage: [Circular *1],
      [Symbol(res)]: [TLSWrap],
      [Symbol(verified)]: true,
      [Symbol(pendingSession)]: null,
      [Symbol(async_id_symbol)]: 222,
      [Symbol(kHandle)]: [TLSWrap],
      [Symbol(lastWriteQueueSize)]: 0,
      [Symbol(timeout)]: null,
      [Symbol(kBuffer)]: null,
      [Symbol(kBufferCb)]: null,
      [Symbol(kBufferGen)]: null,
      [Symbol(kCapture)]: false,
      [Symbol(kSetNoDelay)]: false,
      [Symbol(kSetKeepAlive)]: true,
      [Symbol(kSetKeepAliveInitialDelay)]: 60,
      [Symbol(kBytesRead)]: 0,
      [Symbol(kBytesWritten)]: 0,
      [Symbol(connect-options)]: [Object]
    },
    _header: 'POST /v1/embeddings HTTP/1.1\r\n' +
      'Accept: application/json, text/plain, */*\r\n' +
      'Content-Type: application/json\r\n' +
      'User-Agent: OpenAI/NodeJS/3.2.1\r\n' +
      'Authorization: Bearer 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\r\n' +
      'Content-Length: 6000558\r\n' +
      'Host: api.openai.com\r\n' +
      'Connection: close\r\n' +
      '\r\n',
    _keepAliveTimeout: 0,
    _onPendingData: [Function: nop],
    agent: Agent {
      _events: [Object: null prototype],
      _eventsCount: 2,
      _maxListeners: undefined,
      defaultPort: 443,
      protocol: 'https:',
      options: [Object: null prototype],
      requests: [Object: null prototype] {},
      sockets: [Object: null prototype],
      freeSockets: [Object: null prototype] {},
      keepAliveMsecs: 1000,
      keepAlive: false,
      maxSockets: Infinity,
      maxFreeSockets: 256,
      scheduling: 'lifo',
      maxTotalSockets: Infinity,
      totalSocketCount: 1,
      maxCachedSessions: 100,
      _sessionCache: [Object],
      [Symbol(kCapture)]: false
    },
    socketPath: undefined,
    method: 'POST',
    maxHeaderSize: undefined,
    insecureHTTPParser: undefined,
    joinDuplicateHeaders: undefined,
    path: '/v1/embeddings',
    _ended: true,
    res: IncomingMessage {
      _readableState: [ReadableState],
      _events: [Object: null prototype],
      _eventsCount: 4,
      _maxListeners: undefined,
      socket: [TLSSocket],
      httpVersionMajor: 1,
      httpVersionMinor: 1,
      httpVersion: '1.1',
      complete: true,
      rawHeaders: [Array],
      rawTrailers: [],
      joinDuplicateHeaders: undefined,
      aborted: false,
      upgrade: false,
      url: '',
      method: null,
      statusCode: 400,
      statusMessage: 'Bad Request',
      client: [TLSSocket],
      _consuming: false,
      _dumped: false,
      req: [Circular *1],
      responseUrl: 'https://api.openai.com/v1/embeddings',
      redirects: [],
      [Symbol(kCapture)]: false,
      [Symbol(kHeaders)]: [Object],
      [Symbol(kHeadersCount)]: 34,
      [Symbol(kTrailers)]: null,
      [Symbol(kTrailersCount)]: 0
    },
    aborted: false,
    timeoutCb: null,
    upgradeOrConnect: false,
    parser: null,
    maxHeadersCount: null,
    reusedSocket: false,
    host: 'api.openai.com',
    protocol: 'https:',
    _redirectable: Writable {
      _writableState: [WritableState],
      _events: [Object: null prototype],
      _eventsCount: 3,
      _maxListeners: undefined,
      _options: [Object],
      _ended: true,
      _ending: true,
      _redirectCount: 0,
      _redirects: [],
      _requestBodyLength: 6000558,
      _requestBodyBuffers: [],
      _onNativeResponse: [Function (anonymous)],
      _currentRequest: [Circular *1],
      _currentUrl: 'https://api.openai.com/v1/embeddings',
      [Symbol(kCapture)]: false
    },
    [Symbol(kCapture)]: false,
    [Symbol(kBytesWritten)]: 0,
    [Symbol(kNeedDrain)]: false,
    [Symbol(corked)]: 0,
    [Symbol(kOutHeaders)]: [Object: null prototype] {
      accept: [Array],
      'content-type': [Array],
      'user-agent': [Array],
      authorization: [Array],
      'content-length': [Array],
      host: [Array]
    },
    [Symbol(errored)]: null,
    [Symbol(kUniqueHeaders)]: null
  },
  response: {
    status: 400,
    statusText: 'Bad Request',
    headers: {
      date: 'Thu, 08 Jun 2023 02:21:04 GMT',
      'content-type': 'application/json',
      'content-length': '294',
      connection: 'close',
      'access-control-allow-origin': '*',
      'openai-organization': 'coder-gautam',
      'openai-processing-ms': '3201',
      'openai-version': '2020-10-01',
      'strict-transport-security': 'max-age=15724800; includeSubDomains',
      'x-ratelimit-limit-requests': '3000',
      'x-ratelimit-remaining-requests': '2999',
      'x-ratelimit-reset-requests': '20ms',
      'x-request-id': '323561b3096ab4283773a11455e2d86f',
      'cf-cache-status': 'DYNAMIC',
      server: 'cloudflare',
      'cf-ray': '7d3da8977fb822c8-ORD',
      'alt-svc': 'h3=":443"; ma=86400'
    },
    config: {
      transitional: [Object],
      adapter: [Function: httpAdapter],
      transformRequest: [Array],
      transformResponse: [Array],
      timeout: 0,
      xsrfCookieName: 'XSRF-TOKEN',
      xsrfHeaderName: 'X-XSRF-TOKEN',
      maxContentLength: -1,
      maxBodyLength: -1,
      validateStatus: [Function: validateStatus],
      headers: [Object],
      method: 'post',
      data: `{"model":"text-embedding-ada-002","input":["\\fIntroduction/Guideline Methodology  Antibiotic Prophylaxis in Spine Surgery | NASS Clinical Guidelines  1  Evidence-Based Clinical Guidelines for Multidisciplinary Spine Care  Antibiotic Prophylaxis in Spine Surgery  NASS Evidence-Based Clinical Guidelines Committee William O. Shaffer, MD Committee Chair Jamie Baisden, MD Robert Fernand, MD Paul Matz, MD This clinical guideline should not be construed as including all proper methods of care or excluding or other acceptable methods of care reasonably directed to obtaining the same results. The ultimate judgment regarding any specific procedure or treatment is to be made by the physician and patient in light of all circumstances presented by the patient and the needs and resources particular to the locality or institution.  \\fIntroduction/Guideline Methodology  2  Antibiotic Prophylaxis in Spine Surgery | NASS Clinical Guidelines  Financial Statement This clinical guideline was developed and funded in its entirety by the North American Spine Society (NASS).All participating authors have disclosed potential conflicts of interest consistent with NASS' disclosure policy. Disclosures are listed below:  William O. Shaffer, MD Paul G. Matz, MD Jamie Baisden, MD Robert Fernand, MD   Range Key: Level A. $100 to $1,000 Level B. $1,001 to $10,000 Level C. $10,001 to $25,000 Level D. $25,001 to $50,000 Level E. $50,001 to $100,000 Level F. $100,001 to $500,000 Level G. $500,001 to $1M Level H. $1,000,001 to $2.5M Level I. Greater than $2.5M  Comments Comments regarding the guideline may be submitted to the North American Spine Society and will be considered in development of future revisions of the work. Special Thanks The North American Spine Society would like to express its thanks to Dr. Nikolai Bogduk for generating the calculations in Appendix E to explain the prohibitive nature of the sample sizes required to yield Level I data for the efficacy of antibiotic prophylaxis. North American Spine Society Clinical Guidelines for Multidisciplinary Spine Care Antibiotic Prophylaxis in Spine Surgery Copyright � 2013 North American Spine Society 7075 Veterans Boulevard Burr Ridge, IL 60527 USA 630.230.3600 www.spine.org ISBN 1-929988-31-1 This clinical guideline should not be construed as including all proper methods of care or excluding or other acceptable methods of care reasonably directed to obtaining the same results. The ultimate judgment regarding any specific procedure or treatment is to be made by the physician and patient in light of all circumstances presented by the patient and the needs and resources particular to the locality or institution.  \\fIntroduction/Guideline Methodology  Antibiotic Prophylaxis in Spine Surgery | NASS Clinical Guidelines  3  Table of Contents  I.   A technical report, including the literature search parameters and evidentiary tables developed by the authors, can be accessed at http://www.spine.org/Documents/Antibiotic_Prophylaxis_TechRept.pdf  This clinical guideline should not be construed as including all proper methods of care or excluding or other acceptable methods of care reasonably directed to obtaining the same results. The ultimate judgment regarding any specific procedure or treatment is to be made by the physician and patient in light of all circumstances presented by the patient and the needs and resources particular to the locality or institution.  \\fIntroduction/Guideline Methodology  4 I. Introduction  Antibiotic Prophylaxis in Spine Surgery | NASS Clinical Guidelines  Objective The objective of the North American Spine Society (NASS) Evidence-Based Clinical Guideline on Antibiotic Prophylaxis in Spine Surgery is to provide evidence-based recommendations to address key clinical questions surrounding the use of prophylactic antibiotics in spine surgery. The guideline is intended to address these questions based on the highest quality clinical literature available on this subject as of June 2011. The goals of the guideline recommendations are to assist in delivering optimum, efficacious treatment with the goal of preventing surgical infection. Scope, Purpose and Intended User This document was developed by the North American Spine Society Evidence-based Guideline Development Committee as an educational tool to assist spine surgeons in preventing surgical site infections. This guideline is an update to the 2007 version. The NASS Clinical Guideline on Antibiotic Prophylaxis in Spine Surgery addresses the efficacy and appropriate protocol for antibiotic prophylaxis and discusses redosing, discontinuation, wound drains, as well as special considerations related to the potential impact of comorbidities on antibiotic prophylaxis proto-  col. The recommendations made in this guideline are based on evidence related to open procedures. No evidence was reviewed related to efficacy and protocol for the use of antibiotic prophylaxis in percutaneous procedures. THIS GUIDELINE DOES NOT REPRESENT A \\"STANDARD OF CARE,\\" nor is it intended as a fixed treatment protocol. It is anticipated that there will be patients who will require less or more treatment than the average. It is also acknowledged that in atypical cases, treatment falling outside this guideline will sometimes be necessary. This guideline should not be seen as prescribing the type, frequency or duration of intervention. Treatment should be based on the individual patient's need and doctor's professional judgment. This document is designed to function as a guideline and should not be used as the sole reason for denial of treatment and services. This guideline is not intended to expand or restrict a health care provider's scope of practice or to supersede applicable ethical standards or provisions of law. Patient Population The patient population for this guideline encompasses adults (18 years or older) undergoing spine surgery.  This clinical guideline should not be construed as including all proper methods of care or excluding or other acceptable methods of care reasonably directed to obtaining the same results. The ultimate judgment regarding any specific procedure or treatment is to be made by the physician and patient in light of all circumstances presented by the patient and the needs and resources particular to the locality or institution.  \\fIntroduction/Guideline Methodology  Antibiotic Prophylaxis in Spine Surgery | NASS Clinical Guidelines  5  II. Guideline Development Methodology  Through objective evaluation of the evidence and transparency in the process of making recommendations, it is NASS' goal to develop evidence-based clinical practice guidelines for the diagnosis and treatment of adult patients with various spinal conditions. These guidelines are developed for educational purposes to assist practitioners in their clinical decision-making processes. It is anticipated that where evidence is very strong in support of recommendations, these recommendations will be operationalized into performance measures. Multidisciplinary Collaboration With the goal of ensuring the best possible care for adult patients suffering with spinal disorders, NASS is committed to multidisciplinary involvement in the process of guideline and performance measure development. To this end, NASS has ensured that representatives from medical, interventional and surgical spine specialties have participated in the development and review of all NASS guidelines. To ensure broad-based representation, NASS has invited and welcomes input from other societies and specialties Evidence Analysis Training of All NASS Guideline Developers NASS has initiated, in conjunction with the University of Alberta's Centre for Health Evidence, an online training program geared toward educating guideline developers about evidence analysis and guideline development. All participants in guideline development for NASS have completed the training prior to participating in the guideline development program at NASS. This training includes a series of readings and exercises, or interactivities, to prepare guideline developers for systematically evaluating literature and developing evidence-based guidelines. The online course takes approximately 15-30 hours to complete and participants have been awarded CME credit upon completion of the course. Disclosure of Potential Conflicts of Interest All participants involved in guideline development have disclosed potential conflicts of interest to their colleagues and their potential conflicts have been documented in this guideline. Participants have been asked to update their disclosures regularly throughout the guideline development process.  Levels of Evidence and Grades of Recommendation NASS has adopted standardized levels of evidence (Appendix A) and grades of recommendation (Appendix B) to assist practitioners in easily understanding the strength of the evidence and recommendations within the guidelines. The levels of evidence range from Level I (high quality randomized controlled trial) to Level V (expert consensus). Grades of recommendation indi-  cate the strength of the recommendations made in the guideline based on the quality of the literature. Grades of Recommendation: A: Good evidence (Level I studies with consistent findings) for or against recommending intervention. B: Fair evidence (Level II or III studies with consistent findings) for or against recommending intervention. C: Poor quality evidence (Level IV or V studies) for or against recommending intervention. I: Insufficient or conflicting evidence not allowing a recommendation for or against intervention. Levels of evidence have very specific criteria and are assigned to studies prior to developing rec-ommendations. Recommendations are then graded based upon the level of evidence. To better un-derstand how levels of evidence inform the grades of recommendation and the standard nomencla-ture used within the recommendations see Appendix C. Guideline recommendations ar`... 5986486 more characters,
      url: 'https://api.openai.com/v1/embeddings'
    },
    request: <ref *1> ClientRequest {
      _events: [Object: null prototype],
      _eventsCount: 7,
      _maxListeners: undefined,
      outputData: [],
      outputSize: 0,
      writable: true,
      destroyed: false,
      _last: true,
      chunkedEncoding: false,
      shouldKeepAlive: false,
      maxRequestsOnConnectionReached: false,
      _defaultKeepAlive: true,
      useChunkedEncodingByDefault: true,
      sendDate: false,
      _removedConnection: false,
      _removedContLen: false,
      _removedTE: false,
      strictContentLength: false,
      _contentLength: 6000558,
      _hasBody: true,
      _trailer: '',
      finished: true,
      _headerSent: true,
      _closed: false,
      socket: [TLSSocket],
      _header: 'POST /v1/embeddings HTTP/1.1\r\n' +
        'Accept: application/json, text/plain, */*\r\n' +
        'Content-Type: application/json\r\n' +
        'User-Agent: OpenAI/NodeJS/3.2.1\r\n' +
        'Authorization: Bearer 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\r\n' +
        'Content-Length: 6000558\r\n' +
        'Host: api.openai.com\r\n' +
        'Connection: close\r\n' +
        '\r\n',
      _keepAliveTimeout: 0,
      _onPendingData: [Function: nop],
      agent: [Agent],
      socketPath: undefined,
      method: 'POST',
      maxHeaderSize: undefined,
      insecureHTTPParser: undefined,
      joinDuplicateHeaders: undefined,
      path: '/v1/embeddings',
      _ended: true,
      res: [IncomingMessage],
      aborted: false,
      timeoutCb: null,
      upgradeOrConnect: false,
      parser: null,
      maxHeadersCount: null,
      reusedSocket: false,
      host: 'api.openai.com',
      protocol: 'https:',
      _redirectable: [Writable],
      [Symbol(kCapture)]: false,
      [Symbol(kBytesWritten)]: 0,
      [Symbol(kNeedDrain)]: false,
      [Symbol(corked)]: 0,
      [Symbol(kOutHeaders)]: [Object: null prototype],
      [Symbol(errored)]: null,
      [Symbol(kUniqueHeaders)]: null
    },
    data: { error: [Object] }
  },
  isAxiosError: true,
  toJSON: [Function: toJSON],
  attemptNumber: 1,
  retriesLeft: 6
}
dosubot[bot] commented 10 months ago

Hi, @codergautam! I'm Dosu, and I'm here to help the LangChain team manage their backlog. I wanted to let you know that we are marking this issue as stale.

From what I understand, you are facing an issue with the OpenAI embeddings API where the request is being set with more data than expected, resulting in an error message indicating a content length of 6000558 characters. There hasn't been any activity or comments on the issue since you posted it.

Before we close this issue, we wanted to check with you if it is still relevant to the latest version of the LangChain repository. If it is, please let us know by commenting on the issue. Otherwise, feel free to close the issue yourself or it will be automatically closed in 7 days.

Thank you for your understanding and contribution to the LangChain project!