apify / got-scraping

HTTP client made for scraping based on got.
423 stars 32 forks source link

Invalid `connection: close` header #70

Closed szmarczak closed 11 months ago

szmarczak commented 2 years ago

This occurs randomly and needs to be fixed in header-generator.

require('.').gotScraping.get('http://google.com').on('redirect', o => console.log(o.url.href)).then(res => console.log(res.request.options.headers, res.headers, res.body));
$ node demo.js 
http://www.google.com/
https://www.google.com/?gws_rd=ssl
node:internal/process/promises:246
          triggerUncaughtException(err, true /* fromPromise */);
          ^

RequestError: Invalid 'connection' header: close
    at Request._beforeError (/home/szm/Desktop/got-scraping/node_modules/got-cjs/dist/source/core/index.js:333:21)
    at Request._onResponseBase (/home/szm/Desktop/got-scraping/node_modules/got-cjs/dist/source/core/index.js:731:22)
    at ClientRequest.setHeader (/home/szm/Desktop/got-scraping/node_modules/http2-wrapper/source/client-request.js:520:10)
    at new ClientRequest (/home/szm/Desktop/got-scraping/node_modules/http2-wrapper/source/client-request.js:121:10)
    at module.exports (/home/szm/Desktop/got-scraping/node_modules/http2-wrapper/source/auto.js:195:29)
    at async Request._makeRequest (/home/szm/Desktop/got-scraping/node_modules/got-cjs/dist/source/core/index.js:1000:37)
    at async Request._onResponseBase (/home/szm/Desktop/got-scraping/node_modules/got-cjs/dist/source/core/index.js:728:17)
    at async Request._onResponse (/home/szm/Desktop/got-scraping/node_modules/got-cjs/dist/source/core/index.js:796:13) {
  input: undefined,
  code: 'ERR_GOT_REQUEST_ERROR',
  timings: undefined,
  options: Options {
    _unixOptions: { insecureHTTPParser: true },
    _internals: {
      request: [Function (anonymous)],
      agent: {
        http: TransformHeadersAgent {
          agent: Agent {
            _events: [Object: null prototype],
            _eventsCount: 2,
            _maxListeners: undefined,
            defaultPort: 80,
            protocol: 'http:',
            options: [Object: null prototype],
            requests: [Object: null prototype] {},
            sockets: [Object: null prototype] {},
            freeSockets: [Object: null prototype] {},
            keepAliveMsecs: 1000,
            keepAlive: false,
            maxSockets: Infinity,
            maxFreeSockets: 256,
            scheduling: 'lifo',
            maxTotalSockets: Infinity,
            totalSocketCount: 0,
            [Symbol(kCapture)]: false
          }
        },
        https: TransformHeadersAgent {
          agent: Agent {
            _events: [Object: null prototype],
            _eventsCount: 2,
            _maxListeners: undefined,
            defaultPort: 443,
            protocol: 'https:',
            options: [Object: null prototype],
            requests: [Object: null prototype] {},
            sockets: [Object: null prototype] {},
            freeSockets: [Object: null prototype] {},
            keepAliveMsecs: 1000,
            keepAlive: false,
            maxSockets: Infinity,
            maxFreeSockets: 256,
            scheduling: 'lifo',
            maxTotalSockets: Infinity,
            totalSocketCount: 0,
            maxCachedSessions: 100,
            _sessionCache: [Object],
            [Symbol(kCapture)]: false
          }
        },
        http2: undefined
      },
      h2session: undefined,
      decompress: true,
      timeout: {
        connect: undefined,
        lookup: undefined,
        read: undefined,
        request: 60000,
        response: undefined,
        secureConnect: undefined,
        send: undefined,
        socket: undefined
      },
      prefixUrl: '',
      body: undefined,
      form: undefined,
      json: undefined,
      cookieJar: undefined,
      ignoreInvalidCookies: false,
      searchParams: undefined,
      dnsLookup: undefined,
      dnsCache: undefined,
      context: {
        headerGenerator: HeaderGenerator {
          globalOptions: {
            browsers: [Array],
            operatingSystems: [Array],
            devices: [Array],
            locales: [Array],
            httpVersion: '2',
            browserListQuery: ''
          },
          browserListQuery: undefined,
          inputGeneratorNetwork: BayesianNetwork {
            nodesInSamplingOrder: [Array],
            nodesByName: [Object]
          },
          headerGeneratorNetwork: BayesianNetwork {
            nodesInSamplingOrder: [Array],
            nodesByName: [Object]
          },
          uniqueBrowsers: [
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object], [Object], [Object],
            [Object], [Object], [Object], [Object],
            ... 112 more items
          ]
        },
        useHeaderGenerator: true,
        insecureHTTPParser: true,
        sessionData: undefined
      },
      hooks: {
        init: [
          [Function: optionsValidationHandler],
          [Function: customOptionsHook]
        ],
        beforeRequest: [
          [Function: insecureParserHook],
          [Function: sessionDataHook],
          [Function: http2Hook],
          [AsyncFunction: proxyHook],
          [AsyncFunction: browserHeadersHook],
          [Function: tlsHook]
        ],
        beforeError: [],
        beforeRedirect: [ [Function: refererHook] ],
        beforeRetry: [],
        afterResponse: []
      },
      followRedirect: true,
      maxRedirects: 10,
      cache: undefined,
      throwHttpErrors: false,
      username: '',
      password: '',
      http2: true,
      allowGetBody: false,
      headers: {
        'sec-ch-ua': '"(Not(A:Brand";v="8", "Chromium";v="99", "Google Chrome";v="99"',
        'sec-ch-ua-mobile': '?0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
        accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-site',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US',
        connection: 'close',
        referer: 'http://www.google.com'
      },
      methodRewriting: false,
      dnsLookupIpVersion: undefined,
      parseJson: [Function: parse],
      stringifyJson: [Function: stringify],
      retry: {
        limit: 0,
        methods: [ 'GET', 'PUT', 'HEAD', 'DELETE', 'OPTIONS', 'TRACE' ],
        statusCodes: [
          408, 413, 429, 500,
          502, 503, 504, 521,
          522, 524
        ],
        errorCodes: [
          'ETIMEDOUT',
          'ECONNRESET',
          'EADDRINUSE',
          'ECONNREFUSED',
          'EPIPE',
          'ENOTFOUND',
          'ENETUNREACH',
          'EAI_AGAIN'
        ],
        maxRetryAfter: undefined,
        calculateDelay: [Function: calculateDelay],
        backoffLimit: Infinity,
        noise: 100
      },
      localAddress: undefined,
      method: 'GET',
      createConnection: undefined,
      cacheOptions: {
        shared: undefined,
        cacheHeuristic: undefined,
        immutableMinTimeToLive: undefined,
        ignoreCargoCult: undefined
      },
      https: {
        alpnProtocols: undefined,
        rejectUnauthorized: false,
        checkServerIdentity: undefined,
        certificateAuthority: undefined,
        key: undefined,
        certificate: undefined,
        passphrase: undefined,
        pfx: undefined,
        ciphers: 'TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-RSA-AES128-SHA:ECDHE-RSA-AES256-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA:AES256-SHA',
        honorCipherOrder: undefined,
        minVersion: 'TLSv1',
        maxVersion: 'TLSv1.3',
        signatureAlgorithms: 'ecdsa_secp256r1_sha256:rsa_pss_rsae_sha256:rsa_pkcs1_sha256:ecdsa_secp384r1_sha384:rsa_pss_rsae_sha384:rsa_pkcs1_sha384:rsa_pss_rsae_sha512:rsa_pkcs1_sha512',
        tlsSessionLifetime: undefined,
        dhparam: undefined,
        ecdhCurve: 'X25519:prime256v1:secp384r1',
        certificateRevocationLists: undefined
      },
      encoding: undefined,
      resolveBodyOnly: false,
      isStream: false,
      responseType: 'text',
      url: <ref *1> URL {
        [Symbol(context)]: URLContext {
          flags: 912,
          scheme: 'https:',
          username: '',
          password: '',
          host: 'www.google.com',
          port: null,
          path: [ '' ],
          query: 'gws_rd=ssl',
          fragment: null
        },
        [Symbol(query)]: URLSearchParams {
          [Symbol(query)]: [ 'gws_rd', 'ssl' ],
          [Symbol(context)]: [Circular *1]
        }
      },
      pagination: {
        transform: [Function: transform],
        paginate: [Function: paginate],
        filter: [Function: filter],
        shouldContinue: [Function: shouldContinue],
        countLimit: Infinity,
        backoff: 0,
        requestLimit: 10000,
        stackAllItems: false
      },
      setHost: true,
      maxHeaderSize: undefined
    },
    _merging: false,
    _init: [
      {
        handlers: [ [Function: fixDecompress] ],
        mutableDefaults: true,
        http2: true,
        https: { rejectUnauthorized: false },
        throwHttpErrors: false,
        timeout: { request: 60000 },
        retry: { limit: 0 },
        headers: { 'user-agent': undefined },
        context: {
          headerGenerator: HeaderGenerator {
            globalOptions: [Object],
            browserListQuery: undefined,
            inputGeneratorNetwork: [BayesianNetwork],
            headerGeneratorNetwork: [BayesianNetwork],
            uniqueBrowsers: [Array]
          },
          useHeaderGenerator: true,
          insecureHTTPParser: true
        },
        agent: {
          http: TransformHeadersAgent { agent: [Agent] },
          https: TransformHeadersAgent { agent: [Agent] }
        },
        hooks: {
          init: [
            [Function: optionsValidationHandler],
            [Function: customOptionsHook]
          ],
          beforeRequest: [
            [Function: insecureParserHook],
            [Function: sessionDataHook],
            [Function: http2Hook],
            [AsyncFunction: proxyHook],
            [AsyncFunction: browserHeadersHook],
            [Function: tlsHook]
          ],
          beforeRedirect: [ [Function: refererHook] ]
        }
      },
      { method: 'get' }
    ]
  }
}
szmarczak commented 2 years ago

Not sure whether the `connection` header is removed when redirecting; will check.

meotimdihia commented 1 year ago

I am getting this error too.

barjin commented 11 months ago

This shouldn't occur anymore with header-generator@2.1.35 and higher. Thanks for the patience!

barjin commented 11 months ago

This has been reintroduced in header-generator@2.1.36, because the earlier header-generator changes decreased anti-blocking performance. I will investigate this and use this issue for tracking.