ladjs / superagent

Ajax for Node.js and browsers (JS HTTP client). Maintained for @forwardemail, @ladjs, @spamscanner, @breejs, @cabinjs, and @lassjs.
https://ladjs.github.io/superagent/
MIT License
16.59k stars 1.33k forks source link

Piping data but only on 200 OK #1575

Open kousu opened 4 years ago

kousu commented 4 years ago

I want to download/upload large files with superagent. .pipe() conveniently streams data, but it inconveniently ignores all HTTP errors. I want to separate HTTP 404, 500, 403, etc as errors to stderr instead of writing their error pages out as if they were my content.

I'm having trouble wrapping my head around why this isn't allowed:

// get.js
let request = require('superagent');

var req = request.get(process.argv[2])
req.buffer(false)
req.then(res => {
   console.log(res.status)
   res.pipe(process.stdout)
 })
 .catch(err => {
   console.log('error!')
   console.log(err.status, err.message)
 })
$ node get.js https://google.com
(node:44322) [DEP0066] DeprecationWarning: OutgoingMessage.prototype._headers is deprecated
(Use `node --trace-deprecation ...` to show where the warning was created)
200
error!
undefined .end() was called twice. This is not supported in superagent
$ node get.js https://google.com/404
error!
404 Not Found

I get the exact same output if I use req as suggested in https://github.com/visionmedia/superagent/issues/1188 (as suggested in the docs):

 req.buffer(false)
 req.then(res => {
    console.log(res.status, res.message)
-   res.pipe(process.stdout)
+   req.pipe(process.stdout)
  })
  .catch(err => {
    console.log('error!')

I understand that .pipe() and .then() are incompatible because .then() triggers the parsing system (https://github.com/visionmedia/superagent/issues/1187#issuecomment-281995106) -- but https://github.com/visionmedia/superagent/issues/1187#issuecomment-281995580 makes it sound like .buffer(false) should sidestep the parsing system.

I am wondering how to download data in a stream without buffering it all while also looking at the headers.

There is this comment https://github.com/visionmedia/superagent/issues/1187#issuecomment-281995580

It is in a weird place that only your parser sees. #950

but I'm having trouble understanding that. https://github.com/visionmedia/superagent/issues/950#issuecomment-282112032 offers

I think buffered responses should be the default, and unbuffered responses should be only opt-in.

but that doesn't explain to me how to get the body content.


I'm trying to achieve what I can do in python-requests: https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests/16696317#16696317

def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                # If you have chunk encoded response uncomment if
                # and set chunk_size parameter to None.
                #if chunk: 
                f.write(chunk)
    return local_filename

r.raise_for_status() checks r.status -- which it has because at that point it has parsed the HTTP headers -- and throws an exception on non-200s. But the rest of the data hasn't been read off the socket at that point so it won't crash on e.g. 2GB files.

--

I'm putting this out there in case you, or any user of the library, has a better clue how to do this than I've found so far. I don't expect you to go out of your way to solve my problem for me. Thanks for your work on superagent, and I hope you are somewhere with friends and family during the lockdowns.

kousu commented 4 years ago

I found a solution. I read https://github.com/visionmedia/superagent/blob/2fcea621c69e3cc779bc59f5f3e8677c2cce8f99/src/node/parsers/text.js and understood more clearly what the "weird place" comment meant: it's a callback attached to the response object: res.on('data'):

// get.js
let request = require('superagent');

let pipe = (source, sink) => {
   source.on('data', chunk => {
     console.error('got some data of len ', chunk.length); // DEBUG 
     sink.write(chunk)
  })
} 

var req = request.get(process.argv[2])
req.buffer(false)
req.then(res => {
   console.error(res.status)
   pipe(res, process.stdout)
   //res.pipe(process.stdout) // nope. not this one.
 })
 .catch(err => {
   console.error('error!')
   console.error(err.status, err.message)
 })
$ node get.js https://example.net
200
<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;

    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>
    <p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>

<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;

    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>
    <p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
$ node get.js https://example.net/404
error!
404 Not Found

It would be great to know if this is actually the officially blessed way to do this. It would also be nice if my pipe could be adopted as res.pipe() if that makes sense to you, because (#1188) lots of people's intuitions, including mine, think it should work like that. And could .pipe() behave like .then() in that HTTP errors are shunted to exceptions?


I know it's a bit late, but I wonder if this would help any of you

Did any of you come up with another solution?

kousu commented 4 years ago

Nevermind, my solution doesn't work. It works on some files, but if Content-Type: application/octet-stream (or presumably one of the other types in:

https://github.com/visionmedia/superagent/blob/2fcea621c69e3cc779bc59f5f3e8677c2cce8f99/src/node/parsers/index.js#L1-L9

)

then my source.on('data', ...) is ignored and nothing comes out. I expected setting .buffer(false) would overrule the mimetype detection; but it doesn't, in fact the mimetype detection overrules it at parser = exports.parse[mime]:

https://github.com/visionmedia/superagent/blob/2fcea621c69e3cc779bc59f5f3e8677c2cce8f99/src/node/index.js#L1055-L1082

I tried to circumvent this by setting an explicit parser that did nothing:

req.parse( (res,cb) => null ) 

but this didn't work and I don't know why. The mimetype detection is strong and works even when I work against it.


Anyway, I figured out what the reliable thing to do is. As mentioned https://github.com/visionmedia/superagent/issues/1161#issuecomment-361930999 and @julien-f you can replace .then() with req.on('response') to catch the response after the headers are parsed but before the body is processed (but this only works if you also set .buffer(false)), and in there you can use .on('data') as above; unfortunately, this event happens before the automatic error-checking kicks in, but I dug that out and patched it in too:

// get.js
let request = require('superagent');

let pipe = (source, sink) => {
   source.on('data', chunk => {
     console.error('got some data of len ', chunk.length); // DEBUG
     sink.write(chunk)
   })
}

var req = request.get(process.argv[2])
req.buffer(false)
req.on('response', res => {
  if(request.Request.prototype._isResponseOK(res)) { // this helper is in a funny place
     console.error(res.status)
     pipe(res, process.stdout)
  }
  })
 .catch(err => {
   console.error('error!')
   console.error(err.status, err.message)
  })
kousu commented 4 years ago

Third time's the charm: it turns out the last version works perfectly, except for application/json, because of this hardcoded exception just for JSON:

https://github.com/visionmedia/superagent/blob/2fcea621c69e3cc779bc59f5f3e8677c2cce8f99/src/node/index.js#L1084-L1087

This circumvents that and is just as good:

// get.js
let request = require('superagent');

let pipe = (source, sink) => {
   source.on('data', chunk => {
     console.error('got some data of len ', chunk.length); // DEBUG
     sink.write(chunk)
   })
}

var req = request.get(process.argv[2])
req.buffer(false)
req.parse((res, cb) => {
  if(res.statusCode == 200) {
     console.error(res.status)
     pipe(res, process.stdout)
     res.on('end', () => { cb(null, undefined, null) })
  }
})
req.on('response', res => {
  })
 .catch(err => {
   console.error('error!')
   console.error(err.status, err.message)
  })
kousu commented 4 years ago

Here are some test cases you can try these with:

node get.js https://openneuro.org/crn/datasets/ds000001/files/README # 200 OK, application/octet-stream

node get.js https://example.org # 200 OK, text/html

node get.js https://example.org/404 # 404 Not Found

node get.js https://openneuro.org/crn/datasets/ds000001/files/sub-16:anat:sub-16_inplaneT2.nii.gz # 200 OK, application/gzip

node get.js https://openneuro.org/crn/datasets/ds000001/files/ldsfosdjfisdjfiujsdiufj # 200 OK, application/octet-stream
 # `{"error": "file not found"}`

node get.js https://openneuro.org/crn/datasets/ds000001/files/dataset_description.json # 200 OK, application/json

node get.js https://openneuro.org/crn/ # 200 OK, text/plain

The first version produces no output on the application/octet-stream, text/plain or application/json cases, and on the application/gzip case it produces output but it's pre-unzipped, which is not what I wanted for my application. The second version fixes application/octet-stream and application/gzip. The third one covers them all, and fingers-crossed that's all of them there are to cover.

kousu commented 4 years ago

It would be easier for this use case if the decision between .pipe()/.parse() could happen after parsing headers. Request._parser isn't invoked until it gets to the body content anyway and after the headers are already parsed.