watson-developer-cloud / node-sdk

:comet: Node.js library to access IBM Watson services.
https://www.npmjs.com/package/ibm-watson
Apache License 2.0
1.48k stars 669 forks source link

[Discovery] Error when uploading a JSON document held in a Buffer #474

Closed buteomont closed 7 years ago

buteomont commented 7 years ago

The following code demonstrates the problem:

'use strict';
var extend=require('util')._extend;
var fs=require('fs');
var watson=require('watson-developer-cloud'); 
var DiscoveryV1 = require('watson-developer-cloud/discovery/v1');
var discovery = new DiscoveryV1({
  username: '(redacted)',
  password: '(redacted)',
  version_date: DiscoveryV1.VERSION_DATE_2016_12_15
});
var envID= "(redacted)";
var configID= "(redacted)";
var collectionID="(redacted)";

var inputDir = './testdocs';
console.log('input dir: ' + inputDir);
var files = getFiles(inputDir);
console.log('returned files: ' + files.length);
for (var i=0; i<3; i++) {
  if(/^\..*/.test(files[i])) {
     continue;
  }
  var fileName = inputDir + '/' + files[i];
  console.log('i: ' + i + 'adding file: ' + fileName);
  var fileVal = new Buffer(fs.readFileSync(fileName));

  var params = {
        "environment_id": envID,
        "collection_id": collectionID,
        "file": fileVal,
        "metadata": {'action': 'testing'}
      };
  console.log('params: ' + JSON.stringify(params));
  discovery.addDocument(params, function(error,results) {
        if(error) {
          console.log('error adding document: ' + JSON.stringify(error));
        } else {
          console.log('file added successfully');
        }
  });
}
function getFiles(dir)
   {
   var allfiles=fs.readdirSync(dir);
   console.log('num files: ' + allfiles.length);
   return allfiles;
   }

I expect this code to read the JSON document, store it in a Buffer, and then upload the buffer to the Discovery service. Instead, it returns this error:

/home/david/git/ccb2-contentbridge/node_modules/request/node_modules/combined-stream/node_modules/delayed-stream/lib/delayed_stream.js:33
  source.on('error', function() {});
         ^

TypeError: source.on is not a function
    at Function.DelayedStream.create (/home/david/git/ccb2-contentbridge/node_modules/request/node_modules/combined-stream/node_modules/delayed-stream/lib/delayed_stream.js:33:10)
    at FormData.CombinedStream.append (/home/david/git/ccb2-contentbridge/node_modules/request/node_modules/combined-stream/lib/combined_stream.js:43:37)
    at FormData.append (/home/david/git/ccb2-contentbridge/node_modules/request/node_modules/form-data/lib/form_data.js:68:3)
    at Request.init.appendFormValue (/home/david/git/ccb2-contentbridge/node_modules/request/request.js:326:21)
    at Request.init (/home/david/git/ccb2-contentbridge/node_modules/request/request.js:337:11)
    at new Request (/home/david/git/ccb2-contentbridge/node_modules/request/request.js:130:8)
    at request (/home/david/git/ccb2-contentbridge/node_modules/request/index.js:54:10)
    at createRequest (/home/david/git/ccb2-contentbridge/node_modules/watson-developer-cloud/lib/requestwrapper.js:173:10)
    at DiscoveryV1.addDocument (/home/david/git/ccb2-contentbridge/node_modules/watson-developer-cloud/discovery/v1.js:339:10)
    at Object.<anonymous> (/home/david/git/ccb2-contentbridge/test.js:50:13)

Node version 4.1.0 SDK version 2.32.1

nfriedly commented 7 years ago

Hum, that's odd, it seems to think it's a stream for some reason.

I'll take a closer look as soon as I get a chance, but I think this may work as a temporary workaround:

  var params = {
        "environment_id": envID,
        "collection_id": collectionID,
        // wrap fileValue in an object and give it an arbitrary filename
        "file": { value: fileVal, options: { filename: '_' } },
        "metadata": {'action': 'testing'}
      };

Can you try it and let me know?

buteomont commented 7 years ago

Same exact error.

Thanks for looking into this.

On 06/27/2017 02:48 PM, Nathan Friedly wrote:

Hum, that's odd, it seems to think it's a stream for some reason.

I'll take a closer look as soon as I get a chance, but I think this may work as a temporary workaround:

var params = {
  "environment_id": envID,
  "collection_id": collectionID,
  // wrap fileValue in an object and give it an arbitrary filename
  "file": { value: fileVal, options: { filename: '_' } },
  "metadata": {'action': 'testing'}
};

Can you try it and let me know?

— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub https://github.com/watson-developer-cloud/node-sdk/issues/474#issuecomment-311465332, or mute the thread https://github.com/notifications/unsubscribe-auth/AB0bVzC-E_cFiQsNhlDjWZoYhOyxxan8ks5sIVyKgaJpZM4OHFGp.

nfriedly commented 7 years ago

Darn. Ok, it looks like @mamoonraja is going to work on this, probably sooner than I'll be able to.

mamoonraja commented 7 years ago

Can you please use fs.createReadStream instead of fs.readFileSync? Try something like this:

var uploadedFiles = [];
fs.createReadStream(dir)
  .on('entry', function(fileObj) {
    if (uploadedFiles.length < 3 && fileObj.type == 'File') {
      var params = {
        "environment_id": envID,
        "collection_id": collectionID,
        "file": fileObj,
        "metadata": { 'action': 'testing' }
      };
      uploadedFiles.push(fileObj.path);
      discovery.addDocument(params, function(error, results) {
        if (error) {
          console.log('error adding document: ' + JSON.stringify(error));
        } else {
          console.log('file added successfully');
        }
      });
    }
  })

And also make sure you are using the latest version of SDK.

buteomont commented 7 years ago

That would invalidate my use case. What I posted is just a test program that demonstrates the problem. In the actual application that I am developing, the file system is not even in the picture - I am retrieving JSON documents from a Cloudant database and ingesting them directly into Discovery.

As stated earlier in the thread, I am using version 2.32.1 of the SDK.

On 06/27/2017 05:26 PM, Mamoon Raja wrote:

Can you please use |fs.createReadStream| instead of |fs.readFileSync|. Try something like this:

var uploadedFiles = [];
fs.createReadStream(dir)
  .on('entry', function(fileObj) {
    if (uploadedFiles.length < 3 && fileObj.type == 'File') {
      var params = {
        "environment_id": envID,
        "collection_id": collectionID,
        "file": fileObj,
        "metadata": { 'action': 'testing' }
      };
      uploadedFiles.push(fileObj.path);
      discovery.addDocument(params, function(error, results) {
        if (error) {
          console.log('error adding document: ' + JSON.stringify(error));
        } else {
          console.log('file added successfully');
        }
      });
    }
  })

And also make sure you are using the latest version of SDK.

— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub https://github.com/watson-developer-cloud/node-sdk/issues/474#issuecomment-311503869, or mute the thread https://github.com/notifications/unsubscribe-auth/AB0bV2IhUoygI7ydBwShO8URbneoemiAks5sIYGegaJpZM4OHFGp.

germanattanasio commented 7 years ago

@buteomont did you try converting the string into a stream? Something like https://stackoverflow.com/questions/12755997/how-to-create-streams-from-string-in-node-js

buteomont commented 7 years ago

Thanks @germanattanasio, that was a good find. Unfortunately it still gives the same error. Here is what I changed:

var Readable = require('stream').Readable;
var s = new Readable;
s.push(fileVal);
s.push(null);      // indicates end-of-file basically - the end of the stream
var params = {
        "environment_id": envID,
        "collection_id": collectionID,
        // wrap fileValue in an object and give it an arbitrary filename
        "file": { value: s, options: { filename: 'dummy.json' } },
        "metadata": {'action': 'testing'}
      };

Same source.on error.

germanattanasio commented 7 years ago

Try with

var Readable = require('stream').Readable;
var s = new Readable;
s.push(fileVal);
s.push(null);
var params = {
  "environment_id": envID,
  "collection_id": collectionID,
  "file": s,
  "metadata": {'action': 'testing'}
};

buteomont commented 7 years ago

Same error.

On 06/27/2017 05:52 PM, German Attanasio wrote:

Try with

var Readable = require('stream').Readable;
var s = new Readable;
s.push(fileVal);
s.push(null);
var params = {
  "environment_id": envID,
  "collection_id": collectionID,
  "file": s,
  "metadata": {'action': 'testing'}
};

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/watson-developer-cloud/node-sdk/issues/474#issuecomment-311508281, or mute the thread https://github.com/notifications/unsubscribe-auth/AB0bV9NFuQNr0r9xTRG0mw5_fUYLlBa6ks5sIYefgaJpZM4OHFGp.

GwilymNewton commented 7 years ago

I am also hitting this bug; my code is as follows:

      var file = JSON.stringify(msg.payload);

      console.log("File", file);

      var document_obj = {
        environment_id: environment,
        collection_id: collection,
        file: file,
        metadata: {
          "content-type": "application/json"
        }
      };

      console.log("About to send to discovery");
      try {
        discovery.addDocument(document_obj, function (err, response) {
          console.log("We made it here");
          if (err) {
            console.error(err);
          } else {
            console.log(JSON.stringify(response, null, 2));
          }
        });
      } catch (e) {
        console.log(e);
      }

Stack Trace:

about to send to discovery
TypeError: source.on is not a function
    at Function.DelayedStream.create (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/delayed-stream/lib/delayed_stream.js:33:10)
    at FormData.CombinedStream.append (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/combined-stream/lib/combined_stream.js:43:37)
    at FormData.append (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/form-data/lib/form_data.js:68:3)
    at appendFormValue (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/request/request.js:326:21)
    at Request.init (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/request/request.js:337:11)
    at new Request (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/request/request.js:130:8)
    at request (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/request/index.js:54:10)
    at createRequest (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/watson-developer-cloud/lib/requestwrapper.js:174:10)
    at DiscoveryV1.addDocument (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/watson-developer-cloud/discovery/v1.js:433:10)
    at DiscoveryInsert.<anonymous> (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/discovery-insert/insert.js:62:19)

GwilymNewton commented 7 years ago

The workarounds suggested for @buteomont also don't work for me. I am internal to IBM, so feel free to message me via Verse.

My use case also requires bypassing the file system, I have a stream of JSON objects arriving every second into my system, which I want to post to discovery.

chughts commented 7 years ago

I don't think that this is the appropriate forum for this question as it is not identifying a bug or a feature request for the SDK. This problem is related to how to create readstreams from file buffer in Node.js, consequently the appropriate forum is Stack Overflow, where there are hundreds of skilled Node.js practitioners that will be able to solve this.

GwilymNewton commented 7 years ago

@chughts I am aware this is not the appropriate forum for asking a question.

I have already established how to do what I want; however, the code is failing, and I believe the reason is the same bug @buteomont is hitting, as you can hopefully see from my stack trace.

I am explaining my use case as it appears to be quite similar to @buteomont's.

nfriedly commented 7 years ago

I'm looking at this now. I've replicated the bug on Node v4.1.0 and also on v6.10.3 and v8.1.3 - so it's not related to the Node.js version.

nfriedly commented 7 years ago

Arg, it was the metadata. It's expecting a string, but when you pass in an object that isn't a Buffer, the underlying form-data library apparently assumes it's a Stream :/

Calling JSON.stringify() on it before passing it to addDocument() makes everything work.

The JSDoc does specify that it's expecting a string here, but IMHO that's a bad API. I'm going to update it to also accept objects, and then your example should work.

nfriedly commented 7 years ago

Released in v2.33.0

GwilymNewton commented 7 years ago

@nfriedly My code is not working with it.

      var file = JSON.stringify(msg.payload);

      console.log("File", file);

      var document_obj = {
        environment_id: environment,
        collection_id: collection,
        file:file,
        metadata: {
          "content-type": "application/json"
        }
      };

      console.log("About to send to discovery");
      try {
        discovery.addDocument(document_obj, function (err, response) {
          console.log("We made it here");
          if (err) {
            console.error(err);
          } else {
            console.log(JSON.stringify(response, null, 2));
          }
        });
      } catch (e) {
        console.log(e);
      }

It appears to ignore the metadata now, and thus refuses to accept my JSON content because it thinks it is plain text.

File {"test_attiribute":"test data"} About to send to discovery We made it here { Error: The Media Type [text/plain] of the input document is not supported. Auto correction was attempted, but the auto detected media type [text/plain] is also not supported. Supported Media Types are: application/json, application/msword, application/vnd.openxmlformats-officedocument.wordprocessingml.document, application/pdf, text/html, application/xhtml+xml . at Request._callback (/Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert/node_modules/watson-developer-cloud/lib/requestwrapper.js:87:15)

I have updated: gwilyms-mbp:node-red-contrib-watson-discovery-insert gwilymnewton$ npm list | grep watson node-red-contrib-discovery-insert@0.0.1 /Users/gwilymnewton/Box Sync/wells Fargo Center/repos/node-red-contrib-watson-discovery-insert └─┬ watson-developer-cloud@2.33.0

Node Version gwilyms-mbp:node-red-contrib-watson-discovery-insert gwilymnewton$ node -v v6.10.3

nfriedly commented 7 years ago

I've figured out a few things:

  1. Discovery only looks at file extension and file content; it ignores content-type, and it completely ignores files that don't have a filename.

  2. The SDK has code to ensure there's always a filename set, but it sets it to _ (no extension)

  3. When Discovery sees JSON content without a .json extension on the filename, it assumes it's plain text.

  4. There's a bug in the SDK that causes it to choke if you set your own filename in a wrapper object around the file :/

So, I'm going to fix 4 right away, and then look into how to work around 3.

GwilymNewton commented 7 years ago

Okay, could a temporary work around for 3. be to set a filename like "_.json"?

nfriedly commented 7 years ago

Not yet, but I'm trying to get a release out today that fixes 4, and then something like this should work:

var file = JSON.stringify(msg.payload);

var document_obj = {
  environment_id: environment,
  collection_id: collection,
  file:  {
    value: file,
    options: {
      filename: "_.json"
    }
  }
};

discovery.addDocument(document_obj, function (err, response) {
  if (err) {
    console.error(err);
  } else {
    console.log(JSON.stringify(response, null, 2));
  }
});

But, again, it won't work until I get the fix out.

nfriedly commented 7 years ago

Ok, I just released v3.34.0, which fixes the bug so that the above example should work, and also adds a new addJsonDocument method to make uploading in-memory JSON docs a little more straightforward:

var document_obj = {
  environment_id: environment,
  collection_id: collection,
  file: msg.payload
};

discovery.addJsonDocument(document_obj, function (err, response) {
  if (err) {
    console.error(err);
  } else {
    console.log(JSON.stringify(response, null, 2));
  }
});

I also added about 8 new tests and enabled about a dozen others that had been written before the service was actually released :/

So, this should make things a little easier for your use-case and generally more reliable.

@GwilymNewton @buteomont Please test the new release and let me know if it's working for you now.

GwilymNewton commented 7 years ago

{
  "document_id": "57895395-e445-45c5-a19e-f8f3aaxxxxx",
  "status": "processing"
}

@nfriedly its working, thanks very much 👍 👍

buteomont commented 7 years ago

Works a treat for me too, @nfriedly . Thanks! (btw, I believe you meant version 2.34.0, not 3.34.0)

evenfrost commented 7 years ago

Thanks a lot. I've been struggling with transforming a JSON string into a Stream for a while; somehow it refused to work. It would be super-great if this could be added to the documentation as well. 👍

evenfrost commented 7 years ago

@nfriedly When using the addJsonDocument method, everything seems to upload fine (no error; an object with document_id and processing status is returned), but when I check Discovery, it says that the document upload failed and gives me the following error:

_.json
Occurred during: ingestion
9/2/2017 2:34:21 pm EDT
Your request could not be processed because of a problem on the server.

Though when I upload a document with same fields and name (_.json) to Discovery directly (via file uploader), it proceeds correctly. Do you know what's wrong with my request?

evenfrost commented 7 years ago

Sorry for bothering, all good now, I was performing JSON.stringify on the file passed, which is not needed.

nfriedly commented 7 years ago

Oh, yea.. it wouldn't be a bad idea to make the SDK throw a more obvious error when passing a string to the addJsonDocument method. (Or, really, anything for which typeof params.file !== "object".)

Would you like to send a PR for that?