tensorflow / tfjs

A WebGL accelerated JavaScript library for training and deploying ML models.
https://js.tensorflow.org
Apache License 2.0
18.43k stars 1.92k forks source link

tfjs-node-gpu: "Unknown dtype: undefined", works with CPU-only tfjs-node #1914

Closed botic closed 5 years ago

botic commented 5 years ago

TensorFlow.js version

1.2.7

Describe the problem or feature request

fitDataset() works fine with CPU-only training, but fails with an error if I switch to the GPU-based import. I am working on a deep-learning-optimized VM with CUDA 10 provided by GCP. As far as I can tell, something goes wrong during layer creation, where the dtype is suddenly undefined in tfjs-node-gpu.

Code to reproduce the bug / link to feature request

JS Code:

You can find the full code in my GitHub repo. Here is the core part:

const fs = require("fs");
const tf = require(`@tensorflow/tfjs-node${process.env.TFJS_GPU === "supported" ? "-gpu" : ""}`);

const { convertCsvRecord } = require("./features");

module.exports = async function(datasetURL, outputURL) {
    const trainingSet = tf.data.csv(datasetURL.href).map(obj => convertCsvRecord(obj));

    const model = tf.sequential();
    model.add(tf.layers.dense({units: 50, activation: "relu", inputShape: [48]}));
    model.add(tf.layers.dense({units: 100, activation: "relu"}));
    model.add(tf.layers.dense({units: 4, activation: "softmax"}));

    model.compile({
        optimizer: tf.train.rmsprop(0.001),
        loss: tf.losses.meanSquaredError,
        metrics: ["accuracy"]
    });

    await model.fitDataset(trainingSet, {
        epochs: 50,
        batchSize: 512,
        shuffle: true
    });
};

Output:

2019-08-25 20:26:40.878677: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1326] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 14926 MB memory) -> physical GPU (device: 2, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:06.0, compute capability: 7.0)
2019-08-25 20:26:40.878724: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-08-25 20:26:40.879483: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1326] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 14926 MB memory) -> physical GPU (device: 3, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:07.0, compute capability: 7.0)
tensorflow backend was already registered. Reusing existing backend factory.
Epoch 1 / 50
(node:9073) UnhandledPromiseRejectionWarning: Error: Unknown dtype: undefined
    at getTFDType (/usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/dist/ops/op_utils.js:53:19)
    at Object.createTypeOpAttr (/usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/dist/ops/op_utils.js:65:16)
    at NodeJSKernelBackend.batchMatMul (/usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/dist/nodejs_kernel_backend.js:340:24)
    at NodeJSKernelBackend.fusedBatchMatMul (/usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/dist/nodejs_kernel_backend.js:370:27)
    at /usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/node_modules/@tensorflow/tfjs-core/dist/ops/fused_ops.js:150:25
    at /usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/node_modules/@tensorflow/tfjs-core/dist/engine.js:405:26
    at Engine.scopedRun (/usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/node_modules/@tensorflow/tfjs-core/dist/engine.js:359:23)
    at Engine.runKernel (/usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/node_modules/@tensorflow/tfjs-core/dist/engine.js:403:14)
    at matMul_ (/usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/node_modules/@tensorflow/tfjs-core/dist/ops/fused_ops.js:149:31)
    at Object.matMul (/usr/local/node-apps/smai/prediction/bike-tfjs/node_modules/@tensorflow/tfjs-node-gpu/node_modules/@tensorflow/tfjs-core/dist/ops/operation.js:46:29)
kangyizhang commented 5 years ago

@botic can you try with tfjs-node-gpu@1.2.8? I can reproduce the issue in 1.2.7, and it should be fixed in 1.2.8.

botic commented 5 years ago

Thanks, it's working with 1.2.8 now.