tensorflow / tfjs

A WebGL accelerated JavaScript library for training and deploying ML models.
https://js.tensorflow.org

wasm backend produces incorrect results for int32 tensors #5641

Closed vladmandic closed 3 years ago

vladmandic commented 3 years ago

I was wondering why my app produces lower-precision results when using the wasm backend, and I finally narrowed it down to wasm handling of int32 tensors.

Environment: tfjs 3.9.0 on Ubuntu 21.04. Note: I've tried wasm both with and without SIMD support.

Below is a very simple reproduction
(and I suggest adding something like this to the automated tests in the future).

Note that tf.sum() is just the easiest way to reproduce; the issue occurs across the board in tfjs.

// assumes tf with a backend already initialized and a browser ImageData object in scope
const data = Array.from(imageData.data); // in my case data is an ImageData array, so each value is 0..255, but it can be any dataset
data.length = 1024 * 1024 * 4; // crop the array to a specific size, just for the test
let sumJS = 0;
for (let i = 0; i < data.length; i++) sumJS += data[i];
const tI32 = tf.tensor(data, [data.length], 'int32');
const tF32 = tf.tensor(data, [data.length], 'float32');
console.log({
  backend: tf.getBackend(),
  arrayLength: data.length,
  jsSum: sumJS,
  tfSumI32: tf.sum(tI32).dataSync()[0],
  tfSumF32: tf.sum(tF32).dataSync()[0],
});

This is the expected output when using tfjs-node, as JS and TF produce the same sum:

{
  backend: 'tensorflow',
  arrayLength: 4194304,
  jsSum: 1000114465,
  tfSumI32: 1000114465,
  tfSumF32: 1000114432 // sum of float is slightly off, but close
}

But when using the wasm backend, it's completely broken:

{
  backend: 'wasm',
  arrayLength: 4194304,
  jsSum: 1000114465,
  tfSumI32: 66326844, // completely broken
  tfSumF32: 1023692544 // slightly off as well, but close
}

And this is not a case of int32 overflow, as it's reproducible with much lower values as well (it seems the error occurs with tensors of more than 64k values):

{
  backend: 'wasm',
  arrayLength: 65536,
  jsSum: 16042946,
  tfSumI32: 16042946, // this is correct
  tfSumF32: 16042946
}
{
  backend: 'wasm',
  arrayLength: 131072,
  jsSum: 31840717,
  tfSumI32: 24302074, // this is incorrect!!!
  tfSumF32: 31826932
}

And those sums are nowhere near the int32 limits.

And yes, I've checked that the tensors get created correctly in both cases, by downloading all values with dataSync() and comparing them to the original array.
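
For reference, a minimal sketch of that check, assuming tI32, tF32, and data from the snippet above are still in scope (sameAsSource is just an illustrative helper, not part of the repro):

// compare every value the tensor holds against the source array
const sameAsSource = (tensor, source) => {
  const values = tensor.dataSync();
  if (values.length !== source.length) return false;
  for (let i = 0; i < values.length; i++) {
    if (values[i] !== source[i]) return false;
  }
  return true;
};
console.log({ i32ok: sameAsSource(tI32, data), f32ok: sameAsSource(tF32, data) });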

vladmandic commented 3 years ago

One more trivial reproduction that shows exactly when the errors start to occur:

const fs = require('fs');
const tf = require('@tensorflow/tfjs');
const wasm = require('@tensorflow/tfjs-backend-wasm');

async function main() {
  wasm.setWasmPaths('node_modules/@tensorflow/tfjs-backend-wasm/dist/');
  await tf.setBackend('wasm');
  await tf.ready();
  console.log('tfjs:', { version: tf.version_core, backend: tf.getBackend() });
  const t = {};
  const data = fs.readFileSync('dist/tfjs.esm.js.map');
  for (let i = 0; i <= 22; i++) {
    const arr = Array.from(data);
    const size = 2 ** i;
    arr.length = size;
    t.i32 = tf.tensor(arr, [size], 'int32');
    t.f32 = tf.tensor(arr, [size], 'float32');
    t.sumI = tf.sum(t.i32);
    t.sumF = tf.sum(t.f32);
    const JS = arr.reduce((prev, curr) => prev + curr, 0);
    const I32 = t.sumI.dataSync()[0];
    const F32 = t.sumF.dataSync()[0];
    console.log({ size, JS, I32, F32, ok: JS === I32 });
    Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
  }
}

main();

output:

tfjs: { version: '3.9.0', backend: 'wasm' }
{ size: 1, JS: 123, I32: 123, F32: 123, ok: true }
{ size: 2, JS: 133, I32: 133, F32: 133, ok: true }
{ size: 4, JS: 197, I32: 197, F32: 197, ok: true }
{ size: 8, JS: 564, I32: 564, F32: 564, ok: true }
{ size: 16, JS: 1180, I32: 1180, F32: 1180, ok: true }
{ size: 32, JS: 2319, I32: 2319, F32: 2319, ok: true }
{ size: 64, JS: 5041, I32: 5041, F32: 5041, ok: true }
{ size: 128, JS: 10828, I32: 10828, F32: 10828, ok: true }
{ size: 256, JS: 22156, I32: 22156, F32: 22156, ok: true }
{ size: 512, JS: 45456, I32: 45456, F32: 45456, ok: true }
{ size: 1024, JS: 91536, I32: 91536, F32: 91536, ok: true }
{ size: 2048, JS: 184851, I32: 184851, F32: 184851, ok: true }
{ size: 4096, JS: 371489, I32: 371489, F32: 371489, ok: true }
{ size: 8192, JS: 742567, I32: 742567, F32: 742567, ok: true }
{ size: 16384, JS: 1486349, I32: 1486349, F32: 1486349, ok: true }
{ size: 32768, JS: 2999662, I32: 2999662, F32: 2999662, ok: true }
{ size: 65536, JS: 6039441, I32: 6039441, F32: 6039441, ok: true }
{ size: 131072, JS: 12112329, I32: 12112329, F32: 12112329, ok: true }
{ size: 262144, JS: 23178955, I32: 19976980, F32: 23176744, ok: false }
{ size: 524288, JS: 46226017, I32: 28329405, F32: 46208756, ok: false }
{ size: 1048576, JS: 91071178, I32: 36552479, F32: 91093240, ok: false }
{ size: 2097152, JS: 180992048, I32: 44872176, F32: 181083904, ok: false }
{ size: 4194304, JS: 358417967, I32: 53091244, F32: 356742528, ok: false }
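
In this run the int32 sum starts to diverge somewhere between 131072 and 262144 elements. As a rough way to pinpoint the exact element count, here's a minimal bisection sketch under the same setup as the script above (same tf, wasm backend, and data buffer; findFirstBad is a hypothetical helper and assumes that once a length fails, all larger lengths fail too):

// bisect between a known-good length (lo) and a known-bad length (hi)
// to find the smallest length whose wasm int32 sum diverges from the JS sum
function findFirstBad(data, lo, hi) {
  while (lo + 1 < hi) {
    const mid = Math.floor((lo + hi) / 2);
    const arr = Array.from(data).slice(0, mid);
    const jsSum = arr.reduce((prev, curr) => prev + curr, 0);
    const t = tf.tensor(arr, [mid], 'int32');
    const s = tf.sum(t);
    const tfSum = s.dataSync()[0];
    tf.dispose([t, s]);
    if (tfSum === jsSum) lo = mid; // still correct at this length
    else hi = mid; // already broken at this length
  }
  return hi; // smallest failing length
}

console.log('first failing length:', findFirstBad(data, 131072, 262144));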

vladmandic commented 3 years ago

@jinjingforever there's a leftover line in the fix:

console.log('reduceshape', reduceShape);