apache / arrow

Apache Arrow is a multi-language toolbox for accelerated data interchange and in-memory processing
https://arrow.apache.org/
Apache License 2.0

[JS] tableFromJSON cannot handle nested objects containing strings #33394

Open asfimport opened 1 year ago

asfimport commented 1 year ago

$ node

const g = require('apache-arrow')

g.tableFromJSON([{ a: [{ b: "hi" }] }])

 

The dictionary types:

 

TYPE  Dictionary {indices: Int32, dictionary: Utf8, isOrdered: false, id: 12}  (logged from typecomparator.ts:191)

OTHER Dictionary {indices: Int32, dictionary: Utf8, isOrdered: false, id: 14}

 

This happens here:

    else if (arraysCount + nullsCount === value.length) {
        const array = value;
        const childType = inferType(array[array.findIndex((ary) => ary != null)]);
        if (array.every((ary) => ary == null || (0, typecomparator_js_1.compareTypes)(childType, inferType(ary)))) {
            return new dtypes.List(new schema_js_1.Field('', childType, true));
        }
    }

 

So each time we call inferType(ary) we instantiate a new Dictionary type with a fresh id, and the compareTypes check can never succeed.
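For illustration, here is a minimal sketch (not part of the original report) of that id behaviour using the public apache-arrow exports; it assumes, as the dumps above suggest, that the Dictionary constructor auto-assigns a fresh id when none is passed:

const { Dictionary, Utf8, Int32 } = require('apache-arrow');

// Two dictionaries built the same way still get different auto-generated ids,
// which is why the compareTypes check above can never return true.
const a = new Dictionary(new Utf8(), new Int32()); // e.g. id: 12
const b = new Dictionary(new Utf8(), new Int32()); // e.g. id: 14
console.log(a.id === b.id); // false, even though every other field matches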

Reporter: Samuel Schneck
Assignee: Samuel Schneck

PRs and other links:

Note: This issue was originally created as ARROW-18208. Please see the migration documentation for further details.

asfimport commented 1 year ago

Samuel Schneck: Just to document my insanity here....


"use strict";
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
Object.defineProperty(exports, "__esModule", { value: true });
exports.builderThroughAsyncIterable = exports.builderThroughIterable = exports.tableFromJSON = exports.vectorFromArray = exports.makeBuilder = void 0;
const util = require("util")
const tslib_1 = require("tslib");
const schema_js_1 = require("./schema.js");
const dtypes = tslib_1.__importStar(require("./type.js"));
const data_js_1 = require("./data.js");
const vector_js_1 = require("./vector.js");
const builderctor_js_1 = require("./visitor/builderctor.js");
const table_js_1 = require("./table.js");
const recordbatch_js_1 = require("./recordbatch.js");
const typecomparator_js_1 = require("./visitor/typecomparator.js");
function makeBuilder(options) {
    const type = options.type;
    const builder = new (builderctor_js_1.instance.getVisitFn(type)())(options);
    if (type.children && type.children.length > 0) {
        const children = options['children'] || [];
        const defaultOptions = { 'nullValues': options['nullValues'] };
        const getChildOptions = Array.isArray(children)
            ? ((_, i) => children[i] || defaultOptions)
            : (({ name }) => children[name] || defaultOptions);
        for (const [index, field] of type.children.entries()) {
            const { type } = field;
            const opts = getChildOptions(field, index);
            builder.children.push(makeBuilder(Object.assign(Object.assign({}, opts), { type })));
        }
    }
    return builder;
}
exports.makeBuilder = makeBuilder;
function vectorFromArray(init, type) {
    if (init instanceof data_js_1.Data || init instanceof vector_js_1.Vector || init.type instanceof dtypes.DataType || ArrayBuffer.isView(init)) {
        return (0, vector_js_1.makeVector)(init);
    }
    const options = { type: type !== null && type !== void 0 ? type : inferType(init), nullValues: [null] };
    const chunks = [...builderThroughIterable(options)(init)];
    const vector = chunks.length === 1 ? chunks[0] : chunks.reduce((a, b) => a.concat(b));
    if (dtypes.DataType.isDictionary(vector.type)) {
        return vector.memoize();
    }
    return vector;
}
exports.vectorFromArray = vectorFromArray;
/**
 * Creates a {@link Table} from an array of objects.
 *
 * @param array A table of objects.
 */
function tableFromJSON(array) {
    console.log("HONK", JSON.stringify(array.slice(0, 1)))
    const vector = vectorFromArray(array);
    const batch = new recordbatch_js_1.RecordBatch(new schema_js_1.Schema(vector.type.children), vector.data[0]);
    return new table_js_1.Table(batch);
}
exports.tableFromJSON = tableFromJSON;
function inferType(value, path, cache) {
    if (!path) {
        path = [];
    }
    if (!cache) {
        cache = new Map();
    }
    console.log("INFERTRACE", value)
    if (value.length === 0) {
        return new dtypes.Null;
    }
    let nullsCount = 0;
    let arraysCount = 0;
    let objectsCount = 0;
    let numbersCount = 0;
    let stringsCount = 0;
    let bigintsCount = 0;
    let booleansCount = 0;
    let datesCount = 0;
    for (const val of value) {
        if (val == null) {
            ++nullsCount;
            continue;
        }
        switch (typeof val) {
            case 'bigint':
                ++bigintsCount;
                continue;
            case 'boolean':
                ++booleansCount;
                continue;
            case 'number':
                ++numbersCount;
                continue;
            case 'string':
                ++stringsCount;
                continue;
            case 'object':
                if (Array.isArray(val)) {
                    ++arraysCount;
                }
                else if (Object.prototype.toString.call(val) === '[object Date]') {
                    ++datesCount;
                }
                else {
                    ++objectsCount;
                }
                continue;
        }
        console.log(val)
        console.log("OH FUCK", typeof val)
        throw new TypeError('Unable to infer Vector type from input values, explicit type declaration expected');
    }
    console.log("WE COUNTED")
    console.log("gonna hit the array case", arraysCount + nullsCount === value.length)
    console.log(value.length)
    console.log("nullsCount", nullsCount)
    console.log("arraysCount", arraysCount)
    console.log("objectsCount", objectsCount)
    console.log("numbersCount", numbersCount)
    console.log("stringsCount", stringsCount)
    console.log("bigintsCount", bigintsCount)
    console.log("booleansCount", booleansCount)
    console.log("datesCount", datesCount)
    if (numbersCount + nullsCount === value.length) {
        return new dtypes.Float64;
    }
    else if (stringsCount + nullsCount === value.length) {
        console.log('STRINGS CASE', value)
        if (cache.has(path)) {
          console.log("CACHED at path", path)
          return cache.get(path)
        }
        const d =  new dtypes.Dictionary(new dtypes.Utf8, new dtypes.Int32);
        cache.set(path, d)
        return d
    }
    else if (bigintsCount + nullsCount === value.length) {
        return new dtypes.Int64;
    }
    else if (booleansCount + nullsCount === value.length) {
        return new dtypes.Bool;
    }
    else if (datesCount + nullsCount === value.length) {
        return new dtypes.DateMillisecond;
    }
    else if ((() => {const res = arraysCount + nullsCount === value.length; console.log("EVALUATING ARRAY", res, value); return res})()) {
        console.log("arrayVal", value)
        const array = value;
        const childType = inferType(array[array.findIndex((ary) => ary != null)], path, cache);
        if (array.every((
          ary
        ) => ary == null || (() => {
            const c = typecomparator_js_1.compareTypes(childType, (
          () => {
            const res = (inferType(ary, path, cache)); console.log("COMPARING ARRAY TYPE",util.inspect(res));return res
          })())
          console.log("THE RESULT OF THE COMPARISON", c)
          return c
        })())) {
            console.log("THE TYPE WE COMPARE:",util.inspect( childType))
            return new dtypes.List(new schema_js_1.Field('', childType, true));
        }
        console.log("THE TYPE WE COMPARE WHEN IT ALL GOES WRONG:",util.inspect( childType))
      console.log("bailing out on", value)
    }
    else if (objectsCount + nullsCount === value.length) {
        const fields = new Map();
        for (const row of value) {
            for (const key of Object.keys(row)) {
                if (!fields.has(key) && row[key] != null) {
                    // use the type inferred for the first instance of a found key
                    console.log(path)
                    path.push(key)
                    fields.set(key, new schema_js_1.Field(key, inferType([row[key]], path, cache), true));
                    path.pop(key)
                }
            }
        }
        return new dtypes.Struct([...fields.values()]);
    }
    console.log("WE CRASHED on value", value)
    console.log("gonna hit the array case", arraysCount + nullsCount === value.length)
    console.log(value.length)
    console.log("nullsCount", nullsCount)
    console.log("arraysCount", arraysCount)
    console.log("objectsCount", objectsCount)
    console.log("numbersCount", numbersCount)
    console.log("stringsCount", stringsCount)
    console.log("bigintsCount", bigintsCount)
    console.log("booleansCount", booleansCount)
    console.log("datesCount", datesCount)    throw new TypeError('Unable to infer Vector type from input values, explicit type declaration expected');}
/**
 * Transform a synchronous `Iterable` of arbitrary JavaScript values into a
 * sequence of Arrow Vector<T> following the chunking semantics defined in
 * the supplied `options` argument.
 *
 * This function returns a function that accepts an `Iterable` of values to
 * transform. When called, this function returns an Iterator of `Vector<T>`.
 *
 * The resulting `Iterator<Vector<T>>` yields Vectors based on the
 * `queueingStrategy` and `highWaterMark` specified in the `options` argument.
 *
 * * If `queueingStrategy` is `"count"` (or omitted), The `Iterator<Vector<T>>`
 *   will flush the underlying `Builder` (and yield a new `Vector<T>`) once the
 *   Builder's `length` reaches or exceeds the supplied `highWaterMark`.
 * * If `queueingStrategy` is `"bytes"`, the `Iterator<Vector<T>>` will flush
 *   the underlying `Builder` (and yield a new `Vector<T>`) once its `byteLength`
 *   reaches or exceeds the supplied `highWaterMark`.
 *
 * @param {IterableBuilderOptions<T, TNull>} options An object of properties which determine the `Builder` to create and the chunking semantics to use.
 * @returns A function which accepts a JavaScript `Iterable` of values to
 *          write, and returns an `Iterator` that yields Vectors according
 *          to the chunking semantics defined in the `options` argument.
 * @nocollapse
 */
function builderThroughIterable(options) {
    const { ['queueingStrategy']: queueingStrategy = 'count' } = options;
    const { ['highWaterMark']: highWaterMark = queueingStrategy !== 'bytes' ? Number.POSITIVE_INFINITY : Math.pow(2, 14) } = options;
    const sizeProperty = queueingStrategy !== 'bytes' ? 'length' : 'byteLength';
    return function* (source) {
        let numChunks = 0;
        const builder = makeBuilder(options);
        for (const value of source) {
            if (builder.append(value)[sizeProperty] >= highWaterMark) {
                ++numChunks && (yield builder.toVector());
            }
        }
        if (builder.finish().length > 0 || numChunks === 0) {
            yield builder.toVector();
        }
    };
}
exports.builderThroughIterable = builderThroughIterable;
/**
 * Transform an `AsyncIterable` of arbitrary JavaScript values into a
 * sequence of Arrow Vector<T> following the chunking semantics defined in
 * the supplied `options` argument.
 *
 * This function returns a function that accepts an `AsyncIterable` of values to
 * transform. When called, this function returns an AsyncIterator of `Vector<T>`.
 *
 * The resulting `AsyncIterator<Vector<T>>` yields Vectors based on the
 * `queueingStrategy` and `highWaterMark` specified in the `options` argument.
 *
 * * If `queueingStrategy` is `"count"` (or omitted), The `AsyncIterator<Vector<T>>`
 *   will flush the underlying `Builder` (and yield a new `Vector<T>`) once the
 *   Builder's `length` reaches or exceeds the supplied `highWaterMark`.
 * * If `queueingStrategy` is `"bytes"`, the `AsyncIterator<Vector<T>>` will flush
 *   the underlying `Builder` (and yield a new `Vector<T>`) once its `byteLength`
 *   reaches or exceeds the supplied `highWaterMark`.
 *
 * @param {IterableBuilderOptions<T, TNull>} options An object of properties which determine the `Builder` to create and the chunking semantics to use.
 * @returns A function which accepts a JavaScript `AsyncIterable` of values
 *          to write, and returns an `AsyncIterator` that yields Vectors
 *          according to the chunking semantics defined in the `options`
 *          argument.
 * @nocollapse
 */
function builderThroughAsyncIterable(options) {
    const { ['queueingStrategy']: queueingStrategy = 'count' } = options;
    const { ['highWaterMark']: highWaterMark = queueingStrategy !== 'bytes' ? Number.POSITIVE_INFINITY : Math.pow(2, 14) } = options;
    const sizeProperty = queueingStrategy !== 'bytes' ? 'length' : 'byteLength';
    return function (source) {
        return tslib_1.__asyncGenerator(this, arguments, function* () {
            var e_1, _a;
            let numChunks = 0;
            const builder = makeBuilder(options);
            try {
                for (var source_1 = tslib_1.__asyncValues(source), source_1_1; source_1_1 = yield tslib_1.__await(source_1.next()), !source_1_1.done;) {
                    const value = source_1_1.value;
                    if (builder.append(value)[sizeProperty] >= highWaterMark) {
                        ++numChunks && (yield yield tslib_1.__await(builder.toVector()));
                    }
                }
            }
            catch (e_1_1) { e_1 = { error: e_1_1 }; }
            finally {
                try {
                    if (source_1_1 && !source_1_1.done && (_a = source_1.return)) yield tslib_1.__await(_a.call(source_1));
                }
                finally { if (e_1) throw e_1.error; }
            }
            if (builder.finish().length > 0 || numChunks === 0) {
                yield yield tslib_1.__await(builder.toVector());
            }
        });
    };
}
exports.builderThroughAsyncIterable = builderThroughAsyncIterable;
//# sourceMappingURL=factories.js.map

The fix is in factories.js.

srschandan commented 11 months ago

I wonder if the fix could be as simple as ignoring type.id when comparing two dictionaries: https://github.com/apache/arrow/blob/87a1852d3578ecdf23e776e65bf30f0ee1f2362f/js/src/visitor/typecomparator.ts#L197. If two dictionaries are the same in every other way (type.id is never the same), aren't they technically the same column, or sub-column, type? In my case, once I comment that check out, I can convert nested JSON objects as complex as the one below to Arrow:

describe('tableFromNestedJSON', () => {
    test('created a table from a nested json', () => {
        const table = tableFromJSON([
            {
                a: 42,
                c: 'foo',
                d: [
                    {
                        da: 'abc',
                        dc: null,
                        dd: [1, 23, 3],
                        de: [
                            {
                                dda: 'a',
                            }
                        ],
                    },
                    {
                        da: null,
                        dc: 45,
                        dd: [51, 3],
                        de: [
                            {
                                dda: 'b',
                            }
                        ],
                    },
                ],
            },
            {
                a: 12,
                c: 'bar',
                d: [
                    {
                        da: 'def',
                        dc: null,
                        dd: [],
                        de: [
                            {
                                dda: 'e',
                            }
                        ],
                    },
                    {
                        da: 'xyz',
                        dc: 76,
                        dd: [1, 2, 3],
                        de: [
                            {
                                dda: null,
                            }
                        ],
                    },
                ],
            },
        ]);
        expect(table.numRows).toBe(2);
        expect(table.numCols).toBe(3);
        expect(table.getChild('a')!.type).toBeInstanceOf(Float64);
        expect(table.getChild('c')!.type).toBeInstanceOf(Dictionary);
        expect(table.getChild('d')!.type).toBeInstanceOf(List);
        const list = table.getChild('d')!;
        expect(list.getChildAt(0)!.type).toBeInstanceOf(Struct);
        const struct = list.getChildAt(0)!;
        expect(struct.getChild('da')!.type).toBeInstanceOf(Dictionary);
        expect(struct.getChild('dd')!.type).toBeInstanceOf(List);
        expect(struct.getChild('dc')!.type).toBeInstanceOf(Float64);
    });
});

However, I'm not sure whether this still causes the failure [MannySchneck](https://github.com/MannySchneck) mentioned in https://github.com/apache/arrow/pull/14554#issuecomment-1298946553, since I couldn't reproduce it myself.
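For reference, the dictionary comparison being discussed is roughly of the following shape (a sketch based only on the fields shown in the dumps above, not the verbatim typecomparator.ts code). Dropping the id check is the change proposed here:

// Rough sketch of how two Dictionary types are compared (not verbatim).
// Removing the `x.id === y.id` term would make two dictionaries compare equal
// whenever their index type, value type, and ordering match.
function dictionariesEqual(x, y, compareTypes) {
    return x === y || (
        x.id === y.id &&                      // never true for freshly inferred types
        x.isOrdered === y.isOrdered &&
        compareTypes(x.indices, y.indices) &&
        compareTypes(x.dictionary, y.dictionary)
    );
}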

bmschmidt commented 7 months ago

The example at the top here is

arrow.tableFromJSON([{ a: [{ b: "hi" }] }])

but just noting there's an even simpler failure case:

arrow.tableFromJSON([{ a: ["hi"] }])

also throws this error.

I might dig in a little more if I have some time soon.

matheuzinoficial commented 3 months ago

Any updates on this? I'm having the same problem and I'm stuck on it!

hemkumarrao-ni commented 4 weeks ago

I am facing the same issue too. Any updates on this? Or can you suggest a workaround?
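For anyone who needs a stopgap: based on the factories.js quoted above, one untested workaround is to skip inference entirely and pass an explicit type to vectorFromArray, then wrap the result the same way tableFromJSON does internally. The nested type below is an assumption matching the original repro, not a tested recipe:

const {
    vectorFromArray, Table, RecordBatch, Schema,
    Struct, List, Field, Dictionary, Utf8, Int32,
} = require('apache-arrow');

// Explicit type for rows shaped like { a: [{ b: "hi" }] } -- adjust to your data.
const rowType = new Struct([
    new Field('a', new List(new Field('', new Struct([
        new Field('b', new Dictionary(new Utf8(), new Int32()), true),
    ]), true)), true),
]);

// Build the struct vector with the explicit type, then assemble a Table
// the same way tableFromJSON does (see the quoted code above).
const vector = vectorFromArray([{ a: [{ b: 'hi' }] }], rowType);
const table = new Table(new RecordBatch(new Schema(vector.type.children), vector.data[0]));
console.log(table.numRows); // 1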