minio / simdjson-go

Golang port of simdjson: parsing gigabytes of JSON per second
Apache License 2.0
1.8k stars 85 forks source link

object.ForEach() - object: unexpected name tag } #65

Closed flowchartsman closed 1 year ago

flowchartsman commented 2 years ago

There appears to be a bug in objectForEach where the iterator over-reads the tape looking for a name or doesn't check for the object ending, maybe. Given the following example, which is an attempt to do array-transparent field iteration:

package main

import (
    "fmt"
    "log"

    "github.com/minio/simdjson-go"
)

func main() {
    // Parse JSON:
    pj, err := simdjson.Parse([]byte(`{
    "items": [
        {
            "name": "jim",
            "scores": [
                {
                    "game":"golf",
                    "scores": ["one","two"]
                },
                {
                    "game":"badminton",
                    "scores":["zero",1,"six"]
                },
                [
                    {
                      "game": "nested for some reason?",
                      "scores": ["five"]
                    }
                ]
            ]
        }
    ]
}`), nil)
    if err != nil {
        log.Fatal(err)
    }

    // Iterate each top level element.
    traverseKeys := map[string]struct{}{"items": {}, "poop": {}}
    var parse func(i simdjson.Iter) error
    parse = func(i simdjson.Iter) error {
        switch i.Type() {
        case simdjson.TypeArray:
            array, err := i.Array(nil)
            if err != nil {
                return err
            }
            array.ForEach(func(i simdjson.Iter) {
                parse(i)
            })
        case simdjson.TypeObject:
            obj, err := i.Object(nil)
            if err != nil {
                return err
            }
            scores := obj.FindKey("scores", nil)
            if scores == nil || scores.Type != simdjson.TypeArray {
                return obj.ForEach(func(_ []byte, i simdjson.Iter) {
                    parse(i)
                }, traverseKeys)
            }
            array, err := scores.Iter.Array(nil)
            if err != nil {
                return err
            }
            array.ForEach(func(i simdjson.Iter) {
                switch i.Type() {
                case simdjson.TypeString, simdjson.TypeInt:
                    s, _ := i.StringCvt()
                    fmt.Println("Found score:", s)
                case simdjson.TypeObject, simdjson.TypeArray:
                    parse(i)
                }
            })
        default:
            fmt.Println("Ignoring", i.Type())
        }

        return nil
    }
    if err := pj.ForEach(parse); err != nil {
        log.Fatal(err)
    }

    // Output:
    //Found score: one
    //Found score: two
    //Found score: zero
    //Found score: 1
    //Found score: six
    //Found score: five
    // 2022/03/18 00:44:08 object: unexpected name tag }
}

The problem appears to be here in parsed_object.go wherein the advance fails if not finding a string, but not making an allowance for the end of the object, while the actual check for the end of the object occurs later, here. If I'm reading this right, it would throw an error on the (valid) JSON of an empty object: {}.

Maybe something like this?

func (o *Object) ForEach(fn func(key []byte, i Iter), onlyKeys map[string]struct{}) error {
    tmp := o.tape.Iter()
    tmp.off = o.off
    n := 0
    for {
        typ := tmp.Advance()

        // Check for end of object
        if typ == TypeNone {
            if tmp.off == len(tmp.tape.Tape) {
                return nil
            }
            return fmt.Errorf("object: unexpected tag: %v", tmp.t)
        }

        // We want name and at least one value.
        if typ != TypeString {
            return fmt.Errorf("object: expecting object name string, found: %v", tmp.t)
        }
        // if the next item is the end of the tape or beyond it, then something
        // is wrong
        if tmp.off+1 >= len(tmp.tape.Tape) {
            return fmt.Errorf("unexpected end of object tape, expecting value")
        }
        // Get the name(object key)
        offset := tmp.cur
        length := tmp.tape.Tape[tmp.off]
        name, err := tmp.tape.stringByteAt(offset, length)
        if err != nil {
            return fmt.Errorf("getting object name: %w", err)
        }

        // if the user has specified that only certain keys be iterated, make
        // sure this one is among them
        if len(onlyKeys) > 0 {
            if _, ok := onlyKeys[string(name)]; !ok {
                // otherwise skip the value
                tmp.Advance()
                continue
            }
        }

        t := tmp.Advance()
        if t == TypeNone {
            return nil
        }
        fn(name, tmp)
        n++
        if n == len(onlyKeys) {
            return nil
        }
    }
}
klauspost commented 1 year ago

Fixed in #72 (now merged)

https://github.com/minio/simdjson-go/pull/72/files#diff-c84156d5513c93b664627a06a60f8616e138352819d963a248328163673bd5d0R168

sdressler commented 1 year ago

I see the very same error message again with v0.4.2 and parsing {}. Is my code wrong?

package main

import (
    "fmt"

    "github.com/minio/simdjson-go"
)

var raw = []byte(`{}`)

func main() {
    json, err := simdjson.Parse(raw, nil)
    if err != nil {
        panic(err)
    }

    err = json.ForEach(func(it simdjson.Iter) error {
        obj, _ := it.Object(nil)
        return obj.ForEach(func(k []byte, _ simdjson.Iter) {
            fmt.Println(string(k))
        }, nil)
    })
    if err != nil {
        panic(err)
    }
}

Output:

panic: object: unexpected name tag }

goroutine 1 [running]:
main.main()
    /tmp/sandbox90905901/prog.go:24 +0x68

Program exited.
klauspost commented 1 year ago

@sdressler

It works on master. I will see if we can make a release.

    // Parse JSON:
    pj, err := Parse([]byte(`{}`), nil)
    if err != nil {
        log.Fatal(err)
    }

    // Create an element we can reuse.
    err = pj.ForEach(func(i Iter) error {
        fmt.Println("Got iterator for type:", i.Type())
        obj, err := i.Object(nil)
        if err == nil {
            return obj.ForEach(func(k []byte, _ Iter) {
                fmt.Println(string(k))
            }, nil)
        }
        return nil
    })

Outputs Got iterator for type: object.

Released v0.4.3

sdressler commented 1 year ago

Thanks! I failed to connect the dots between master & 0.4.2 release.