dgraph-io / badger

Fast key-value DB in Go.
https://dgraph.io/badger
Apache License 2.0
13.94k stars 1.18k forks source link

[BUG]: db.Backup maybe see the `partial write` in one transaction #2049

Open lsytj0413 opened 8 months ago

lsytj0413 commented 8 months ago

What version of Badger are you using?

v4.2.0

What version of Go are you using?

go version go1.21.0 darwin/amd64

Have you tried reproducing the issue with the latest release?

None

What is the hardware spec (RAM, CPU, OS)?

What steps will reproduce the bug?

The code below will panic:

// backupWriter is an in-memory sink for db.Backup: every chunk written to it
// that decodes as a pb.KVList has its key/value pairs recorded in kvs.
type backupWriter struct {
    // kvs maps each decoded key to its value; a later entry for the same key
    // overwrites the earlier one.
    kvs map[string]string
    // v is the uint64 returned by db.Backup for this writer; it is assigned by
    // testBadgerBackup after the backup completes.
    v   uint64
}

// Compile-time assertion that *backupWriter satisfies io.Writer.
var _ io.Writer = (*backupWriter)(nil)

// Write implements io.Writer. It tries to decode p as a single pb.KVList and,
// when that succeeds, stores every key/value pair of the list in w.kvs
// (later pairs overwrite earlier ones with the same key).
//
// Write always reports len(p) bytes consumed and a nil error; a chunk that
// does not decode is dropped silently.
// NOTE(review): presumably the undecodable chunks are framing bytes that
// db.Backup emits between lists, which is why the error is ignored — confirm
// against Badger's backup format.
func (w *backupWriter) Write(p []byte) (n int, err error) {
    list := new(pb.KVList)
    if decodeErr := proto.Unmarshal(p, list); decodeErr == nil {
        for _, entry := range list.Kv {
            w.kvs[string(entry.Key)] = string(entry.Value)
        }
    }

    return len(p), nil
}

// Validate checks that the key/value pairs captured in w.kvs are consistent
// with the write pattern used by testBadgerBackup, where each transaction sets
// a counter key "<rand>#<i>_key" to j together with a per-write key
// "<j>_<i>_key".
//
// For every counter key it parses the writer index i from the key and the
// counter j from the value, then calls validateKeys(i, j), which requires the
// keys "1_<i>_key" through "<j>_<i>_key" to be present. On the first
// inconsistency it prints the whole captured map and returns that error.
//
// A counter key or value that cannot be parsed yields an error rather than a
// panic. Returns nil when every counter key is backed by its per-write keys.
func (w *backupWriter) Validate() error {
    for key, value := range w.kvs {
        // Counter keys contain exactly one underscore ("<rand>#<i>_key");
        // per-write keys ("<j>_<i>_key") contain two and are skipped here.
        if strings.Count(key, "_") != 1 {
            continue
        }

        // TrimSuffix removes the literal "_key" suffix. TrimRight would treat
        // "_key" as a set of characters and could strip more than intended.
        _, index, found := strings.Cut(strings.TrimSuffix(key, "_key"), "#")
        if !found {
            return fmt.Errorf("malformed counter key %q: missing '#'", key)
        }

        count, err := strconv.Atoi(value)
        if err != nil {
            return fmt.Errorf("parsing value of counter key %q: %w", key, err)
        }

        writer, err := strconv.Atoi(index)
        if err != nil {
            return fmt.Errorf("parsing writer index of counter key %q: %w", key, err)
        }

        if err := w.validateKeys(writer, count); err != nil {
            // Dump everything the backup contained to make the gap visible.
            fmt.Printf("%v\n", w.allKV())
            return err
        }
    }

    return nil
}

// allKV renders every captured pair as a "key=value" string and returns the
// pairs as a JSON array. The order follows Go's map iteration and is therefore
// not stable between calls. An empty or nil map yields "[]".
func (w *backupWriter) allKV() string {
    pairs := make([]string, len(w.kvs))
    next := 0
    for k, val := range w.kvs {
        pairs[next] = fmt.Sprintf("%v=%v", k, val)
        next++
    }

    // The marshal error is discarded, exactly as before.
    encoded, _ := json.Marshal(pairs)
    return string(encoded)
}

// validateKeys verifies that, for writer index i, every per-write key
// "<n>_<i>_key" with n from 1 through j is present in w.kvs and holds the
// decimal string of n as its value.
//
// It returns an error naming the first missing key or the first mismatching
// value, and nil when all j keys are as expected (including when j < 1).
func (w *backupWriter) validateKeys(i int, j int) error {
    for seq := 1; seq <= j; seq++ {
        name := fmt.Sprintf("%v_%v_key", seq, i)
        got, present := w.kvs[name]

        switch {
        case !present:
            return fmt.Errorf("cannot found key %v, for %v %v\n", name, i, j)
        case got != fmt.Sprintf("%v", seq):
            return fmt.Errorf("value %v didn't match for k %v\n", got, name)
        }
    }

    return nil
}

// testBadgerBackup runs one round of the reproduction.
//
// It opens an in-memory Badger database and starts numWriters goroutines.
// Each goroutine, roughly once per millisecond, commits a transaction that
// sets two keys: the counter key "<rand>#<i>_key" = j and the per-write key
// "<j>_<i>_key" = j. While the writers run, it takes numBackups backups via
// db.Backup into fresh backupWriters at random intervals. After stopping the
// writers it validates every backup and panics on the first one in which a
// counter key is present without all of its per-write keys.
//
// Any setup, write, commit, backup, or close error also panics.
func testBadgerBackup() {
    const (
        numWriters = 10 // concurrent writer goroutines
        numBackups = 10 // backups taken while the writers are running
    )

    dir, err := os.MkdirTemp("", "badger_test")
    if err != nil {
        panic(err)
    }
    // Dir and ValueDir are cleared below and the database runs in memory, so
    // dir is only created and printed. Remove it again so that repeated rounds
    // do not leave temporary directories behind.
    defer func() {
        if rmErr := os.RemoveAll(dir); rmErr != nil {
            fmt.Printf("remove %v: %v\n", dir, rmErr)
        }
    }()
    fmt.Printf("dir: %v\n", dir)

    o := badger.DefaultOptions(dir)
    o.Dir = ""
    o.ValueDir = ""
    o.InMemory = true
    o.Logger = nil
    db, err := badger.Open(o)
    if err != nil {
        panic(err)
    }
    defer func() {
        if err := db.Close(); err != nil {
            panic(err)
        }
    }()

    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()

    // The commented-out lock lines (l) are the workaround: with them enabled,
    // backups and transactions are mutually exclusive.
    // var l sync.RWMutex
    var wg sync.WaitGroup
    wg.Add(numWriters)
    for i := 0; i < numWriters; i++ {
        go func(i int) {
            defer wg.Done()

            // Random prefix of this writer's counter key.
            vv := fmt.Sprintf("%v", rand.Int())

            j := 0
            for {
                select {
                case <-ctx.Done():
                    return
                case <-time.After(1 * time.Millisecond):
                }

                j++
                // l.RLock()
                tx := db.NewTransaction(true)
                // Counter key: records how many writes this goroutine has made.
                err := tx.Set([]byte(fmt.Sprintf("%v#%v_key", vv, i)), []byte(fmt.Sprintf("%v", j)))
                if err != nil {
                    panic(err)
                }

                // Per-write key: one key for each write, set in the same transaction.
                err = tx.Set([]byte(fmt.Sprintf("%v_%v_key", j, i)), []byte(fmt.Sprintf("%v", j)))
                if err != nil {
                    panic(err)
                }

                err = tx.Commit()
                if err != nil {
                    panic(err)
                }
                // l.RUnlock()
            }
        }(i)
    }

    ws := make([]*backupWriter, 0, numBackups)
    for i := 0; i < numBackups; i++ {
        // Random pause whose upper bound grows with i (at least 1ms).
        time.Sleep(time.Millisecond * time.Duration(i*rand.Intn(200)+1))
        // l.Lock()
        w := backupWriter{
            kvs: map[string]string{},
        }
        v, err := db.Backup(&w, 0)
        if err != nil {
            panic(err)
        }
        // l.Unlock()
        w.v = v
        ws = append(ws, &w)
    }

    // Stop the writers and wait for them before validating and closing the DB.
    cancel()
    wg.Wait()

    for _, w := range ws {
        err := w.Validate()
        fmt.Printf("backup %v, kv %v, validate %v\n", w.v, len(w.kvs), err)
        if err != nil {
            panic(err)
        }
    }
}

// main runs the reproduction round 50 times in sequence, printing a banner
// with the round number before each one. A failing round panics inside
// testBadgerBackup and ends the program.
func main() {
    const rounds = 50

    for round := 0; round < rounds; round++ {
        fmt.Printf("start testBadgerBackup %v.......................\n", round)
        testBadgerBackup()
    }
}

It panics intermittently (not on every run):

start testBadgerBackup 0.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test1915576934
backup 4, kv 7, validate <nil>
backup 941, kv 951, validate <nil>
backup 2606, kv 2616, validate <nil>
backup 7377, kv 7387, validate <nil>
backup 11594, kv 11604, validate <nil>
backup 12826, kv 12834, validate <nil>
backup 13217, kv 13227, validate <nil>
backup 18297, kv 18307, validate <nil>
backup 18897, kv 18907, validate <nil>
backup 26360, kv 26370, validate <nil>
start testBadgerBackup 1.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test2851596835
backup 7, kv 14, validate <nil>
backup 1404, kv 1414, validate <nil>
backup 3388, kv 3398, validate <nil>
backup 4631, kv 4641, validate <nil>
backup 10052, kv 10062, validate <nil>
backup 12812, kv 12822, validate <nil>
backup 15095, kv 15105, validate <nil>
backup 19277, kv 19287, validate <nil>
backup 23290, kv 23300, validate <nil>
backup 37387, kv 37397, validate <nil>
start testBadgerBackup 2.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test1585819542
backup 4, kv 8, validate <nil>
backup 946, kv 956, validate <nil>
backup 2555, kv 2565, validate <nil>
backup 7035, kv 7045, validate <nil>
backup 9798, kv 9808, validate <nil>
backup 11894, kv 11904, validate <nil>
backup 15171, kv 15181, validate <nil>
backup 26245, kv 26255, validate <nil>
backup 35281, kv 35291, validate <nil>
backup 47292, kv 47302, validate <nil>
start testBadgerBackup 3.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test980017924
backup 1, kv 2, validate <nil>
backup 1480, kv 1490, validate <nil>
backup 1659, kv 1669, validate <nil>
backup 2525, kv 2535, validate <nil>
backup 3601, kv 3611, validate <nil>
backup 4782, kv 4792, validate <nil>
backup 8462, kv 8472, validate <nil>
backup 19804, kv 19814, validate <nil>
backup 22443, kv 22450, validate <nil>
backup 24836, kv 24845, validate <nil>
start testBadgerBackup 4.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test3176089424
backup 3, kv 6, validate <nil>
backup 292, kv 302, validate <nil>
backup 3567, kv 3577, validate <nil>
backup 6927, kv 6937, validate <nil>
backup 9510, kv 9520, validate <nil>
backup 10756, kv 10766, validate <nil>
backup 15206, kv 15216, validate <nil>
backup 19658, kv 19668, validate <nil>
backup 31317, kv 31327, validate <nil>
backup 42476, kv 42486, validate <nil>
start testBadgerBackup 5.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test3885150962
backup 4, kv 8, validate <nil>
backup 1461, kv 1471, validate <nil>
backup 3923, kv 3933, validate <nil>
backup 4953, kv 4963, validate <nil>
backup 7174, kv 7184, validate <nil>
backup 12662, kv 12672, validate <nil>
backup 19332, kv 19342, validate <nil>
backup 26153, kv 26163, validate <nil>
backup 38206, kv 38216, validate <nil>
backup 42134, kv 42144, validate <nil>
start testBadgerBackup 6.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test450010457
backup 1, kv 2, validate <nil>
backup 561, kv 571, validate <nil>
backup 1818, kv 1828, validate <nil>
backup 2865, kv 2875, validate <nil>
backup 7769, kv 7779, validate <nil>
backup 14750, kv 14760, validate <nil>
backup 23341, kv 23351, validate <nil>
backup 28885, kv 28895, validate <nil>
backup 31406, kv 31416, validate <nil>
backup 40905, kv 40915, validate <nil>
start testBadgerBackup 7.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test1032548137
backup 0, kv 0, validate <nil>
backup 972, kv 982, validate <nil>
backup 3746, kv 3756, validate <nil>
backup 7321, kv 7331, validate <nil>
backup 11969, kv 11979, validate <nil>
backup 20365, kv 20375, validate <nil>
backup 30548, kv 30558, validate <nil>
backup 33235, kv 33244, validate <nil>
backup 37813, kv 37823, validate <nil>
backup 43277, kv 43286, validate <nil>
start testBadgerBackup 8.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test4139450864
backup 2, kv 4, validate <nil>
backup 384, kv 394, validate <nil>
backup 3309, kv 3319, validate <nil>
backup 8010, kv 8020, validate <nil>
backup 10379, kv 10389, validate <nil>
backup 14109, kv 14119, validate <nil>
backup 21834, kv 21844, validate <nil>
backup 26754, kv 26764, validate <nil>
backup 29789, kv 29799, validate <nil>
backup 36156, kv 36166, validate <nil>
start testBadgerBackup 9.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test1737783656
backup 1, kv 2, validate <nil>
backup 670, kv 680, validate <nil>
backup 3558, kv 3568, validate <nil>
backup 6943, kv 6953, validate <nil>
backup 11369, kv 11379, validate <nil>
backup 15086, kv 15096, validate <nil>
backup 18627, kv 18637, validate <nil>
backup 22028, kv 22038, validate <nil>
backup 34868, kv 34878, validate <nil>
backup 36728, kv 36738, validate <nil>
start testBadgerBackup 10.......................
dir: /var/folders/n_/qhczpf412h3ghl93j3z6zzx00000gp/T/badger_test3894248576
["1_4_key=1","1_5_key=1","1_6_key=1","237070578879453429#4_key=1","3127585945138148385#6_key=1","5142114080917786158#5_key=1","8919094391511790126#3_key=1"]
backup 4, kv 7, validate cannot found key 1_3_key, for 3 1

panic: cannot found key 1_3_key, for 3 1

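The application writes two keys in one transaction, for example `8919094391511790126#3_key=1` and `1_3_key=1`; the failing backup above contains the first key but not the second.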

If the commented-out lock (`l`) is enabled around both the backup and the transactions, the panic never happens.

Expected behavior and actual result.

It should never panic: when db.Backup is called, it should behave as a snapshot read.

Additional information

No response

github-actions[bot] commented 3 months ago

This issue has been stale for 60 days and will be closed automatically in 7 days. Comment to keep it open.

lsytj0413 commented 3 months ago

/open