filecoin-project / specs-actors

DEPRECATED Specification of builtin actors, in the form of executable code.
Other
86 stars 102 forks source link

batch verify seals syscall implemented incorrectly #1472

Open hunjixin opened 3 years ago

hunjixin commented 3 years ago

This error occurs only very occasionally; does anyone have an idea what causes it? I have confirmed that the argument and the return value of BatchVerifySeals have the same length.

ZenGround0 commented 3 years ago

@hunjixin please add information about this error

hunjixin commented 3 years ago

@ZenGround0 there is no more information than this: just a state root mismatch, and an error message {"errorMessage": "batch verify seals syscall implemented incorrectly", "exitCode": "17", "sender": "f03", "receiver": "f04", "methodNum": "5", "Value": "0", "gasLimit": 100000000000000}

hunjixin commented 3 years ago

I have checked BatchVerifySeals in the code: the function's argument and return value have the same length. In the code below, if miners contains an element, then verifies must contain it too.

// processBatchProofVerifies drains the pending proof validation batch from
// state, submits every queued seal to the BatchVerifySeals syscall and sends
// ConfirmSectorProofsValid to each miner that has at least one valid proof.
//
// Miners without an entry in the claims table are logged and skipped. For each
// remaining miner, sector numbers whose proofs verified are de-duplicated
// (first occurrence wins) before being reported back to the miner actor.
//
// The syscall result must contain a key for every miner that was submitted.
// A missing key, or a result slice longer than the submitted proofs, aborts
// with a message naming the offending miner.
func (a Actor) processBatchProofVerifies(rt Runtime) {
    var st State

    // miners records the order in which the batch was iterated; verifies holds
    // the proofs to check, keyed by the same addresses.
    var miners []addr.Address
    verifies := make(map[addr.Address][]proof.SealVerifyInfo)

    rt.StateTransaction(&st, func() {
        store := adt.AsStore(rt)
        if st.ProofValidationBatch == nil {
            return
        }
        mmap, err := adt.AsMultimap(store, *st.ProofValidationBatch, builtin.DefaultHamtBitwidth, ProofValidationBatchAmtBitwidth)
        builtin.RequireNoErr(rt, err, exitcode.ErrIllegalState, "failed to load proofs validation batch")

        claims, err := adt.AsMap(store, st.Claims, builtin.DefaultHamtBitwidth)
        builtin.RequireNoErr(rt, err, exitcode.ErrIllegalState, "failed to load claims")

        err = mmap.ForAll(func(k string, arr *adt.Array) error {
            // Named minerAddr (not a) so the method receiver is not shadowed.
            minerAddr, err := addr.NewFromBytes([]byte(k))
            builtin.RequireNoErr(rt, err, exitcode.ErrIllegalState, "failed to parse address key")

            // refuse to process proofs for miner with no claim
            found, err := claims.Has(abi.AddrKey(minerAddr))
            builtin.RequireNoErr(rt, err, exitcode.ErrIllegalState, "failed to look up claim")
            if !found {
                rt.Log(rtt.WARN, "skipping batch verifies for unknown miner %s", minerAddr)
                return nil
            }

            miners = append(miners, minerAddr)

            var infos []proof.SealVerifyInfo
            var svi proof.SealVerifyInfo
            err = arr.ForEach(&svi, func(i int64) error {
                infos = append(infos, svi)
                return nil
            })
            builtin.RequireNoErr(rt, err, exitcode.ErrIllegalState, "failed to iterate over proof verify array for miner %s", minerAddr)

            verifies[minerAddr] = infos
            return nil
        })
        builtin.RequireNoErr(rt, err, exitcode.ErrIllegalState, "failed to iterate proof batch")

        // The batch has been copied out; clear it so it is not processed again.
        st.ProofValidationBatch = nil
    })

    res, err := rt.BatchVerifySeals(verifies)
    builtin.RequireNoErr(rt, err, exitcode.ErrIllegalState, "failed to batch verify")

    for _, m := range miners {
        vres, ok := res[m]
        if !ok {
            // Name the miner so a faulty syscall implementation can be diagnosed.
            rt.Abortf(exitcode.ErrNotFound, "batch verify seals syscall implemented incorrectly, result not found for miner: %s", m)
        }

        verifs := verifies[m]
        if len(vres) > len(verifs) {
            // Indexing verifs below would run out of range; abort with context
            // instead of panicking.
            rt.Abortf(exitcode.ErrIllegalState, "batch verify seals syscall implemented incorrectly, %d results for %d proofs of miner: %s", len(vres), len(verifs), m)
        }

        seen := map[abi.SectorNumber]struct{}{}
        var successful []abi.SectorNumber
        for i, r := range vres {
            if !r {
                continue
            }
            snum := verifs[i].SectorID.Number

            if _, exists := seen[snum]; exists {
                // filter-out duplicates
                continue
            }

            seen[snum] = struct{}{}
            successful = append(successful, snum)
        }

        if len(successful) > 0 {
            // The exit code is explicitly ignored
            _ = rt.Send(
                m,
                builtin.MethodsMiner.ConfirmSectorProofsValid,
                &builtin.ConfirmSectorProofsParams{Sectors: successful},
                abi.NewTokenAmount(0),
                &builtin.Discard{},
            )
        }
    }
}
hunjixin commented 3 years ago

@ZenGround0 any idea?

ZenGround0 commented 3 years ago

@hunjixin it looks like your syscall implementation is not returning a map with all miners as keys. This is required even if the miner failed all sectors.

hunjixin commented 3 years ago

@ZenGround0 I have added logging in BatchVerifySeals, but it only compares the lengths of the argument and the return value; they have the same length. The code is nearly the same as in lotus. The chance of hitting this problem is very small: there has been an error in the last two months.


// BatchVerifySeals verifies every seal in vis concurrently and returns, for
// each miner address in vis, a slice of booleans with the same length as that
// miner's input slice: element i is true when sys.VerifySeal returned no error
// for seal i and false otherwise.
//
// Every address in vis gets an entry in the result, even when all of its
// seals fail. The returned error is always nil; individual verification
// failures are logged and reported through the boolean results.
//
// At most BatchSealVerifyParallelism verifications run at once. A slot is
// taken before a goroutine is started, so the number of live goroutines is
// bounded by the same limit rather than by the total number of seals.
func (sys syscalls) BatchVerifySeals(vis map[address.Address][]proof5.SealVerifyInfo) (map[address.Address][]bool, error) {
    out := make(map[address.Address][]bool, len(vis))

    sema := make(chan struct{}, BatchSealVerifyParallelism)
    vmlog.Info("BatchVerifySeals miners:", len(vis))
    var wg sync.WaitGroup
    for addr, seals := range vis {
        // Registered before any worker starts, so the map is only ever
        // written from this goroutine; workers write to distinct indices of
        // results. Elements start out false.
        results := make([]bool, len(seals))
        out[addr] = results

        for i, s := range seals {
            // Blocks while BatchSealVerifyParallelism workers are in flight.
            sema <- struct{}{}
            wg.Add(1)
            go func(ma address.Address, ix int, svi proof5.SealVerifyInfo, res []bool) {
                defer wg.Done()
                defer func() { <-sema }()

                if err := sys.VerifySeal(svi); err != nil {
                    vmlog.Warnw("seal verify in batch failed", "miner", ma, "index", ix, "err", err)
                    return
                }
                res[ix] = true
            }(addr, i, s, results)
        }
    }
    wg.Wait()
    vmlog.Info("BatchVerifySeals Result miners:", len(out))
    return out, nil
}
ZenGround0 commented 3 years ago

@hunjixin to confirm: is it the venus node that sees the exit code 17 error, which leads to the state root mismatch because the error is not seen on mainnet?

ZenGround0 commented 3 years ago

If this is the case you should rerun this at the problem epochs and inspect the data making it to res causing the error.

hunjixin commented 3 years ago

error

Yes, the root does not match. But when I set the head back to the previous tipset and reprocess the same tipset, the result is always OK, without restarting the process.

hunjixin commented 3 years ago

@ZenGround0

I added logging like this:

    keyInVerifies := []addr.Address{}
    for key, _ := range verifies {
        keyInVerifies = append(keyInVerifies, key)
    }
    rt.Log(rtt.INFO, "ID: %s, verifies keys before BatchVerifySeals %v", id, keyInVerifies)
    rt.Log(rtt.INFO, "ID: %s, miners keys before BatchVerifySeals %v", id, miners)

    res, err := rt.BatchVerifySeals(verifies)
    builtin.RequireNoErr(rt, err, exitcode.ErrIllegalState, "failed to batch verify")

    keyInRes := []addr.Address{}
    for key, _ := range res {
        keyInRes = append(keyInRes, key)
    }

    rt.Log(rtt.INFO, "ID: %s, return before BatchVerifySeals %v", id, keyInRes)

and got log output like this:

2021-10-16T18:48:03.634+0800    INFO    vm.actors   vmcontext/runtime_adapter.go:197    ID: 21c48411-0860-4061-82ae-7358eff93ae7, verifies keys before BatchVerifySeals [f01170291 f01317157 f01181168 f01207023 f01169696 f0392813 f0411877 f0226418 f01149485 f01218989 f01227383 f01154295 f065877 f01236627 f01103850 f0156452 f01024569 f01138709 f0107999 f01116666 f0150748 f0160735 f083419 f01271225 f087888 f01288529 f01250983 f01138139 f01247078 f01189202 f0469055 f0127378 f01145144 f01101315 f0151498 f01277031 f054420 f01090983 f01177077 f01319368 f01270285 f0428177 f01312143 f01182223 f01250837 f01043193 f01348517 f01098119 f01125168 f01365744 f01251528 f0442377 f0454186 f0124554 f01123833 f0135066 f01261075 f01031867 f01071719 f0156417 f01353593 f062982 f01263957 f0881687 f01122841 f01272340 f01038625 f02419 f01191029 f01096056 f01193462]
2021-10-16T18:48:03.634+0800    INFO    vm.actors   vmcontext/runtime_adapter.go:197    ID: 21c48411-0860-4061-82ae-7358eff93ae7, miners keys before BatchVerifySeals   [f054420 f01125168 f0442377 f01236627 f0454186 f01317157 f0881687 f01181168 f01365744 f0124554 f01090983 f01247078 f01191029 f087888 f01103850 f01071719 f01177077 f01154295 f01182223 f01288529 f0156452 f0411877 f0226418 f01170291 f01207023 f01122841 f01024569 f01250837 f0150748 f01169696 f01096056 f01043193 f01149485 f01218989 f01348517 f01319368 f01138709 f01123833 f01270285 f0135066 f01251528 f01272340 f01189202 f0160735 f0428177 f0156417 f01038625 f0469055 f01250983 f01353593 f01098119 f0107999 f083419 f062982 f01271225 f01193462 f02419 f0127378 f01145144 f01116666 f01312143 f01227383 f01101315 f0151498 f01277031 f01261075 f01263957 f0392813 f01031867 f065877 f01138139]
2021-10-16T18:48:03.634+0800    INFO    vm.context  vmcontext/syscalls.go:91    BatchVerifySeals miners:71
2021-10-16T18:48:04.029+0800    INFO    vm.context  vmcontext/syscalls.go:115   BatchVerifySeals Result miners:71
2021-10-16T18:48:04.029+0800    INFO    vm.actors   vmcontext/runtime_adapter.go:197    ID: 21c48411-0860-4061-82ae-7358eff93ae7, return before BatchVerifySeals        [f0127378 f01031867 f01181168 f0392813 f0156452 f087888 f01247078 f01189202 f0124554 f01250983 f01090983 f01182223 f0881687 f0150748 f01122841 f01138709 f054420 f01193462 f01170291 f02419 f01024569 f01277031 f01125168 f01263957 f0428177 f01365744 f01123833 f01236627 f01101315 f01319368 f01270285 f01038625 f01096056 f01317157 f0411877 f0226418 f083419 f01145144 f01261075 f01169696 f01218989 f065877 f01103850 f01177077 f01098119 f0135066 f01312143 f01043193 f01272340 f01149485 f0107999 f01138139 f0469055 f0151498 f01116666 f01071719 f0156417 f01353593 f01154295 f01271225 f01348517 f01207023 f01250837 f01191029 f0160735 f01251528 f0454186 f01288529 f01227383 f0442377 f062982]
2021-10-16T18:48:04.029+0800    WARN    vm.context  vmcontext/invocation_context.go:197 Abort during actor execution.   {"errorMessage": "batch verify seals syscall implemented incorrectly", "exitCode": "17", "sender": "f03", "receiver": "f04", "methodNum": "5", "Value": "0", "gasLimit": 100000000000000}
2021-10-16T18:48:04.092+0800    INFO    vm.context  vmcontext/vmcontext.go:333  process cron: 463

Before the loop, the miner keys are the same as the res keys.

hunjixin commented 3 years ago

@ZenGround0 in the v6 version, I got an error like this:

2021-11-04T19:29:35.417+0800  ERROR  vm.actors  vmcontext/runtime_adapter.go:188  unexpected error processing batch proof verifies: batch verify seals syscall implemented incorrectly, result not found for miner: %!s(PANIC=String method: unknown address protocol). Skipping all verification for epoch 1257299

unknown address protocol ?