erigontech / erigon

Ethereum implementation on the efficiency frontier https://erigon.gitbook.io
GNU Lesser General Public License v3.0
3.15k stars 1.13k forks source link

can't rename tmp file after creation #12751

Open AskAlexSharov opened 1 week ago

AskAlexSharov commented 1 week ago

mainnet:

[EROR] [11-18|07:45:10.696] [Antiquary] Failed to antiquate blobs    err="compress: renaming: rename /erigon-data/snapshots/v1-010380-010390-blobsidecars.seg.tmp /erigon-data/snapshots/v1-010380-010390-blobsidecars.seg: no such file or directory"
ls /erigon-data/snapshots/v1-0103*-blobsidecars.*
/erigon-data/snapshots/v1-010300-010310-blobsidecars.seg          /erigon-data/snapshots/v1-010320-010330-blobsidecars.seg  /erigon-data/snapshots/v1-010360-010370-blobsidecars.seg
/erigon-data/snapshots/v1-010300-010310-blobsidecars.seg.torrent  /erigon-data/snapshots/v1-010330-010340-blobsidecars.seg  /erigon-data/snapshots/v1-010370-010380-blobsidecars.seg
/erigon-data/snapshots/v1-010310-010320-blobsidecars.seg          /erigon-data/snapshots/v1-010340-010350-blobsidecars.seg
/erigon-data/snapshots/v1-010310-010320-blobsidecars.seg.torrent  /erigon-data/snapshots/v1-010350-010360-blobsidecars.seg

more logs:

[INFO] [11-18|07:40:54.140] [4/6 Execution][agg] computing trie      progress=77.93k/1.20M alloc=14.0GB sys=32.6GB
[INFO] [11-18|07:41:14.154] [4/6 Execution][agg] computing trie      progress=137.74k/1.20M alloc=17.6GB sys=32.6GB
[INFO] [11-18|07:41:23.371] State processing progress                slot=10413866 blk/sec=19.27
[INFO] [11-18|07:41:28.045] P2P                                      app=caplin peers=88
[INFO] [11-18|07:41:34.140] [4/6 Execution][agg] computing trie      progress=211.12k/1.20M alloc=11.3GB sys=32.6GB
[INFO] [11-18|07:41:54.145] [4/6 Execution][agg] computing trie      progress=319.12k/1.20M alloc=15.8GB sys=32.6GB
[INFO] [11-18|07:42:14.139] [4/6 Execution][agg] computing trie      progress=403.80k/1.20M alloc=19.6GB sys=32.6GB
[INFO] [11-18|07:42:18.893] [p2p] GoodPeers                          eth68=49 eth67=11
[INFO] [11-18|07:42:20.026] [mem] memory stats                       Rss=83.9GB Size=0B Pss=83.9GB SharedClean=4.7MB SharedDirty=0B PrivateClean=56.2GB PrivateDirty=27.7GB Referenced=81.4GB Anonymous=27.7GB Swap=0B alloc=20.4GB sys=32.6GB
[INFO] [11-18|07:42:23.394] State processing progress                slot=10414991 blk/sec=18.75
[INFO] [11-18|07:42:28.051] P2P                                      app=caplin peers=92
[INFO] [11-18|07:42:34.140] [4/6 Execution][agg] computing trie      progress=502.24k/1.20M alloc=13.2GB sys=32.6GB
[INFO] [11-18|07:42:54.145] [4/6 Execution][agg] computing trie      progress=631.49k/1.20M alloc=17.5GB sys=32.6GB
[INFO] [11-18|07:43:12.491] [snapshots:merge] Stat                   blocks=21.20M indices=21.20M alloc=21.4GB sys=32.6GB
[INFO] [11-18|07:43:12.884] BuildFilesInBackground                   step=1679 lastInDB=1680
[INFO] [11-18|07:43:14.141] [4/6 Execution][agg] computing trie      progress=762.27k/1.20M alloc=14.4GB sys=32.6GB
[INFO] [11-18|07:43:23.369] State processing progress                slot=10416188 blk/sec=19.95
[INFO] [11-18|07:43:28.045] P2P                                      app=caplin peers=93
[INFO] [11-18|07:43:34.140] [4/6 Execution][agg] computing trie      progress=833.84k/1.20M alloc=13.1GB sys=32.6GB
[INFO] [11-18|07:43:54.139] [4/6 Execution][agg] computing trie      progress=920.80k/1.20M alloc=18.8GB sys=32.6GB
[INFO] [11-18|07:44:14.141] [4/6 Execution][agg] computing trie      progress=996.97k/1.20M alloc=11.9GB sys=32.6GB
[INFO] [11-18|07:44:24.455] State processing progress                slot=10417142 blk/sec=15.90
[INFO] [11-18|07:44:28.045] P2P                                      app=caplin peers=100
[INFO] [11-18|07:44:34.139] [4/6 Execution][agg] computing trie      progress=1.06M/1.20M alloc=11.9GB sys=32.6GB
[INFO] [11-18|07:44:54.140] [4/6 Execution][agg] computing trie      progress=1.15M/1.20M alloc=13.8GB sys=32.6GB
[INFO] [11-18|07:44:58.563] [snapshots] aggregated                   step=1679 took=1m45.678522456s
[INFO] [11-18|07:44:58.563] [snapshots] merge state accounts(val=1664-1680, hist=1664-1680, idx=1664-1680), storage(val=1664-1680, hist=1664-1680, idx=1664-1680), code(val=1664-1680, hist=1664-1680, idx=1664-1680), commitment(val=1664-1680), receipt(val=1664-1680, hist=1664-1680, idx=1664-1680), logAddr=1664-1680, logTopic=1664-1680, traceFrom=1664-1680, traceTo=1664-1680
[EROR] [11-18|07:45:10.696] [Antiquary] Failed to antiquate blobs    err="compress: renaming: rename /erigon-data/snapshots/v1-010380-010390-blobsidecars.seg.tmp /erigon-data/snapshots/v1-010380-010390-blobsidecars.seg: no such file or directory"
[INFO] [11-18|07:45:10.696] [Antiquary] Antiquating blobs            app=caplin from=10370000 to=10399999
[INFO] [11-18|07:45:18.894] [p2p] GoodPeers                          eth68=51 eth67=9
[INFO] [11-18|07:45:19.221] [mem] memory stats                       Rss=28.0GB Size=0B Pss=28.0GB SharedClean=948.0KB SharedDirty=0B PrivateClean=7.2GB PrivateDirty=20.8GB Referenced=28.0GB Anonymous=20.8GB Swap=0B alloc=15.5GB sys=32.6GB
[INFO] [11-18|07:45:23.363] State processing progress                slot=10417624 blk/sec=8.03
[INFO] [11-18|07:45:28.045] P2P                                      app=caplin peers=102
[INFO] [11-18|07:46:09.192] [4/6 Execution] Done                     blk=21211999 blks=5000 blk/s=4.9 txs=854632 tx/s=837 gas/s=74.03M buf=4.8GB/512.0MB stepsInDB=0.00 step=1680.8 alloc=18.0GB sys=32.6GB
[INFO] [11-18|07:46:09.192] [4/6 Execution] DONE                     in=17m0.761709115s block=21206999
[INFO] [11-18|07:46:16.553] head updated                             age=4h29s head=0x4d22f01b6842ff0427e316ca75c8776f97d4a6759993eff4cbacc01d6a8b5222 hash=0x4d22f01b6842ff0427e316ca75c8776f97d4a6759993eff4cbacc01d6a8b5222 commit=487.70668ms alloc=22.1GB sys=32.6GB
Giulio2002 commented 3 days ago

does restart help?

Giulio2002 commented 3 days ago

is this where? snapshotter?

Giulio2002 commented 3 days ago

also fyi this is not a caplin issue as it happens in:

func (c *Compressor) Compress() error {
    if err := c.uncompressedFile.Flush(); err != nil {
        return err
    }

    logEvery := time.NewTicker(20 * time.Second)
    defer logEvery.Stop()
    if len(c.superstring) > 0 {
        c.superstrings <- c.superstring
    }
    close(c.superstrings)
    runtime.GC()
    c.wg.Wait()
    runtime.GC()

    if c.lvl < log.LvlTrace {
        c.logger.Log(c.lvl, fmt.Sprintf("[%s] BuildDict start", c.logPrefix), "workers", c.Workers)
    }
    db, err := DictionaryBuilderFromCollectors(c.ctx, c.Cfg, c.logPrefix, c.tmpDir, c.suffixCollectors, c.lvl, c.logger)
    if err != nil {
        return err
    }
    if c.trace {
        _, fileName := filepath.Split(c.outputFile)
        if err := PersistDictionary(filepath.Join(c.tmpDir, fileName)+".dictionary.txt", db); err != nil {
            return err
        }
    }
    defer os.Remove(c.tmpOutFilePath)

    cf, err := os.Create(c.tmpOutFilePath)
    if err != nil {
        return err
    }
    defer cf.Close()
    t := time.Now()
    if err := compressWithPatternCandidates(c.ctx, c.trace, c.Cfg, c.logPrefix, c.tmpOutFilePath, cf, c.uncompressedFile, db, c.lvl, c.logger); err != nil {
        return err
    }
    if err = c.fsync(cf); err != nil {
        return err
    }
    if err = cf.Close(); err != nil {
        return err
    }
    if err := os.Rename(c.tmpOutFilePath, c.outputFile); err != nil {
        return fmt.Errorf("renaming: %w", err)
    }

    c.Ratio, err = Ratio(c.uncompressedFile.filePath, c.outputFile)
    if err != nil {
        return fmt.Errorf("ratio: %w", err)
    }

    _, fName := filepath.Split(c.outputFile)
    if c.lvl < log.LvlTrace {
        c.logger.Log(c.lvl, fmt.Sprintf("[%s] Compress", c.logPrefix), "took", time.Since(t), "ratio", c.Ratio, "file", fName)
    }
    return nil
}
Giulio2002 commented 3 days ago

ok - checked the snapshotter - this seems like a rare issue that self-heals luckily. we should not ignore it but i will de-prioritize to imp3

Giulio2002 commented 3 days ago

also how sure are we that this is not some kind of machine config issue? this sounds super odd - need more details