lbryio / lbry-sdk

The LBRY SDK for building decentralized, censorship resistant, monetized, digital content apps.
https://lbry.com
MIT License
7.19k stars 483 forks source link

reorg mempool processing crash #3142

Closed kauffj closed 2 years ago

kauffj commented 3 years ago

similar test: test_claim_height_changed in integration tests somewhere new test: make a block with 6 txns, be subscribed to hub with the address of the 6th tx, then drop the last tx from the block, do the reorg if that doesn't repro it, access the 6th tx directly

tzarebczan commented 3 years ago

This was the cause of the issues on Sunday morning

lyoshenka commented 3 years ago

next steps

tzarebczan commented 3 years ago
wallet_server_1  | Traceback (most recent call last):
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/mempool.py", line 215, in _mempool_loop
wallet_server_1  |     return await self._refresh_hashes(synchronized_event)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/mempool.py", line 233, in _refresh_hashes
wallet_server_1  |     touched = await self._process_mempool(hashes)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/mempool.py", line 278, in _process_mempool
wallet_server_1  |     deferred, unspent = await fetch
wallet_server_1  |   File "/usr/lib/python3.7/asyncio/tasks.py", line 533, in _wait_for_one
wallet_server_1  |     return f.result()  # May raise f.exception().
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/mempool.py", line 326, in _fetch_and_accept
wallet_server_1  |     utxos = await self.api.lookup_utxos(prevouts)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 790, in lookup_utxos
wallet_server_1  |     hashX_pairs = await asyncio.get_event_loop().run_in_executor(self.executor, lookup_hashXs)
wallet_server_1  |   File "/usr/lib/python3.7/concurrent/futures/thread.py", line 57, in run
wallet_server_1  |     result = self.fn(*self.args, **self.kwargs)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 769, in lookup_hashXs
wallet_server_1  |     return [lookup_hashX(*prevout) for prevout in prevouts]
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 769, in <listcomp>
wallet_server_1  |     return [lookup_hashX(*prevout) for prevout in prevouts]
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 765, in lookup_hashX
wallet_server_1  |     hash, height = self.fs_tx_hash(tx_num)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 481, in fs_tx_hash
wallet_server_1  |     return self.total_transactions[tx_num], tx_height
wallet_server_1  | IndexError: list index out of range
lyoshenka commented 3 years ago

jack thinks that the address history contains a tx that is reorged out and then accessed, so tx_num is greater than the size of the list. this may happen during reorgs. so maybe a race between applying mempool and something else

lyoshenka commented 3 years ago

gonna be fixed in jack's branch

lyoshenka commented 3 years ago

but needs to be fixed immediately on master too

jackrobison commented 3 years ago

wallet_server_1  | INFO:lbry.wallet.server.block_processor.LBRYBlockProcessor:chain reorg detected
wallet_server_1  | INFO:lbry.wallet.server.block_processor.LBRYBlockProcessor:chain was reorganised replacing 2 blocks at heights 929,777-929,778
wallet_server_1  | INFO:lbry.wallet.server.block_processor.LBRYBlockProcessor:reorg 2 block hashes
wallet_server_1  | INFO:lbry.wallet.server.block_processor.LBRYBlockProcessor:read 2 blocks from disk
wallet_server_1  | INFO:lbry.wallet.server.block_processor.LBRYBlockProcessor:got 2 raw blocks
wallet_server_1  | INFO:lbry.wallet.server.block_processor.LBRYBlockProcessor:backup block 929778
wallet_server_1  | INFO:lbry.wallet.server.block_processor.LBRYBlockProcessor:backup block 929777
wallet_server_1  | ERROR:lbry.wallet.server.mempool.MemPool:MEMPOOL DIED
wallet_server_1  | Traceback (most recent call last):
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/mempool.py", line 215, in _mempool_loop
wallet_server_1  |     return await self._refresh_hashes(synchronized_event)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/mempool.py", line 233, in _refresh_hashes
wallet_server_1  |     touched = await self._process_mempool(hashes)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/mempool.py", line 278, in _process_mempool
wallet_server_1  |     deferred, unspent = await fetch
wallet_server_1  |   File "/usr/lib/python3.7/asyncio/tasks.py", line 533, in _wait_for_one
wallet_server_1  |     return f.result()  # May raise f.exception().
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/mempool.py", line 326, in _fetch_and_accept
wallet_server_1  |     utxos = await self.api.lookup_utxos(prevouts)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 790, in lookup_utxos
wallet_server_1  |     hashX_pairs = await asyncio.get_event_loop().run_in_executor(self.executor, lookup_hashXs)
wallet_server_1  |   File "/usr/lib/python3.7/concurrent/futures/thread.py", line 57, in run
wallet_server_1  |     result = self.fn(*self.args, **self.kwargs)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 769, in lookup_hashXs
wallet_server_1  |     return [lookup_hashX(*prevout) for prevout in prevouts]
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 769, in <listcomp>
wallet_server_1  |     return [lookup_hashX(*prevout) for prevout in prevouts]
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 765, in lookup_hashX
wallet_server_1  |     hash, height = self.fs_tx_hash(tx_num)
wallet_server_1  |   File "/home/lbry/lbry/wallet/server/leveldb.py", line 481, in fs_tx_hash
wallet_server_1  |     return self.total_transactions[tx_num], tx_height
wallet_server_1  | IndexError: list index out of range
wallet_server_1  | INFO:lbry.wallet.server.block_processor.LBRYBlockProcessor:backed up to height 929,776
wallet_server_1  | INFO:lbry.wallet.server.history.History:backing up removed 3,125 history entries
wallet_server_1  | INFO:lbry.wallet.server.leveldb.LBRYLevelDB:backup flush #16,956 took 1.6s.  Height 929,776 txs: 38,065,740 (-1,004)
wallet_server_1  | INFO:lbry.wallet.server.block_processor.Prefetcher:catching up to daemon height 929,779 (3 blocks behind)
wallet_server_1  | INFO:lbry.wallet.server.block_processor.LBRYBlockProcessor:done with reorg
wallet_server_1  | INFO:lbry.wallet.server.block_processor.Prefetcher:new block height 929,779 hash 0a8bc74614160738f2137d74a26a7d7098fa82c93ff1ee741e66416e431c2e14```
eukreign commented 3 years ago

I spent quite a bit of time trying to reproduce this issue without success. I made a temporary fix which at least prevents mempool from crashing (#3246)

I suspect the issue is a race condition between mempool syncing and block sync performing a chain reorg at the same time; this is very difficult to reproduce in a controlled environment (but perhaps not impossible given more time).

We are hoping this bug will have a better fix in #3205.

For posterity, this is the scaffolding for the test I had written to attempt to reproduce the issue:

    async def test_mempool_sync_during_reorg_errors(self):
        daemon2 = await self.add_daemon()
        address2 = await daemon2.wallet_manager.default_account.receiving.get_or_create_usable_address()
        sendtxid = await self.blockchain.send_to_address(address2, 2)
        await self.confirm_tx(sendtxid, daemon2.ledger)

        self.assertEqual(self.ledger.headers.height, 207)
        for i in range(7):
            await daemon2.jsonrpc_channel_create(f'@xyz{i}', '0.1', blocking=True)
        self.channel1 = await self.channel_create('@abc1', '0.1', blocking=True, confirm=False)
        self.channel2 = await self.channel_create('@abc2', '0.1', blocking=True, confirm=False)
        self.channel3 = await self.channel_create('@abc3', '0.1', blocking=True, confirm=False)
        self.channel4 = await self.channel_create('@abc4', '0.1', blocking=True, confirm=False)
        self.channel5 = await self.channel_create('@abc5', '0.1', blocking=True, confirm=False)
        #for i in range(7):
        #    await daemon2.jsonrpc_channel_create(f'@def{i}', '0.1', blocking=True)
        self.channel6 = await self.channel_create('@abc6', '0.1', blocking=True)
        self.assertEqual(self.ledger.headers.height, 208)
        await self.generate(1)
        self.assertEqual(self.ledger.headers.height, 209)

        await self.blockchain.invalidate_block((await self.ledger.headers.hash(208)).decode())
        await self.blockchain.clear_mempool()
        await self.blockchain.sendrawtransaction(self.channel1['hex'])
        await self.blockchain.sendrawtransaction(self.channel2['hex'])
        await self.blockchain.sendrawtransaction(self.channel3['hex'])
        await self.blockchain.sendrawtransaction(self.channel4['hex'])
        await self.blockchain.sendrawtransaction(self.channel5['hex'])
        self.blockchain.block_expected -= 2
        await self.generate(3)
lyoshenka commented 3 years ago

@jackrobison can i close this?