Closed stephanep closed 1 year ago
Actually there is another attempt a minute later, after the node successfully connected to some discovered peers. But it fails too.
TRC 2023-08-08 23:44:24.196+02:00 updated scores topics="libp2p gossipsub" peers=14
TRC 2023-08-08 23:44:24.196+02:00 Attempting to dial a direct peer topics="libp2p gossipsub" peer=16U*FvNqHb
TRC 2023-08-08 23:44:24.196+02:00 connection not found topics="libp2p connmanager" peerId=16U*FvNqHb
TRC 2023-08-08 23:44:24.196+02:00 Acquired slot topics="libp2p semaphore" available=124 queue=0
DBG 2023-08-08 23:44:24.196+02:00 Dialing peer topics="libp2p dialer" peerId=ok(16Uiu2HAkwqd8kmmdqLjteD3TyGPz9cVpsZCuW1h8ioBr94FvNqHb)
TRC 2023-08-08 23:44:24.196+02:00 Releasing slot topics="libp2p semaphore" available=124 queue=0
TRC 2023-08-08 23:44:24.196+02:00 Released slot topics="libp2p semaphore" available=125 queue=0
DBG 2023-08-08 23:44:24.196+02:00 Direct peer error dialing topics="libp2p gossipsub" msg="Unable to establish outgoing link"
Could it be a problem with the direct-peer address syntax (I'm using the /ip4/&lt;address&gt;/tcp/&lt;port&gt;/p2p/&lt;peerId&gt;
syntax as mentioned in the documentation)?
Will try concatenating the node public keys after the id as suggested, when I find a way to get those. EDIT: found the "network_public_key" in the networking topics logs, but no luck either — the format is not accepted (and I don't see why it would need both the peer id and the pubkey). EDIT2: attaching the logs of the test vm: trace.txt
Some further considerations:
So I made an experiment by making the following change (stripping the p2p peer-id part of the multiaddress before it is stored and passed to libp2p):
diff --git a/beacon_chain/networking/eth2_network.nim b/beacon_chain/networking/eth2_network.nim
index dc3230b1..44e820f1 100644
--- a/beacon_chain/networking/eth2_network.nim
+++ b/beacon_chain/networking/eth2_network.nim
@@ -2381,8 +2381,9 @@ proc createEth2Node*(rng: ref HmacDrbgContext,
maddress = MultiAddress.init(s).tryGet()
mpeerId = maddress[multiCodec("p2p")].tryGet()
peerId = PeerId.init(mpeerId.protoAddress().tryGet()).tryGet()
- res.mgetOrPut(peerId, @[]).add(maddress)
- info "Adding priviledged direct peer", peerId, address = maddress
+ l4maddress = parseFullAddress(maddress).get()[1]
+ res.mgetOrPut(peerId, @[]).add(l4maddress)
+ info "Adding priviledged direct peer", peerId, address = maddress, l4address = l4maddress
res
)
pubsub = GossipSub.init(
This worked as well:
diff --git a/libp2p/protocols/pubsub/gossipsub.nim b/libp2p/protocols/pubsub/gossipsub.nim
index 75fc48d9a..b25901ed6 100644
--- a/libp2p/protocols/pubsub/gossipsub.nim
+++ b/libp2p/protocols/pubsub/gossipsub.nim
@@ -575,7 +575,8 @@ proc maintainDirectPeer(g: GossipSub, id: PeerId, addrs: seq[MultiAddress]) {.as
if isNil(peer):
trace "Attempting to dial a direct peer", peer = id
try:
- await g.switch.connect(id, addrs)
+ let connected_id = await g.switch.connect(addrs[0])
+ doAssert connected_id == id, "connected peer id does not match address"
# populate the peer after it's connected
discard g.getOrCreatePeer(id, g.codecs)
except CancelledError as exc:
With either of those changes, I was able to successfully establish the peering session. I could also verify that, if the peer does not match the specified peer-id, the session will be dropped.
I don't know the project well enough (if at all) to know whether this is the right thing to do, so I will let you have a look. But at least this confirms yesterday's thoughts on the cause.
I have been getting good results with that one on my setup over the past few days: the session between my two nodes remained up and stable regardless of other peers' activity, node restarts, etc.
But it would be nice if someone with better knowledge of the project could check whether and why this is actually needed. It feels like direct peers never actually worked, which I find surprising.
# nimbus-eth2 repo
diff --git a/beacon_chain/networking/eth2_network.nim b/beacon_chain/networking/eth2_network.nim
index dc3230b1..ff615dea 100644
--- a/beacon_chain/networking/eth2_network.nim
+++ b/beacon_chain/networking/eth2_network.nim
@@ -1508,11 +1508,12 @@ proc trimConnections(node: Eth2Node, count: int) =
var toKick = count
for peerId in scores.keys:
- debug "kicking peer", peerId, score=scores[peerId]
- asyncSpawn node.getPeer(peerId).disconnect(PeerScoreLow)
- dec toKick
- inc(nbc_cycling_kicked_peers)
- if toKick <= 0: return
+ if peerId notin node.pubsub.parameters.directPeers:
+ debug "kicking peer", peerId, score=scores[peerId]
+ asyncSpawn node.getPeer(peerId).disconnect(PeerScoreLow)
+ dec toKick
+ inc(nbc_cycling_kicked_peers)
+ if toKick <= 0: return
proc getLowSubnets(node: Eth2Node, epoch: Epoch): (AttnetBits, SyncnetBits) =
# Returns the subnets required to have a healthy mesh
@@ -2381,8 +2382,9 @@ proc createEth2Node*(rng: ref HmacDrbgContext,
maddress = MultiAddress.init(s).tryGet()
mpeerId = maddress[multiCodec("p2p")].tryGet()
peerId = PeerId.init(mpeerId.protoAddress().tryGet()).tryGet()
- res.mgetOrPut(peerId, @[]).add(maddress)
- info "Adding priviledged direct peer", peerId, address = maddress
+ l4maddress = parseFullAddress(maddress).get()[1]
+ res.mgetOrPut(peerId, @[]).add(l4maddress)
+ info "Adding priviledged direct peer", peerId, address = l4maddress
res
)
pubsub = GossipSub.init(
# nim-libp2p repo
diff --git a/libp2p/protocols/pubsub/gossipsub.nim b/libp2p/protocols/pubsub/gossipsub.nim
index 75fc48d9a..0f564da76 100644
--- a/libp2p/protocols/pubsub/gossipsub.nim
+++ b/libp2p/protocols/pubsub/gossipsub.nim
@@ -575,7 +575,7 @@ proc maintainDirectPeer(g: GossipSub, id: PeerId, addrs: seq[MultiAddress]) {.as
if isNil(peer):
trace "Attempting to dial a direct peer", peer = id
try:
- await g.switch.connect(id, addrs)
+ await g.switch.connect(id, addrs, forceDial = true)
# populate the peer after it's connected
discard g.getOrCreatePeer(id, g.codecs)
except CancelledError as exc:
cc @Menduist
Your fixes seem reasonable (except the forceDial
— that shouldn't be necessary). Our CI currently doesn't test this feature, and unfortunately it hasn't been tested end to end for a while.
I'll open a PR to fix and try to add to our CI
EDIT: forceDial was also necessary — my bad.
Fixed in #5427, thanks a lot @stephanep!
Describe the bug When starting nimbus_beacon_node with a --direct-peer specification, it fails to connect to that peer at start time and never reattempts. The following event show up in the debug logs: Direct peer error dialing topics="libp2p gossipsub" msg="Unable to establish outgoing link". There is no further attempt made, and the direct peer i'm trying to connect to never shows up in the list of peers of the node.
To Reproduce Steps to reproduce the behavior:
Platform details (OS, architecture):
Branch/commit used: Latest stable version to date.
eth2 specification v1.4.0-beta.0
Nim Compiler Version 1.6.14 [Linux: amd64]
nimbus@v-debian:~$ ./nimbus_beacon_node --direct-peer=/ip4/207.96.125.164/tcp/9000/p2p/16Uiu2HAkwqd8kmmdqLjteD3TyGPz9cVpsZCuW1h8ioBr94FvNqHb --log-level=TRACE > trace.out
DBG 2023-08-08 23:43:24.146+02:00 starting switch for peer topics="libp2p switch" peerInfo="(peerId: \"16Uiu2HAmGJnUgBoucvgCXcXTT3V1VLqTAE2DNva93c9XKXkiAtyo\", listenAddrs: @[\"/ip4/0.0.0.0/tcp/9000\"], addrs: @[], protocols: @[\"/ipfs/id/1.0.0\", \"/meshsub/1.1.0\", \"/eth2/beacon_chain/req/status/1/ssz_snappy\", \"/eth2/beacon_chain/req/ping/1/ssz_snappy\", \"/eth2/beacon_chain/req/metadata/1/ssz_snappy\", \"/eth2/beacon_chain/req/metadata/2/ssz_snappy\", \"/eth2/beacon_chain/req/beacon_blocks_by_range/2/ssz_snappy\", \"/eth2/beacon_chain/req/beacon_blocks_by_root/2/ssz_snappy\", \"/eth2/beacon_chain/req/blob_sidecars_by_root/1/ssz_snappy\", \"/eth2/beacon_chain/req/blob_sidecars_by_range/1/ssz_snappy\", \"/eth2/beacon_chain/req/light_client_bootstrap/1/ssz_snappy\", \"/eth2/beacon_chain/req/light_client_updates_by_range/1/ssz_snappy\", \"/eth2/beacon_chain/req/light_client_finality_update/1/ssz_snappy\", \"/eth2/beacon_chain/req/light_client_optimistic_update/1/ssz_snappy\", \"/eth2/beacon_chain/req/goodbye/1/ssz_snappy\"], protoVersion: \"ipfs/0.1.0\", agentVersion: \"nimbus\")" TRC 2023-08-08 23:43:24.146+02:00 starting transport on addrs topics="libp2p transport" address=@[/ip4/0.0.0.0/tcp/9000] TRC 2023-08-08 23:43:24.146+02:00 Starting TCP transport topics="libp2p tcptransport" TRC 2023-08-08 23:43:24.146+02:00 Listening on topics="libp2p tcptransport" address=/ip4/0.0.0.0/tcp/9000 ... 
TRC 2023-08-08 23:43:24.147+02:00 Attempting to dial a direct peer topics="libp2p gossipsub" peer=16UFvNqHb TRC 2023-08-08 23:43:24.147+02:00 connection not found topics="libp2p connmanager" peerId=16UFvNqHb TRC 2023-08-08 23:43:24.147+02:00 Acquired slot topics="libp2p semaphore" available=158 queue=0 DBG 2023-08-08 23:43:24.147+02:00 Dialing peer topics="libp2p dialer" peerId=ok(16Uiu2HAkwqd8kmmdqLjteD3TyGPz9cVpsZCuW1h8ioBr94FvNqHb) TRC 2023-08-08 23:43:24.147+02:00 Releasing slot topics="libp2p semaphore" available=158 queue=0 TRC 2023-08-08 23:43:24.147+02:00 Released slot topics="libp2p semaphore" available=159 queue=0 DBG 2023-08-08 23:43:24.147+02:00 Direct peer error dialing topics="libp2p gossipsub" msg="Unable to establish outgoing link" DBG 2023-08-08 23:43:24.147+02:00 Started libp2p node topics="libp2p switch" peer="(peerId: \"16Uiu2HAmGJnUgBoucvgCXcXTT3V1VLqTAE2DNva93c9XKXkiAtyo\", listenAddrs: @[\"/ip4/0.0.0.0/tcp/9000\"], addrs: @[\"/ip4/0.0.0.0/tcp/9000\"], protocols: @[\"/ipfs/id/1.0.0\", \"/meshsub/1.1.0\", \"/eth2/beacon_chain/req/status/1/ssz_snappy\", \"/eth2/beacon_chain/req/ping/1/ssz_snappy\", \"/eth2/beacon_chain/req/metadata/1/ssz_snappy\", \"/eth2/beacon_chain/req/metadata/2/ssz_snappy\", \"/eth2/beacon_chain/req/beacon_blocks_by_range/2/ssz_snappy\", \"/eth2/beacon_chain/req/beacon_blocks_by_root/2/ssz_snappy\", \"/eth2/beacon_chain/req/blob_sidecars_by_root/1/ssz_snappy\", \"/eth2/beacon_chain/req/blob_sidecars_by_range/1/ssz_snappy\", \"/eth2/beacon_chain/req/light_client_bootstrap/1/ssz_snappy\", \"/eth2/beacon_chain/req/light_client_updates_by_range/1/ssz_snappy\", \"/eth2/beacon_chain/req/light_client_finality_update/1/ssz_snappy\", \"/eth2/beacon_chain/req/light_client_optimistic_update/1/ssz_snappy\", \"/eth2/beacon_chain/req/goodbye/1/ssz_snappy\"], protoVersion: \"ipfs/0.1.0\", agentVersion: \"nimbus\")"