gluxon / wireguard-uapi-rs

MIT License
32 stars 10 forks source link

No ack received #28

Open xtexChooser opened 1 year ago

xtexChooser commented 1 year ago
fn connect_route_socket() -> Result<RouteSocket> {
        Ok(RouteSocket::connect().context("connect to WG route socket")?)
    }

Self::connect_route_socket()?
            .add_device("test_test")
            .context("add new WG device")?;

The device has been added successfully,

9: test_test: <POINTOPOINT> mtu 1420 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/none 

When calling set_device without handling that error, the device could not get updated.

Here is a strace:

socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE) = 3
bind(3, {sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, 12) = 0
sendto(3, [{nlmsg_len=68, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1, nlmsg_pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_
index=0, ifi_flags=0, ifi_change=0xffffffff}, [[{nla_len=13, nla_type=IFLA_IFNAME}, "test_test"...], [{nla_len=20, nla_type=IFLA_LINKINFO}, [{nla_len=13, nla_type=IFLA_INFO_KIND}, "wireguard"...]]]], 68, 0, NULL, 0) = 68                                                                                                                                                              recvfrom(3, [{nlmsg_len=36, nlmsg_type=NLMSG_ERROR, nlmsg_flags=NLM_F_CAPPED, nlmsg_seq=1, nlmsg_pid=5347}, {error=0, msg={nlmsg_len=68, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NL
M_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1, nlmsg_pid=0}}], 32768, 0, NULL, NULL) = 36                                                                                                     close(3)                                = 0
socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC) = 3
sendto(3, [{nlmsg_len=36, nlmsg_type=0x10 /* NLMSG_??? */, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK, nlmsg_seq=1, nlmsg_pid=0}, "\x03\x02\x00\x00\x0e\x00\x02\x00\x77\x69\x72\x65\x67\x75\x61\x72\
x64\x00\x00\x00"], 36, 0, NULL, 0) = 36                                                                                                                                                      recvfrom(3, [{nlmsg_len=112, nlmsg_type=nlctrl, nlmsg_flags=0, nlmsg_seq=1, nlmsg_pid=5347}, "\x01\x02\x00\x00\x0e\x00\x02\x00\x77\x69\x72\x65\x67\x75\x61\x72\x64\x00\x00\x00\x06\x00\x01\x0
0\x23\x00\x00\x00\x08\x00\x03\x00"...], 32768, 0, NULL, NULL) = 112                                                                                                                          recvfrom(3, [{nlmsg_len=36, nlmsg_type=NLMSG_ERROR, nlmsg_flags=NLM_F_CAPPED, nlmsg_seq=1, nlmsg_pid=5347}, {error=0, msg={nlmsg_len=36, nlmsg_type=nlctrl, nlmsg_flags=NLM_F_REQUEST|NLM_F_A
CK, nlmsg_seq=1, nlmsg_pid=0}}], 32768, 0, NULL, NULL) = 36                                                                                                                                  close(3)                                = 0
socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC) = 3
bind(3, {sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, 12) = 0
sendto(3, [{nlmsg_len=248, nlmsg_type=wireguard, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK, nlmsg_seq=1, nlmsg_pid=0}, "\x01\x01\x00\x00\x09\x00\x02\x00\x74\x65\x73\x74\x00\x00\x00\x00\x08\x00\x0
5\x00\x01\x00\x00\x00\x24\x00\x03\x00\x50\x52\x53\xa9"...], 248, 0, NULL, 0) = 248                                                                                                           recvfrom(3, [{nlmsg_len=268, nlmsg_type=NLMSG_ERROR, nlmsg_flags=0, nlmsg_seq=1, nlmsg_pid=5347}, {error=-ENODEV, msg=[{nlmsg_len=248, nlmsg_type=wireguard, nlmsg_flags=NLM_F_REQUEST|NLM_F_
ACK, nlmsg_seq=1, nlmsg_pid=0}, "\x01\x01\x00\x00\x09\x00\x02\x00\x74\x65\x73\x74\x00\x00\x00\x00\x08\x00\x05\x00\x01\x00\x00\x00\x24\x00\x03\x00\x00\x00\x00\x00"...]}], 32768, 0, NULL, NULL) = 268                                                                                                                                                                                     close(3)                                = 0

Tested on openSUSE Tumbleweed with kernel 5.19.7-1-default.

xtexChooser commented 1 year ago

Confirmed on Ubuntu 20.04.4 LTS x86_64 with kernel 5.4.0-1089-azure(on GitHub Codespaces).


test result: ok. 6 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s

     Running tests/macos.rs (target/debug/deps/macos-77cc86e26f4d21df)

running 0 tests

test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s

     Running tests/set_and_get.rs (target/debug/deps/set_and_get-0a81b834f134c7a9)

running 4 tests
Error: No ack received
Error: No ack received
Error: No ack received
test large_peer ... FAILED
test set_ifname_has_proper_padding ... FAILED
test peer_update_only ... FAILED
Error: No ack received
test simple ... FAILED

failures:
xtexChooser commented 1 year ago

It seems to be because of CAP_NET_ADMIN. However, when I try to add it on my local computer, it does not work.

xtexChooser commented 1 year ago
➜  peerd git:(main) ✗ find ./target/debug/deps -maxdepth 1 -type f -executable | xargs -n 1 sudo setcap CAP_NET_ADMIN=+eip
➜  peerd git:(main) ✗ cargo r apply
    Finished dev [unoptimized + debuginfo] target(s) in 0.42s
     Running `target/debug/peerd apply`
Applying configs
Reloading peer configs
Reading config from peerd.toml
thread 'main' panicked at 'called `Result::unwrap()` on an `Err` value: add new WG device

Caused by:
    No ack received', src/peer_conf/mod.rs:48:62
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
➜  peerd git:(main) ✗ sudo getcap target/debug/peerd
target/debug/peerd cap_net_admin=eip
gluxon commented 1 year ago

Hey @xtexChooser, what happens on src/peer_conf/mod.rs:48:62? Is that line adding a new device?

xtexChooser commented 1 year ago

Hey @gluxon At src/peer_conf/mod.rs, it calls the abstract tunnel manager to add the tunnel for a test.

The implementation can be found at https://github.com/xtexChooser/peerd/blob/main/src/tunnel/wireguard/linux.rs.

Thanks.

gluxon commented 1 year ago

Mismatched Paths

It looks like this command applies CAP_NET_ADMIN to all files under ./target/debug/deps

➜  peerd git:(main) ✗ find ./target/debug/deps -maxdepth 1 -type f -executable | xargs -n 1 sudo setcap CAP_NET_ADMIN=+eip

But the binary that ran is ./target/debug/peerd, which isn't under ./target/debug/deps.

➜  peerd git:(main) ✗ cargo r apply
    Finished dev [unoptimized + debuginfo] target(s) in 0.42s
     Running `target/debug/peerd apply`

The find ./target/debug/deps -maxdepth 1 -type f -executable | xargs -n 1 sudo setcap CAP_NET_ADMIN=+eip line only applies to cargo test binaries and not main binaries in the crate.

getcap output syntax

I did see that your test shows:

➜  peerd git:(main) ✗ sudo getcap target/debug/peerd
target/debug/peerd cap_net_admin=eip

That was helpful. Thanks for posting that. I'm a bit confused because I'm not sure what =eip means though. On my Ubuntu 20.04 VM, I see no output for a file that doesn't have any capabilities set yet.

❯ sudo getcap target/debug/examples/wg

❯

When I apply capabilities, I see +eip.

❯ sudo setcap CAP_NET_ADMIN=+eip ./target/debug/examples/wg

❯ sudo getcap target/debug/examples/wg
target/debug/examples/wg = cap_net_admin+eip
xtexChooser commented 1 year ago

About the mismatched path, I have added the cap to the target file.

And, about the output syntax, idk

xtex% sudo setcap CAP_NET_ADMIN=+eip target/debug/peerd
xtex% sudo getcap target/debug/peerd
target/debug/peerd cap_net_admin=eip

If I give the cap, the interface can be added successfully, but I still get No ack received. If I do not, the interface cannot be added, and I also get No ack received.

xtexChooser commented 1 year ago
➜  Source git clone --depth 1 git@github.com:gluxon/wireguard-uapi-rs.git 
正克隆到 'wireguard-uapi-rs'...
Enter passphrase for key '/home/xtex/.ssh/id_ed25519': 
remote: Enumerating objects: 61, done.
remote: Counting objects: 100% (61/61), done.
remote: Compressing objects: 100% (59/59), done.
remote: Total 61 (delta 3), reused 26 (delta 0), pack-reused 0
接收对象中: 100% (61/61), 41.55 KiB | 184.00 KiB/s, 完成.
处理 delta 中: 100% (3/3), 完成.
➜  Source ls
wireguard-uapi-rs
➜  Source cd wireguard-uapi-rs 
➜  wireguard-uapi-rs git:(main) ls
Cargo.toml  examples  LICENSE  README.md  src  tests
➜  wireguard-uapi-rs git:(main) cargo build
    Updating `tuna` index
  Downloaded thiserror-impl v1.0.34 (registry `tuna`)
  Downloaded thiserror v1.0.34 (registry `tuna`)
  Downloaded 2 crates (32.8 KB) in 0.62s
.................
   Compiling wireguard-uapi v2.0.5 (/mnt/src/Source/wireguard-uapi-rs)
    Finished dev [unoptimized + debuginfo] target(s) in 15.90s
➜  wireguard-uapi-rs git:(main) find ./target/debug/deps -maxdepth 1 -type f -executable | xargs -n 1 sudo setcap CAP_NET_ADMIN=+eip
➜  wireguard-uapi-rs git:(main) cargo test
  Downloaded difflib v0.4.0 (registry `tuna`)
........
  Downloaded 14 crates (464.7 KB) in 0.72s
   Compiling autocfg v1.1.0
.......
   Compiling wireguard-uapi v2.0.5 (/mnt/src/Source/wireguard-uapi-rs)
    Finished test [unoptimized + debuginfo] target(s) in 7.10s
     Running unittests src/lib.rs (target/debug/deps/wireguard_uapi-d837ef9f66bfa16f)

running 6 tests
test get::tests::parse_allowed_ip_ipv4 ... ok
test get::tests::parse_allowed_ip_ipv6 ... ok
test get::tests::parse_invalid_allowed_ip ... ok
test linux::socket::parse::tests::parse_device_example_from_man_page ... ok
test linux::socket::parse::tests::parse_device_example_from_man_page_pre_five_point_two_kernel ... ok
test linux::socket::parse::tests::parse_device_with_large_peer ... ok

test result: ok. 6 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s

     Running tests/macos.rs (target/debug/deps/macos-77cc86e26f4d21df)

running 0 tests

test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s

     Running tests/set_and_get.rs (target/debug/deps/set_and_get-0a81b834f134c7a9)

running 4 tests
Error: No ack received
Error: No ack received
Error: No ack received
test set_ifname_has_proper_padding ... FAILED
test peer_update_only ... FAILED
test simple ... FAILED
Error: No ack received
test large_peer ... FAILED

failures:

---- set_ifname_has_proper_padding stdout ----
thread 'set_ifname_has_proper_padding' panicked at 'assertion failed: `(left == right)`
  left: `1`,
 right: `0`: the test returned a termination value with a non-zero status code (1) which indicates a failure', /rustc/1120c5e01df508de64fe6642f22fadeb574afd6d/library/test/src/lib.rs:184:5

---- peer_update_only stdout ----
thread 'peer_update_only' panicked at 'assertion failed: `(left == right)`
  left: `1`,
 right: `0`: the test returned a termination value with a non-zero status code (1) which indicates a failure', /rustc/1120c5e01df508de64fe6642f22fadeb574afd6d/library/test/src/lib.rs:184:5
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace

---- simple stdout ----
wgtest47353
thread 'simple' panicked at 'assertion failed: `(left == right)`
  left: `1`,
 right: `0`: the test returned a termination value with a non-zero status code (1) which indicates a failure', /rustc/1120c5e01df508de64fe6642f22fadeb574afd6d/library/test/src/lib.rs:184:5

---- large_peer stdout ----
thread 'large_peer' panicked at 'assertion failed: `(left == right)`
  left: `1`,
 right: `0`: the test returned a termination value with a non-zero status code (1) which indicates a failure', /rustc/1120c5e01df508de64fe6642f22fadeb574afd6d/library/test/src/lib.rs:184:5

failures:
    large_peer
    peer_update_only
    set_ifname_has_proper_padding
    simple

test result: FAILED. 0 passed; 4 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s

error: test failed, to rerun pass `--test set_and_get`

And the tests also fail on my computer.

gluxon commented 1 year ago

It looks like the test binaries haven't been built by the time setcap is called in the output above. Could I have you try rerunning the find ./target/debug/deps -maxdepth 1 -type f -executable | xargs -n 1 sudo setcap CAP_NET_ADMIN=+eip command again and give cargo test another shot? Thanks.

Running the tests requires a bit of non-intuitive sequencing. In the output above, cargo test created the ./target/debug/deps binaries before running the tests since it was the first invocation of cargo test.

If the tests do run after that, I think you're setting the CAP_NET_ADMIN capability correctly for your target/debug/peerd file. If that's the case, it's definitely possible we have a bug somewhere in this library causing No ack received errors. Figuring out if the tests run correctly would be helpful to narrow that down. Thanks for your help.

xtexChooser commented 1 year ago

Oops, the test works.

gluxon commented 1 year ago

Apologies that it took me a bit to understand your bug report. I see that we get the ENODEV error.

5\x00\x01\x00\x00\x00\x24\x00\x03\x00\x50\x52\x53\xa9"...], 248, 0, NULL, 0) = 248                                                                                                           recvfrom(3, [{nlmsg_len=268, nlmsg_type=NLMSG_ERROR, nlmsg_flags=0, nlmsg_seq=1, nlmsg_pid=5347}, {error=-ENODEV, msg=[{nlmsg_len=248, nlmsg_type=wireguard, nlmsg_flags=NLM_F_REQUEST|NLM_F_

And the ACK after that.

ACK, nlmsg_seq=1, nlmsg_pid=0}, "\x01\x01\x00\x00\x09\x00\x02\x00\x74\x65\x73\x74\x00\x00\x00\x00\x08\x00\x05\x00\x01\x00\x00\x00\x24\x00\x03\x00\x00\x00\x00\x00"...]}], 32768, 0, NULL, NULL) = 268                                                                                                                                                                                     close(3)                                = 0

But the error reported is No ack received instead of ENODEV. So the error messaging can be much clearer here.

gluxon commented 1 year ago

@xtexChooser Does this block you from your work? I think improving the error message requires upgrading neli, which is something in progress.

xtexChooser commented 1 year ago

Not in a hurry, I have many other things I can do and it's not a big feature either, so I can just wait for the neli update.

xtexChooser commented 1 year ago

And I found something strange: the tests fail after a patch:

diff --git a/tests/set_and_get.rs b/tests/set_and_get.rs
index a2dd937..318a32e 100644
--- a/tests/set_and_get.rs
+++ b/tests/set_and_get.rs
@@ -7,7 +7,7 @@ use {

 #[cfg(target_os = "linux")]
 fn get_random_ifname() -> String {
-    format!("wgtest{}", rand::random::<u16>())
+    format!("test_test")
 }

 #[cfg(target_os = "linux")]

When I keep the random number suffix, it works. But it does not work without the random number.

➜  wireguard-uapi-rs git:(main) ✗ cargo test
   Compiling wireguard-uapi v2.0.5 (/mnt/src/Source/wireguard-uapi-rs)
    Finished test [unoptimized + debuginfo] target(s) in 0.94s
     Running unittests src/lib.rs (target/debug/deps/wireguard_uapi-d837ef9f66bfa16f)

running 6 tests
.......No ack fails......
error: test failed, to rerun pass `--test set_and_get`
➜  wireguard-uapi-rs git:(main) ✗ find ./target/debug/deps -maxdepth 1 -type f -executable | xargs -n 1 sudo setcap CAP_NET_ADMIN=+eip 
➜  wireguard-uapi-rs git:(main) ✗ cargo test
    Finished test [unoptimized + debuginfo] target(s) in 0.01s
     Running unittests src/lib.rs (target/debug/deps/wireguard_uapi-d837ef9f66bfa16f)

running 6 tests
test get::tests::parse_allowed_ip_ipv4 ... ok
test get::tests::parse_allowed_ip_ipv6 ... ok
test get::tests::parse_invalid_allowed_ip ... ok
test linux::socket::parse::tests::parse_device_example_from_man_page_pre_five_point_two_kernel ... ok
test linux::socket::parse::tests::parse_device_example_from_man_page ... ok
test linux::socket::parse::tests::parse_device_with_large_peer ... ok

test result: ok. 6 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s

     Running tests/macos.rs (target/debug/deps/macos-77cc86e26f4d21df)

running 0 tests

test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s

     Running tests/set_and_get.rs (target/debug/deps/set_and_get-0a81b834f134c7a9)

running 4 tests
Error: No ack received
Error: No ack received
test set_ifname_has_proper_padding ... FAILED
test simple ... FAILED
Error: No ack received
test large_peer ... FAILED
test peer_update_only ... ok

failures:

---- set_ifname_has_proper_padding stdout ----
thread 'set_ifname_has_proper_padding' panicked at 'assertion failed: `(left == right)`
  left: `1`,
 right: `0`: the test returned a termination value with a non-zero status code (1) which indicates a failure', /rustc/1120c5e01df508de64fe6642f22fadeb574afd6d/library/test/src/lib.rs:184:5

---- simple stdout ----
test_test
thread 'simple' panicked at 'assertion failed: `(left == right)`
  left: `1`,
 right: `0`: the test returned a termination value with a non-zero status code (1) which indicates a failure', /rustc/1120c5e01df508de64fe6642f22fadeb574afd6d/library/test/src/lib.rs:184:5
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace

---- large_peer stdout ----
thread 'large_peer' panicked at 'assertion failed: `(left == right)`
  left: `1`,
 right: `0`: the test returned a termination value with a non-zero status code (1) which indicates a failure', /rustc/1120c5e01df508de64fe6642f22fadeb574afd6d/library/test/src/lib.rs:184:5

failures:
    large_peer
    set_ifname_has_proper_padding
    simple

test result: FAILED. 1 passed; 3 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.12s

error: test failed, to rerun pass `--test set_and_get`
gluxon commented 1 year ago

The test setup creates and deletes WireGuard devices through NETLINK_ROUTE. As you've noticed, set_device fails with "No ack received" when the device doesn't exist yet, which happens as tests running in parallel create/delete the WireGuard device they're testing against.

xtexChooser commented 1 year ago

oh

gluxon commented 1 year ago

Yeah, the error is very confusing. 😞 Adding a test to make sure this gets revisited. https://github.com/gluxon/wireguard-uapi-rs/commit/33d2db3d499977c7d9868ff35a2c367f15d6b619