Integrate selkies-gstreamer to Jupyter and Coder (just like noVNC)

ehfd commented 2 years ago

Update:

We currently have this Coder configuration working self-hosted:

entrypoint.sh for Coder:

set -e
supervisord

Open main.tf

```terraform terraform { required_providers { coder = { source = "coder/coder" version = "~> 0.6.17" } kubernetes = { source = "hashicorp/kubernetes" version = "~> 2.18" } } } provider "coder" { feature_use_managed_variables = true } provider "kubernetes" { config_path = var.use_kubeconfig == true ? "~/.kube/config" : null } data "coder_workspace" "me" {} data "coder_parameter" "cpu" { name = "CPU (cores)" default = "2" icon = "/icon/memory.svg" mutable = true option { name = "2 Cores" value = "2" } option { name = "4 Cores" value = "4" } option { name = "6 Cores" value = "6" } option { name = "8 Cores" value = "8" } option { name = "16 Cores" value = "16" } } data "coder_parameter" "memory" { name = "Memory (GB)" default = "2" icon = "/icon/memory.svg" mutable = true option { name = "2 GB" value = "2" } option { name = "4 GB" value = "4" } option { name = "6 GB" value = "6" } option { name = "8 GB" value = "8" } option { name = "16 GB" value = "16" } option { name = "32 GB" value = "32" } option { name = "64 GB" value = "64" } } data "coder_parameter" "password" { name = "password" display_name = "Selkies Password" description = "The Selkies password for authentication. User is ubuntu." icon = "/emojis/1f511.png" mutable = false option { name = "mypasswd" value = "mypasswd" } } data "coder_parameter" "home_disk_size" { name = "home_disk_size" display_name = "Home disk size" description = "The size of the home disk in GB" default = "100" type = "number" icon = "/emojis/1f4be.png" mutable = false validation { min = 1 max = 99999 } } variable "use_kubeconfig" { type = bool description = "Use host kubeconfig? 
(true/false)" default = false } variable "namespace" { type = string description = "The Kubernetes namespace to create workspaces in (must exist prior to creating workspaces)" default = "coder" } resource "kubernetes_pod" "main" { count = data.coder_workspace.me.start_count metadata { name = "coder-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" namespace = var.namespace labels = { "app.kubernetes.io/name" = "coder-workspace" "app.kubernetes.io/instance" = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" "app.kubernetes.io/part-of" = "coder" // Coder specific labels. "com.coder.resource" = "true" "com.coder.workspace.id" = data.coder_workspace.me.id "com.coder.workspace.name" = data.coder_workspace.me.name "com.coder.user.id" = data.coder_workspace.me.owner_id "com.coder.user.username" = data.coder_workspace.me.owner } annotations = { "com.coder.user.email" = data.coder_workspace.me.owner_email } } spec { container { name = "dev" image = "ghcr.io/selkies-project/nvidia-glx-desktop:latest" image_pull_policy = "Always" command = ["sh", "-c", coder_agent.main.init_script] # security_context { # run_as_user = "1000" # privileged = true # } env { name = "TZ" value = "UTC" } env { name = "DISPLAY_SIZEW" value = "1920" } env { name = "DISPLAY_SIZEH" value = "1080" } env { name = "DISPLAY_REFRESH" value = "60" } env { name = "DISPLAY_DPI" value = "96" } env { name = "DISPLAY_CDEPTH" value = "24" } env { name = "VIDEO_PORT" value = "DFP" } env { name = "PASSWD" value = "${data.coder_parameter.password.value}" } env { name = "SELKIES_ENCODER" value = "nvh264enc" } env { name = "SELKIES_ENABLE_RESIZE" value = "false" } env { name = "SELKIES_VIDEO_BITRATE" value = "8000" } env { name = "SELKIES_FRAMERATE" value = "60" } env { name = "SELKIES_AUDIO_BITRATE" value = "128000" } env { name = "SELKIES_ENABLE_BASIC_AUTH" value = "true" } env { name = "SELKIES_BASIC_AUTH_PASSWORD" value = 
"${data.coder_parameter.password.value}" } env { name = "SELKIES_TURN_REST_URI" value = "http://turn-rest.nrp-nautilus.io" } env { name = "SELKIES_TURN_PROTOCOL" value = "tcp" } env { name = "SELKIES_TURN_TLS" value = "false" } env { name = "CODER_AGENT_TOKEN" value = coder_agent.main.token } stdin = true tty = true port { name = "http" container_port = 8080 protocol = "TCP" } resources { limits = { "cpu" = "${data.coder_parameter.cpu.value}" "memory" = "${data.coder_parameter.memory.value}Gi" "nvidia.com/gpu" = 1 } requests = { "cpu" = "${data.coder_parameter.cpu.value}" "memory" = "${data.coder_parameter.memory.value}Gi" "nvidia.com/gpu" = 1 } } volume_mount { mount_path = "/home/ubuntu/persistent" name = "home" read_only = false sub_path = "home" } volume_mount { mount_path = "/dev/shm" name = "dshm" } } dns_policy = "None" dns_config { nameservers = ["8.8.8.8", "8.8.4.4"] } volume { name = "dshm" empty_dir {} } volume { name = "home" persistent_volume_claim { claim_name = kubernetes_persistent_volume_claim.home.metadata.0.name read_only = false } } affinity { node_affinity { required_during_scheduling_ignored_during_execution { node_selector_term { match_expressions { key = "topology.kubernetes.io/zone" operator = "NotIn" values = ["myzone"] } } } } } } } resource "coder_agent" "main" { os = "linux" arch = "amd64" login_before_ready = false startup_script_timeout = 180 startup_script = <<-EOT set -e # install and start code-server curl -fsSL https://code-server.dev/install.sh | sh -s -- --method=standalone --prefix=/tmp/code-server --version 4.8.3 /tmp/code-server/bin/code-server --auth none --port 13337 >/tmp/code-server.log 2>&1 & echo "Initializing Supervisor..." 
nohup supervisord EOT } resource "coder_app" "code-server" { agent_id = coder_agent.main.id slug = "code-server" display_name = "code-server" icon = "/icon/code.svg" url = "http://localhost:13337?folder=/home/coder" subdomain = false share = "owner" healthcheck { url = "http://localhost:13337/healthz" interval = 3 threshold = 10 } } resource "kubernetes_persistent_volume_claim" "home" { metadata { name = "coder-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}-home" namespace = var.namespace labels = { "app.kubernetes.io/name" = "coder-pvc" "app.kubernetes.io/instance" = "coder-pvc-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" "app.kubernetes.io/part-of" = "coder" // Coder specific labels. "com.coder.resource" = "true" "com.coder.workspace.id" = data.coder_workspace.me.id "com.coder.workspace.name" = data.coder_workspace.me.name "com.coder.user.id" = data.coder_workspace.me.owner_id "com.coder.user.username" = data.coder_workspace.me.owner } annotations = { "com.coder.user.email" = data.coder_workspace.me.owner_email } } wait_until_bound = false spec { access_modes = ["ReadWriteOnce"] storage_class_name = "rook-ceph-block" resources { requests = { storage = "${data.coder_parameter.home_disk_size.value}Gi" } } } } resource "coder_app" "selkies" { agent_id = coder_agent.main.id slug = "selkies" display_name = "Selkies" icon = "/emojis/1f3ae.png" url = "http://localhost:8080" subdomain = true share = "owner" } ```

Self-explanatory. Just like VS Code Server and noVNC, a button click in Jupyter should lead to a window with Selkies. This will help greatly in robotics, simulations, and other kinds of research.

External contribution dearly expected.

It can be a separate project, a PR, or any other form of contribution. Integration with Jupyter Docker containers should also be possible.

Creating a template for Coder would also be of interest.

Inrixia commented 4 months ago

Messing around trying to get this working with coder.

Can access the web endpoint fine but webrtc fails.

When using coder port-forward to open the needed ports for a direct connection, WebRTC also fails, as the signaling handshake specifies the private host IP of the container, which is not reachable from the client (e.g. 10.x.x.x instead of 127.0.0.1).

Ideally there would be a way to manually specify the ports/ip used to get around this, though I'm not sure how tenable that is.

I haven't tested using a turn server because if I have to fall back to that I'll just end up going with something like KasmVNC which works seamlessly over a single port.

Inrixia commented 4 months ago

This looks super promising. Will check it out tomorrow, thanks!

Inrixia commented 4 months ago

Hey, have messed around a bit with this. Had to switch to using the egl container as I am not using a gpu for testing.

While this boots and works fine, it still requires a TURN server, which does not work due to firewall restrictions.

Plus I cannot justify needing to run a separate TURN server.

How difficult would it be to pin the needed ports so that a direct connection could be established using coder port-forward?

ehfd commented 4 months ago

https://github.com/selkies-project/docker-nvidia-egl-desktop#running-with-kubernetes

Maybe the internal TURN server option could work here.

Inrixia commented 4 months ago

That might work since I can limit it to only two ports. Will give it a try and see!

Though actually may run into the issue earlier where the internal ip of the pod doesn't match the forwarded (127.0.0.1). Will update how it goes

Inrixia commented 4 months ago

Yeah, so when doing ICE negotiation it is advertising the internal pod IPs (10.x.x.x), so ICE still fails as that address is not reachable. It has to advertise something like 127.0.0.1 or localhost so that the client attempts to connect via the port forward.

I ran into this issue when originally testing as mentioned above.

Having an option to override what IP is advertised would probably fix this.

Inrixia commented 4 months ago

Actually I don't seem to be able to get it to use the internal turn server.

Checking the logs its still advertising defaults

Inrixia commented 4 months ago

Wait I'm being dumb. Was missing the turn host env var, it's further down the docs and didn't notice it.

Inrixia commented 4 months ago

Hmm, now I'm getting proper advertising, but even though the internal server advertises 65535 as an option, it is still listening on the old ports and not 65535.

ehfd commented 4 months ago

@Inrixia What's your precise settings?

Inrixia commented 4 months ago

Same as the template you gave.

But only turn settings are min port, max port, turn port, turn host, turn protocol

Inrixia commented 4 months ago

The turn port is listened on.

But the min and max ports (65534-65535) are not. So when they are advertised they are unreachable, AFAIK.

ehfd commented 4 months ago

Very strange. You specified TURN_MIN_PORT and TURN_MAX_PORT?

Inrixia commented 4 months ago

Yep. If you have the chance can you share a working config? Maybe I'm missing something obvious.

This is running on a normal coder deployment on aks and I can see what ports the container is listening on.

I'm using the coder cli to port forward and it works fine. Just need to get the ice negotiation to use the right ports.

Setting the TURN host IP also fixed the issue with it not using 127.0.0.1, so AFAIK it's just the underlying server not listening on the port range that's the issue.

ehfd commented 4 months ago

If you do not specify anything on SELKIES_TURN_HOST, it will automatically resolve to the external IP of that instance.

Let me check something quickly now about the port range.

Inrixia commented 4 months ago

Yep, the IP issues were resolved by specifying the host arg. It's just the port-listening issue now, AFAIK.

ehfd commented 4 months ago

@Inrixia In top or htop, could you get the PID of any of the turnserver processes and run cat /proc/${PID}/cmdline | tr '\000' ' ' for that ${PID}?

ehfd commented 4 months ago

I need to see the input for:

# Configure coTURN script
RUN echo "#!/bin/bash\n\
set -e\n\
turnserver \
    --verbose \
    --listening-ip=\"0.0.0.0\" \
    --listening-ip=\"::\" \
    --listening-port=\"\${SELKIES_TURN_PORT:-3478}\" \
    --realm=\"\${TURN_REALM:-example.com}\" \
    --external-ip=\"\${TURN_EXTERNAL_IP:-\$(dig TXT +short @ns1.google.com o-o.myaddr.l.google.com 2>/dev/null | { read output; if [ -z \"\$output\" ] || echo \"\$output\" | grep -q '^;;'; then exit 1; else echo \"\$(echo \$output | sed 's,\\\",,g')\"; fi } || dig -6 TXT +short @ns1.google.com o-o.myaddr.l.google.com 2>/dev/null | { read output; if [ -z \"\$output\" ] || echo \"\$output\" | grep -q '^;;'; then exit 1; else echo \"\$(echo \$output | sed 's,\\\",,g')\"; fi } || hostname -I 2>/dev/null | awk '{print \$1; exit}' || echo '127.0.0.1')}\" \
    --min-port=\"\${TURN_MIN_PORT:-49152}\" \
    --max-port=\"\${TURN_MAX_PORT:-65535}\" \
    --channel-lifetime=\"\${TURN_CHANNEL_LIFETIME:--1}\" \
    --lt-cred-mech \
    --user \"selkies:\${TURN_RANDOM_PASSWORD}\" \
    --no-cli \
    --cli-password=\"\${TURN_RANDOM_PASSWORD:-\$(tr -dc 'A-Za-z0-9' < /dev/urandom 2>/dev/null | head -c 24)}\" \
    --allow-loopback-peers \
    \${TURN_EXTRA_ARGS} \$@\
" > /etc/start-turnserver.sh && chmod -f 755 /etc/start-turnserver.sh

Inrixia commented 4 months ago

turnserver --verbose --listening-ip=0.0.0.0 --listening-ip=:: --listening-port=3478 --realm=example.com --external-ip=127.0.0.1 --min-port=65534 --max-port=65535 --channel-lifetime=-1 --lt-cred-mech --user selkies:REDACTED --no-cli --cli-password=REDACTED --allow-loopback-peers

ehfd commented 4 months ago

External IP SHOULD NOT be 127.0.0.1. It should be your real IP that clients should know (public IP if over the internet, private IP if confined to LAN).

Otherwise, I don't see why opening 65534 and 65535 in the Coder configuration shouldn't work.

Inrixia commented 4 months ago

I'm accessing the coder service via coder port-forward <workspace> ...ports... which lets me hit the webui for example via https://127.0.0.1:8081

So that should be fine. I have ports 8081, 3478 & 65534-65535 forwarded on tcp/udp. Coder reports ports that it sees as open and 65534-65535 is not listed but 8081, 3478 and other ports are. When attempting to connect other ports like 58xxx are also opened so clearly the server is trying to negotiate but using the wrong ports.

ehfd commented 4 months ago

Port should be 8080. It should go through NGINX because the interfaces themselves only allow loopback access for security.

Inrixia commented 4 months ago

Tried 8080 same thing. Though I did notice that the advertisements do still have the inaccessible internal ip in them.

candidate:9 1 UDP 337658111 127.0.0.1 65534 typ relay raddr 10.224.1.158 rport 9

10.224.1.158 is a pod ip that is not accessible.

ehfd commented 4 months ago

Then, you should change 10.224.1.158 to an accessible public IP in SELKIES_TURN_HOST.

ehfd commented 4 months ago

Since you are using Azure, specifying the external IP is required.

Inrixia commented 4 months ago

It doesn't have an external IP. And I have set SELKIES_TURN_HOST to 127.0.0.1 as mentioned; you can see above that 127.0.0.1 is part of the advertisement, but the internal IP is still being used for some of it.

ehfd commented 4 months ago

When you set SELKIES_TURN_HOST to 127.0.0.1, this means that the client is the same as the server. Any other client will not know where the (internal) TURN server will be.

I'm afraid that if this is indeed the case, an external TURN server is needed.

Inrixia commented 4 months ago

Isn't SELKIES_TURN_HOST just the IP that the local TURN server advertises to clients? So since I am forwarding the internal TURN server port to 127.0.0.1, it should work fine?

ehfd commented 4 months ago

SELKIES_TURN_HOST is also the TURN server hostname that the client looks to connect to (and TURN_EXTERNAL_IP is a hostname-IP conversion of that by default). Without it set to the correct hostname or IP, the client can't find the TURN server.

So if the container node is located in myazurenode.msazure.com, the client looks for turn:myazurenode.msazure.com:3478

Inrixia commented 4 months ago

Then 127 should work fine, as the client can connect to 127.0.0.1:3478 as it has been forwarded using coder port-forward

ehfd commented 4 months ago

> Then 127 should work fine, as the client can connect to 127.0.0.1:3478 as it has been forwarded using coder port-forward

I'm not sure WebRTC TURN works this way. Can't guarantee that this will work. Try forwarding the TURN_MIN_PORT to TURN_MAX_PORT port combinations as well then.

Inrixia commented 4 months ago

Already am. From the client's perspective the server is accessible on 127.0.0.1 on ports 8080, 3478, 65534 and 65535. I can view the web UI via https://127.0.0.1:8080, but it fails to connect.

ehfd commented 4 months ago

Second, are you using UDP TURN? Does Coder do UDP port forwarding?

Inrixia commented 4 months ago

Yep, forwarding both udp/tcp to be sure but it is set to use udp

ehfd commented 4 months ago

Try changing it to TCP first. I heard Coder doesn't play well with UDP.

Inrixia commented 4 months ago

No change. There are some advertisements for TCP, but they aren't for the TURN server port (65535), so it's just the usual negotiation.

The weird thing is the turn server announcement looks correct, but according to coder the port 65535 is never actually opened

ehfd commented 4 months ago

Try SELKIES_TURN_PROTOCOL to tcp?

> The weird thing is the turn server announcement looks correct, but according to coder the port 65535 is never actually opened

This is because it's only opened when the WebRTC connection is established. It doesn't open before that.

ehfd commented 4 months ago

If none of these work, I don't see a choice but to use an external TURN server. It could be that port-forwarding is not something that TURN works with.

Inrixia commented 4 months ago

> Try SELKIES_TURN_PROTOCOL to tcp?

Yep tried that

> It could be that port-forwarding is not something that TURN works with.

I really doubt it since if the ports are open it should work. But if they are being dynamically created which looks to be the case then yes.

Though the existing turn server should stop that from happening. Maybe I could try a custom stun server that lets me force specific ports to be used but ehh that may not work either.

Let me know if you have any other ideas if not then no problem

ehfd commented 4 months ago

> I really doubt it since if the ports are open it should work. But if they are being dynamically created which looks to be the case then yes.

Port forwarding is a very special case here for WebRTC. It wasn't designed for port forwarding.

Perhaps, change the TURN server script to --listening-ip=127.0.0.1, then.

ehfd commented 4 months ago

Try setting TURN_EXTRA_ARGS to --listening-ip="127.0.0.1".

Inrixia commented 4 months ago

I'll give it a try tomorrow. Though AFAIK the TURN server is listening and accessible fine; it's just that the client isn't reaching out to it (I can see the port being accessible).

ehfd commented 4 months ago

Thanks for your patience.

Inrixia commented 4 months ago

Oh, no issue. Thank you for taking the time to try to work through this with me.

Will update when I get the chance tomorrow.

Inrixia commented 4 months ago

As expected, the TURN extra args didn't help.

Inrixia commented 4 months ago

I was able to open a large port range and retry connecting until I eventually got two ports used that were in range and it still didn't work so I'm thinking the issue may be more complex than just port access.

Honestly at this point I'm unsure why it's actually failing to connect.

Inrixia commented 4 months ago

I suspect that it's not trying to use the turn server at all, the ice server is still returning the internal ip so failure to connect on that may be causing it to prematurely fail instead of falling back on turn.

Ideally there would be an option to just specify the websocket ports and host IP used instead of trying to auto-negotiate.

ehfd commented 4 months ago

There's a force relay option at the bottom of the web interface dashboard. Otherwise, using the external TURN server is the most stable way.

Inrixia commented 4 months ago

That's perfect! Thank you, by selecting that the client is now correctly using the internal turn server on 127.0.0.1 and a connection is established!

Though I am getting another issue now.

Everything looks fine, logs say audio and video stream is received but the client is just stuck on displaying "waiting for stream."

Inrixia commented 4 months ago

Though despite the logs saying it has received the stream, the stats above it still say connecting and peer connection type unknown.

selkies-project / selkies-gstreamer

Integrate selkies-gstreamer to Jupyter and Coder (just like noVNC) #64