swanchain / go-computing-provider

A golang implementation of computing provider
MIT License
11 stars 15 forks source link

ubi-bench: error while loading shared libraries: libcuda.so.1: cannot open shared object file: No such file or directory #55

Closed ThomasBlock closed 2 months ago

ThomasBlock commented 2 months ago

I started with a fresh Ubuntu 22.04 install and want to create an ECP. @Normalnoise

curl -fsSL https://raw.githubusercontent.com/swanchain/go-computing-provider/releases/ubi/setup.sh | bash
wget https://github.com/swanchain/go-computing-provider/releases/download/v0.4.6/computing-provider

The provider is running fine, but UBI does not work:

./computing-provider ubi list --show-failed
TASK ID TASK TYPE   ZK TYPE     TRANSACTION HASH    STATUS  REWARD  CREATE TIME         
114129  GPU         fil-c2-512M                     failed  0.0     2024-04-28 13:41:52 
114139  GPU         fil-c2-512M                     failed  0.0     2024-04-28 14:11:53
docker logs 649d748ca4ca
ubi-bench: error while loading shared libraries: libcuda.so.1: cannot open shared object file: No such file or directory
docker inspect 649d748ca4ca
[
    {
        "Id": "649d748ca4ca93efc90940c0f6417e6e9927b9f5e981fbbe070a68d90ea82a16",
        "Created": "2024-04-28T14:11:54.517699953Z",
        "Path": "ubi-bench",
        "Args": [
            "c2"
        ],
        "State": {
            "Status": "exited",
            "Running": false,
            "Paused": false,
            "Restarting": false,
            "OOMKilled": false,
            "Dead": false,
            "Pid": 0,
            "ExitCode": 127,
            "Error": "",
            "StartedAt": "2024-04-28T14:11:54.939584311Z",
            "FinishedAt": "2024-04-28T14:11:54.941096842Z"
        },
        "Image": "sha256:619d22d832c4c720abd06f5f78d3d5776c46d50748bf42adc24954056b02e5a3",
        "ResolvConfPath": "/var/lib/docker/containers/649d748ca4ca93efc90940c0f6417e6e9927b9f5e981fbbe070a68d90ea82a16/resolv.conf",
        "HostnamePath": "/var/lib/docker/containers/649d748ca4ca93efc90940c0f6417e6e9927b9f5e981fbbe070a68d90ea82a16/hostname",
        "HostsPath": "/var/lib/docker/containers/649d748ca4ca93efc90940c0f6417e6e9927b9f5e981fbbe070a68d90ea82a16/hosts",
        "LogPath": "/var/lib/docker/containers/649d748ca4ca93efc90940c0f6417e6e9927b9f5e981fbbe070a68d90ea82a16/649d748ca4ca93efc90940c0f6417e6e9927b9f5e981fbbe070a68d90ea82a16-json.log",
        "Name": "/fil-c2-512m-11413959042",
        "RestartCount": 0,
        "Driver": "overlay2",
        "Platform": "linux",
        "MountLabel": "",
        "ProcessLabel": "",
        "AppArmorProfile": "docker-default",
        "ExecIDs": null,
        "HostConfig": {
            "Binds": [
                "/home/user/param:/var/tmp/filecoin-proof-parameters"
            ],
            "ContainerIDFile": "",
            "LogConfig": {
                "Type": "json-file",
                "Config": {}
            },
            "NetworkMode": "bridge",
            "PortBindings": null,
            "RestartPolicy": {
                "Name": "no",
                "MaximumRetryCount": 0
            },
            "AutoRemove": false,
            "VolumeDriver": "",
            "VolumesFrom": null,
            "ConsoleSize": [
                0,
                0
            ],
            "CapAdd": null,
            "CapDrop": null,
            "CgroupnsMode": "private",
            "Dns": null,
            "DnsOptions": null,
            "DnsSearch": null,
            "ExtraHosts": null,
            "GroupAdd": null,
            "IpcMode": "private",
            "Cgroup": "",
            "Links": null,
            "OomScoreAdj": 0,
            "PidMode": "",
            "Privileged": false,
            "PublishAllPorts": false,
            "ReadonlyRootfs": false,
            "SecurityOpt": null,
            "UTSMode": "",
            "UsernsMode": "",
            "ShmSize": 67108864,
            "Runtime": "nvidia",
            "Isolation": "",
            "CpuShares": 0,
            "Memory": 5368709120,
            "NanoCpus": 0,
            "CgroupParent": "",
            "BlkioWeight": 0,
            "BlkioWeightDevice": null,
            "BlkioDeviceReadBps": null,
            "BlkioDeviceWriteBps": null,
            "BlkioDeviceReadIOps": null,
            "BlkioDeviceWriteIOps": null,
            "CpuPeriod": 0,
            "CpuQuota": 0,
            "CpuRealtimePeriod": 0,
            "CpuRealtimeRuntime": 0,
            "CpusetCpus": "",
            "CpusetMems": "",
            "Devices": null,
            "DeviceCgroupRules": null,
            "DeviceRequests": null,
            "MemoryReservation": 0,
            "MemorySwap": 10737418240,
            "MemorySwappiness": null,
            "OomKillDisable": null,
            "PidsLimit": null,
            "Ulimits": null,
            "CpuCount": 0,
            "CpuPercent": 0,
            "IOMaximumIOps": 0,
            "IOMaximumBandwidth": 0,
            "MaskedPaths": [
                "/proc/asound",
                "/proc/acpi",
                "/proc/kcore",
                "/proc/keys",
                "/proc/latency_stats",
                "/proc/timer_list",
                "/proc/timer_stats",
                "/proc/sched_debug",
                "/proc/scsi",
                "/sys/firmware",
                "/sys/devices/virtual/powercap"
            ],
            "ReadonlyPaths": [
                "/proc/bus",
                "/proc/fs",
                "/proc/irq",
                "/proc/sys",
                "/proc/sysrq-trigger"
            ]
        },
        "GraphDriver": {
            "Data": {
                "LowerDir": "/var/lib/docker/overlay2/3e3d1de88d361aa83649a4efd659ce82293d3ccf3349d5bb59f900b66cc09065-init/diff:/var/lib/docker/overlay2/623ed17a5d94b7df296e9ca3c945346833abebaf142f8bb1df0b512fb3e72d08/diff:/var/lib/docker/overlay2/889c37bd0fd1aca674886f4a4ffa1c609a031101e243c5c18af1d1b75b3a5989/diff:/var/lib/docker/overlay2/916668dd30dd37d07bd38e7455fcb444d6e63dbed01aed75de0150ccfe769ff5/diff:/var/lib/docker/overlay2/ea83943f1a32bcbbc4186308cb4721ed686f8ae6a36bbade913512482147957f/diff",
                "MergedDir": "/var/lib/docker/overlay2/3e3d1de88d361aa83649a4efd659ce82293d3ccf3349d5bb59f900b66cc09065/merged",
                "UpperDir": "/var/lib/docker/overlay2/3e3d1de88d361aa83649a4efd659ce82293d3ccf3349d5bb59f900b66cc09065/diff",
                "WorkDir": "/var/lib/docker/overlay2/3e3d1de88d361aa83649a4efd659ce82293d3ccf3349d5bb59f900b66cc09065/work"
            },
            "Name": "overlay2"
        },
        "Mounts": [
            {
                "Type": "bind",
                "Source": "/home/user/param",
                "Destination": "/var/tmp/filecoin-proof-parameters",
                "Mode": "",
                "RW": true,
                "Propagation": "rprivate"
            }
        ],
        "Config": {
            "Hostname": "649d748ca4ca",
            "Domainname": "",
            "User": "",
            "AttachStdin": false,
            "AttachStdout": true,
            "AttachStderr": true,
            "Tty": true,
            "OpenStdin": false,
            "StdinOnce": false,
            "Env": [
                "RECEIVE_PROOF_URL=http://192.168.128.69:40037/api/v1/computing/cp/docker/receive/ubi",
                "TASKID=114139",
                "TASK_TYPE=1",
                "ZK_TYPE=fil-c2-512M",
                "NAME_SPACE=docker-ubi-task",
                "PARAM_URL=https://286cb2c989.acl.swanipfs.com/ipfs/QmYaeDVMb9KsyPNtSPsGPzbGhSkkHuUUUex9JYoYBnUWcN",
                "RUST_GPU_TOOLS_CUSTOM_GPU=GeForce RTX 4080:9728",
                "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
                "TRUST_PARAMS=1",
                "RUST_LOG=Info",
                "FILECOIN_PARAMETER_CACHE=/var/tmp/filecoin-proof-parameters",
                "DEBIAN_FRONTEND=noninteractive"
            ],
            "Cmd": [
                "ubi-bench",
                "c2"
            ],
            "Image": "filswan/ubi-worker-gpu-amd:v2.0",
            "Volumes": {
                "/var/tmp/filecoin-proof-parameters": {}
            },
            "WorkingDir": "",
            "Entrypoint": null,
            "OnBuild": null,
            "Labels": {
                "org.opencontainers.image.ref.name": "ubuntu",
                "org.opencontainers.image.version": "20.04"
            }
        },
        "NetworkSettings": {
            "Bridge": "",
            "SandboxID": "1c42a117a6d2698478ce339525e5b033e39da6610eca9e09b975a624844868cf",
            "SandboxKey": "/var/run/docker/netns/1c42a117a6d2",
            "Ports": {},
            "HairpinMode": false,
            "LinkLocalIPv6Address": "",
            "LinkLocalIPv6PrefixLen": 0,
            "SecondaryIPAddresses": null,
            "SecondaryIPv6Addresses": null,
            "EndpointID": "",
            "Gateway": "",
            "GlobalIPv6Address": "",
            "GlobalIPv6PrefixLen": 0,
            "IPAddress": "",
            "IPPrefixLen": 0,
            "IPv6Gateway": "",
            "MacAddress": "",
            "Networks": {
                "bridge": {
                    "IPAMConfig": null,
                    "Links": null,
                    "Aliases": null,
                    "MacAddress": "",
                    "NetworkID": "d015f93e6844dc1c3e34ca0ed3b4bc817b626307806c6bf57cf7da44e775f8e4",
                    "EndpointID": "",
                    "Gateway": "",
                    "IPAddress": "",
                    "IPPrefixLen": 0,
                    "IPv6Gateway": "",
                    "GlobalIPv6Address": "",
                    "GlobalIPv6PrefixLen": 0,
                    "DriverOpts": null,
                    "DNSNames": null
                }
            }
        }
    }
]
"cluster_info": [
    {
      "machine_id": "bc8b4851-91fc-489f-b345-6f0ca2a2388f",
      "cpu_name": "AMD",
      "cpu": {
        "total": "24",
        "used": "0",
        "free": "24"
      },
      "vcpu": {
        "total": "24",
        "used": "0",
        "free": "24"
      },
      "memory": {
        "total": "24 GiB",
        "used": "0 GiB",
        "free": "23 GiB"
      },
      "gpu": {
        "driver_version": "535.171.04",
        "cuda_version": "12020",
        "attached_gpus": 1,
        "details": [
          {
            "product_name": "NVIDIA 4080",
            "status": "available",
            "fb_memory_usage": {
              "total": "16376 MiB",
              "used": "297 MiB",
              "free": "16078 MiB"
            },
            "bar1_memory_usage": {
              "total": "256 MiB",
              "used": "2 MiB",
              "free": "253 MiB"
            }
          }
        ]
      },
      "storage": {
        "total": "389 GiB",
        "used": "161 GiB",
        "free": "228 GiB"
      }
    }
  ],
Normalnoise commented 2 months ago

I suggest you use the releases branch to rebuild the CP. We have fixed some GPU bugs.

ThomasBlock commented 2 months ago

I suggest you use the releases branch to rebuild the CP. We have fixed some GPU bugs.

Ah yes, thank you, it worked with computing-provider version 0.4.6+git.d95eae1.

`make install` somehow did not overwrite the old file, so I had to copy it manually.

image