canonical / lxd

Powerful system container and virtual machine manager
https://canonical.com/lxd
GNU Affero General Public License v3.0
4.38k stars 931 forks source link

Java8 fails to get Runtime.getRuntime().availableProcessors() #10223

Closed ui-paul-kasemir closed 2 years ago

ui-paul-kasemir commented 2 years ago

Required information

config:
  core.trust_password: true
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- macaroon_authentication
- network_sriov
- console
- restrict_devlxd
- migration_pre_copy
- infiniband
- maas_network
- devlxd_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- devlxd_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- candid_authentication
- backup_compression
- candid_config
- nvidia_runtime_config
- storage_api_volume_snapshots
- storage_unmapped
- projects
- candid_config_key
- network_vxlan_ttl
- container_incremental_copy
- usb_optional_vendorid
- snapshot_scheduling
- snapshot_schedule_aliases
- container_copy_project
- clustering_server_address
- clustering_image_replication
- container_protection_shift
- snapshot_expiry
- container_backup_override_pool
- snapshot_expiry_creation
- network_leases_location
- resources_cpu_socket
- resources_gpu
- resources_numa
- kernel_features
- id_map_current
- event_location
- storage_api_remote_volume_snapshots
- network_nat_address
- container_nic_routes
- rbac
- cluster_internal_copy
- seccomp_notify
- lxc_features
- container_nic_ipvlan
- network_vlan_sriov
- storage_cephfs
- container_nic_ipfilter
- resources_v2
- container_exec_user_group_cwd
- container_syscall_intercept
- container_disk_shift
- storage_shifted
- resources_infiniband
- daemon_storage
- instances
- image_types
- resources_disk_sata
- clustering_roles
- images_expiry
- resources_network_firmware
- backup_compression_algorithm
- ceph_data_pool_name
- container_syscall_intercept_mount
- compression_squashfs
- container_raw_mount
- container_nic_routed
- container_syscall_intercept_mount_fuse
- container_disk_ceph
- virtual-machines
- image_profiles
- clustering_architecture
- resources_disk_id
- storage_lvm_stripes
- vm_boot_priority
- unix_hotplug_devices
- api_filtering
- instance_nic_network
- clustering_sizing
- firewall_driver
- projects_limits
- container_syscall_intercept_hugetlbfs
- limits_hugepages
- container_nic_routed_gateway
- projects_restrictions
- custom_volume_snapshot_expiry
- volume_snapshot_scheduling
- trust_ca_certificates
- snapshot_disk_usage
- clustering_edit_roles
- container_nic_routed_host_address
- container_nic_ipvlan_gateway
- resources_usb_pci
- resources_cpu_threads_numa
- resources_cpu_core_die
- api_os
- container_nic_routed_host_table
- container_nic_ipvlan_host_table
- container_nic_ipvlan_mode
- resources_system
- images_push_relay
- network_dns_search
- container_nic_routed_limits
- instance_nic_bridged_vlan
- network_state_bond_bridge
- usedby_consistency
- custom_block_volumes
- clustering_failure_domains
- resources_gpu_mdev
- console_vga_type
- projects_limits_disk
- network_type_macvlan
- network_type_sriov
- container_syscall_intercept_bpf_devices
- network_type_ovn
- projects_networks
- projects_networks_restricted_uplinks
- custom_volume_backup
- backup_override_name
- storage_rsync_compression
- network_type_physical
- network_ovn_external_subnets
- network_ovn_nat
- network_ovn_external_routes_remove
- tpm_device_type
- storage_zfs_clone_copy_rebase
- gpu_mdev
- resources_pci_iommu
- resources_network_usb
- resources_disk_address
- network_physical_ovn_ingress_mode
- network_ovn_dhcp
- network_physical_routes_anycast
- projects_limits_instances
- network_state_vlan
- instance_nic_bridged_port_isolation
- instance_bulk_state_change
- network_gvrp
- instance_pool_move
- gpu_sriov
- pci_device_type
- storage_volume_state
- network_acl
- migration_stateful
- disk_state_quota
- storage_ceph_features
- projects_compression
- projects_images_remote_cache_expiry
- certificate_project
- network_ovn_acl
- projects_images_auto_update
- projects_restricted_cluster_target
- images_default_architecture
- network_ovn_acl_defaults
- gpu_mig
- project_usage
- network_bridge_acl
- warnings
- projects_restricted_backups_and_snapshots
- clustering_join_token
- clustering_description
- server_trusted_proxy
- clustering_update_cert
- storage_api_project
- server_instance_driver_operational
- server_supported_storage_drivers
- event_lifecycle_requestor_address
- resources_gpu_usb
- clustering_evacuation
- network_ovn_nat_address
- network_bgp
- network_forward
- custom_volume_refresh
- network_counters_errors_dropped
- metrics
- image_source_project
- clustering_config
- network_peer
- linux_sysctl
- network_dns
- ovn_nic_acceleration
- certificate_self_renewal
- instance_project_move
- storage_volume_project_move
- cloud_init
- network_dns_nat
- database_leader
- instance_all_projects
- clustering_groups
- ceph_rbd_du
- instance_get_full
- qemu_metrics
- gpu_mig_uuid
- event_project
- clustering_evacuation_live
- instance_allow_inconsistent_copy
- network_state_ovn
- storage_volume_api_filtering
- image_restrictions
- storage_zfs_export
- network_dns_records
- storage_zfs_reserve_space
- network_acl_log
- storage_zfs_blocksize
- metrics_cpu_seconds
- instance_snapshot_never
- certificate_token
- instance_nic_routed_neighbor_probe
- event_hub
- agent_nic_config
- projects_restricted_intercept
- metrics_authentication
- images_target_project
- cluster_migration_inconsistent_copy
- cluster_ovn_chassis
- container_syscall_intercept_sched_setscheduler
- storage_lvm_thinpool_metadata_size
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
environment:
  addresses: []
  architectures:
  - x86_64
  - i686
  certificate: |
    -----BEGIN CERTIFICATE-----
    MIICHDCCAaKgAwIBAgIRAKSZwLC+18iPCgYiOs6GnzAwCgYIKoZIzj0EAwMwPDEc
    MBoGA1UEChMTbGludXhjb250YWluZXJzLm9yZzEcMBoGA1UEAwwTcm9vdEBwYXVs
    LXhwczE3OTcxMDAeFw0yMjAzMDMyMTI0NTdaFw0zMjAyMjkyMTI0NTdaMDwxHDAa
    BgNVBAoTE2xpbnV4Y29udGFpbmVycy5vcmcxHDAaBgNVBAMME3Jvb3RAcGF1bC14
    cHMxNzk3MTAwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAAQqB+qC7SJQ4d9haTkukXRe
    vWvzWFbqcL0ggg6sThAxVZkLmVd8wqKPTeTkurpX2AZ7/DUisoPExEjyJTX4UBBX
    /sut/IQcKNW92MkXhRMUyvbyefVMzR7HF4+kR5ubYoqjaDBmMA4GA1UdDwEB/wQE
    AwIFoDATBgNVHSUEDDAKBggrBgEFBQcDATAMBgNVHRMBAf8EAjAAMDEGA1UdEQQq
    MCiCDnBhdWwteHBzMTc5NzEwhwR/AAABhxAAAAAAAAAAAAAAAAAAAAABMAoGCCqG
    SM49BAMDA2gAMGUCMFpuaXzRNioy+ILpdHgPF3R3uVebsczZoxrRVWs92iDILl1s
    hCnhmcwtbvoY3i3h1wIxALk4RhiNhJNqvVaCVEDeDThKqHirlor/0bwuk+9k0BIS
    YUkdGvO5+jzFh0tZ/F/pBg==
    -----END CERTIFICATE-----
  certificate_fingerprint: 498ad8410b0b188b38522061659fbf8de8c908d67f2b173a335d8883c396b821
  driver: lxc | qemu
  driver_version: 4.0.12 | 6.2.0
  firewall: nftables
  kernel: Linux
  kernel_architecture: x86_64
  kernel_features:
    idmapped_mounts: "true"
    netnsid_getifaddrs: "true"
    seccomp_listener: "true"
    seccomp_listener_continue: "true"
    shiftfs: "false"
    uevent_injection: "true"
    unpriv_fscaps: "true"
  kernel_version: 5.17.1-zen1-1-zen
  lxc_features:
    cgroup2: "true"
    core_scheduling: "true"
    devpts_fd: "true"
    idmapped_mounts_v2: "true"
    mount_injection_file: "true"
    network_gateway_device_route: "true"
    network_ipvlan: "true"
    network_l2proxy: "true"
    network_phys_macvlan_mtu: "true"
    network_veth_router: "true"
    pidfd: "true"
    seccomp_allow_deny_syntax: "true"
    seccomp_notify: "true"
    seccomp_proxy_send_notify_fd: "true"
  os_name: Garuda Linux
  os_version: ""
  project: default
  server: lxd
  server_clustered: false
  server_event_mode: full-mesh
  server_name: paul-xps179710
  server_pid: 159518
  server_version: 5.0.0
  storage: btrfs
  storage_version: 5.16.2
  storage_supported_drivers:
  - name: btrfs
    version: 5.16.2
    remote: false
  - name: dir
    version: "1"
    remote: false
  - name: lvm
    version: 2.03.15(2) (2022-02-07) / 1.02.183 (2022-02-07) / 4.45.0
    remote: false

Issue description

The "infallible" java8 function call Runtime.getRuntime().availableProcessors() is returning 0. The docs page says:

never smaller than one

See https://www.tutorialspoint.com/java/lang/runtime_availableprocessors.htm

However, when using a slightly newer Java version (11), it works as expected:

root@jdemo:~# java -version
openjdk version "11.0.14.1" 2022-02-08
OpenJDK Runtime Environment (build 11.0.14.1+1-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.14.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
root@jdemo:~# java RuntimeDemo 
Hello world!
16

Steps to reproduce

  1. lxc launch ubuntu:bionic jdemo
  2. lxc shell jdemo
    • apt update
    • apt install openjdk-8-jdk-headless
    • write file RuntimeDemo.java
      
      public class RuntimeDemo {
          public static void main(String[] args) {
              // print a normal message
              System.out.println("Hello world!");

              // check the number of processors available
              System.out.println(""+Runtime.getRuntime().availableProcessors());
          }
      }

  3. javac RuntimeDemo.java
  4. java RuntimeDemo
    Hello world!
    0

Information to attach

stgraber commented 2 years ago
root@b1:~# java RuntimeDemo
Hello world!
16
root@b1:~# dpkg -l | grep jre
ii  openjdk-8-jre-headless:amd64 8u312-b07-0ubuntu1~18.04      amd64        OpenJDK Java runtime, using Hotspot JIT (headless)
root@b1:~# java -version
openjdk version "1.8.0_312"
OpenJDK Runtime Environment (build 1.8.0_312-8u312-b07-0ubuntu1~18.04-b07)
OpenJDK 64-Bit Server VM (build 25.312-b07, mixed mode)
root@b1:~# 
ui-paul-kasemir commented 2 years ago

@stgraber what is your host machine running? And what version of LXD?

stgraber commented 2 years ago

Ubuntu 22.04 and LXD 5.0

stgraber commented 2 years ago

Same thing on an Ubuntu 20.04 + LXD 5.0 setup

stgraber commented 2 years ago

Closing as this can't be a LXD bug anyway, LXD just sets some cpuset pinning in the kernel.

This kind of issue could come from a broken LXCFS setup. Looking at what Java is doing, it seems to expect /sys/devices/system/cpu to be readable and /sys/devices/system/cpu/online to show something that makes sense.

ui-paul-kasemir commented 2 years ago

I'm using Garuda Linux (which is Arch Linux-based).

How can you safely close this ticket @stgraber without testing the host I'm using? Besides, this is an issue with LXD: with a pre-5.0 LXD it was working okay.

ui-paul-kasemir commented 2 years ago

Unless you are assuming LXD will only be working/supported for Ubuntu

stgraber commented 2 years ago

@ui-paul-kasemir because LXD hasn't changed the way it sets up cgroups in years and the files that are being accessed by that bit of Java are not controlled by LXD at all.

You're most likely seeing a difference in LXCFS behavior, not LXD.

stgraber commented 2 years ago
[root@archlinux ~]# cat /etc/os-release 
NAME="Arch Linux"
PRETTY_NAME="Arch Linux"
ID=arch
BUILD_ID=rolling
ANSI_COLOR="38;2;23;147;209"
HOME_URL="https://archlinux.org/"
DOCUMENTATION_URL="https://wiki.archlinux.org/"
SUPPORT_URL="https://bbs.archlinux.org/"
BUG_REPORT_URL="https://bugs.archlinux.org/"
LOGO=archlinux-logo
[root@archlinux ~]# lxc exec b1 bash
root@b1:~# java RuntimeDemo
Hello world!
2
root@b1:~# dpkg -l | grep jre
ii  openjdk-8-jre-headless:amd64 8u312-b07-0ubuntu1~18.04      amd64        OpenJDK Java runtime, using Hotspot JIT (headless)
root@b1:~# exit
[root@archlinux ~]# lxc config set b1 limits.cpu=1
[root@archlinux ~]# lxc exec b1 bash
root@b1:~# java RuntimeDemo
Hello world!
1
root@b1:~# 
ui-paul-kasemir commented 2 years ago

@stgraber You are right. If I downgrade lxcfs it starts working again. I'll file a ticket there.

lxcfs-4.0.12-1: success :heavy_check_mark: Available processors: 16

lxcfs-5.0.0-1: fails :x: Available processors: 0

ui-paul-kasemir commented 2 years ago

@stgraber what version of lxcfs are you using?

stgraber commented 2 years ago

I'm using LXCFS 5.0.0 but with a bunch of cherry-picked fixes:

      git cherry-pick 42c155d1df92d8bbabdb1de3433048584bb05d47  # lxcfs_fuse: ensure lxcfs_fuse_compat.h is included after including fuse header
      git cherry-pick e297458cb0d2a5c7f91ddacc771d4cf9efe301f8  # tree-wide: remove struct stat argument from DIR_FILLER and make it static inline
      git cherry-pick d58528f752aaa532eb3871899a27d66996a2a798  # tree-wide: ensure that file information is set even with legacy fuse
      git cherry-pick 02beb5d4e1b82e3416988af2341142518c804bd3  # tests: add test for issue #522
      git cherry-pick 112f0b9ca720b8f0da4d4a39153c16f55e3294e1  # meson: Include documentation
      git cherry-pick 1ef15e0d1f345ebb851f1bab8835cace5279e392  # workflows: disable documentation generation
      git cherry-pick 9d6cded612483cfb31919f306d1975ce801d8e25  # sysfs: Don't incorrectly filter entries
      git cherry-pick dcc13128a03b8af4e975e65b32afc841074bedb8  # tests: Fix sysfs test
      git cherry-pick 5daf75dbed117c75198e104f852fd170cac766de  # utils: add and use opathdir()
      git cherry-pick 2e3a6f5d8240695bd13a7a6da75de895b9407492  # sysfs_fuse: fix indendation
      git cherry-pick 2e13f0be3ad903adf870e7a3032c3356c0c9e4eb  # sysfs_fuse: generate file info for cpu<nr> entries as well
      git cherry-pick 75395f89cf6de91695562a0a47e36acbba879c1e  # meson: bump fuse version
      git cherry-pick 35bf60d4eed08e703180c1f6ec505c7c91442915  # github: add fuse3 tests
      git cherry-pick 006db260b1202761a6661085f7af2fd3cd7083f3  # meson: handle fuse versions with buggy dt_type handling
      git cherry-pick b997ce418919137df7a4552b2b8e0d8207edbaa5  # init/meson: Use libdir instead of hardcoded /lib path
      git cherry-pick a08da8d7e63a023679603a93c18b6bfc443f9471  # Query systemd system unit dir.

So it looks like it's something that's been fixed upstream, and LXCFS 5.0.1 should then work for you once we release it.

ui-paul-kasemir commented 2 years ago

Okay, I do see that it is fixed already. I bisected the fix to: https://github.com/lxc/lxcfs/commit/2e13f0be3ad903adf870e7a3032c3356c0c9e4eb