lxc / lxc

LXC - Linux Containers
https://linuxcontainers.org/lxc

liblxc with io_uring keeps a core in CPU wait state #4364

Open elburb opened 10 months ago

elburb commented 10 months ago

Required information

root@in-slug:~# incus info
config: {}
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- network_sriov
- console
- restrict_dev_incus
- migration_pre_copy
- infiniband
- dev_incus_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- dev_incus_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- backup_compression
- nvidia_runtime_config
- storage_api_volume_snapshots
- storage_unmapped
- projects
- network_vxlan_ttl
- container_incremental_copy
- usb_optional_vendorid
- snapshot_scheduling
- snapshot_schedule_aliases
- container_copy_project
- clustering_server_address
- clustering_image_replication
- container_protection_shift
- snapshot_expiry
- container_backup_override_pool
- snapshot_expiry_creation
- network_leases_location
- resources_cpu_socket
- resources_gpu
- resources_numa
- kernel_features
- id_map_current
- event_location
- storage_api_remote_volume_snapshots
- network_nat_address
- container_nic_routes
- cluster_internal_copy
- seccomp_notify
- lxc_features
- container_nic_ipvlan
- network_vlan_sriov
- storage_cephfs
- container_nic_ipfilter
- resources_v2
- container_exec_user_group_cwd
- container_syscall_intercept
- container_disk_shift
- storage_shifted
- resources_infiniband
- daemon_storage
- instances
- image_types
- resources_disk_sata
- clustering_roles
- images_expiry
- resources_network_firmware
- backup_compression_algorithm
- ceph_data_pool_name
- container_syscall_intercept_mount
- compression_squashfs
- container_raw_mount
- container_nic_routed
- container_syscall_intercept_mount_fuse
- container_disk_ceph
- virtual-machines
- image_profiles
- clustering_architecture
- resources_disk_id
- storage_lvm_stripes
- vm_boot_priority
- unix_hotplug_devices
- api_filtering
- instance_nic_network
- clustering_sizing
- firewall_driver
- projects_limits
- container_syscall_intercept_hugetlbfs
- limits_hugepages
- container_nic_routed_gateway
- projects_restrictions
- custom_volume_snapshot_expiry
- volume_snapshot_scheduling
- trust_ca_certificates
- snapshot_disk_usage
- clustering_edit_roles
- container_nic_routed_host_address
- container_nic_ipvlan_gateway
- resources_usb_pci
- resources_cpu_threads_numa
- resources_cpu_core_die
- api_os
- container_nic_routed_host_table
- container_nic_ipvlan_host_table
- container_nic_ipvlan_mode
- resources_system
- images_push_relay
- network_dns_search
- container_nic_routed_limits
- instance_nic_bridged_vlan
- network_state_bond_bridge
- usedby_consistency
- custom_block_volumes
- clustering_failure_domains
- resources_gpu_mdev
- console_vga_type
- projects_limits_disk
- network_type_macvlan
- network_type_sriov
- container_syscall_intercept_bpf_devices
- network_type_ovn
- projects_networks
- projects_networks_restricted_uplinks
- custom_volume_backup
- backup_override_name
- storage_rsync_compression
- network_type_physical
- network_ovn_external_subnets
- network_ovn_nat
- network_ovn_external_routes_remove
- tpm_device_type
- storage_zfs_clone_copy_rebase
- gpu_mdev
- resources_pci_iommu
- resources_network_usb
- resources_disk_address
- network_physical_ovn_ingress_mode
- network_ovn_dhcp
- network_physical_routes_anycast
- projects_limits_instances
- network_state_vlan
- instance_nic_bridged_port_isolation
- instance_bulk_state_change
- network_gvrp
- instance_pool_move
- gpu_sriov
- pci_device_type
- storage_volume_state
- network_acl
- migration_stateful
- disk_state_quota
- storage_ceph_features
- projects_compression
- projects_images_remote_cache_expiry
- certificate_project
- network_ovn_acl
- projects_images_auto_update
- projects_restricted_cluster_target
- images_default_architecture
- network_ovn_acl_defaults
- gpu_mig
- project_usage
- network_bridge_acl
- warnings
- projects_restricted_backups_and_snapshots
- clustering_join_token
- clustering_description
- server_trusted_proxy
- clustering_update_cert
- storage_api_project
- server_instance_driver_operational
- server_supported_storage_drivers
- event_lifecycle_requestor_address
- resources_gpu_usb
- clustering_evacuation
- network_ovn_nat_address
- network_bgp
- network_forward
- custom_volume_refresh
- network_counters_errors_dropped
- metrics
- image_source_project
- clustering_config
- network_peer
- linux_sysctl
- network_dns
- ovn_nic_acceleration
- certificate_self_renewal
- instance_project_move
- storage_volume_project_move
- cloud_init
- network_dns_nat
- database_leader
- instance_all_projects
- clustering_groups
- ceph_rbd_du
- instance_get_full
- qemu_metrics
- gpu_mig_uuid
- event_project
- clustering_evacuation_live
- instance_allow_inconsistent_copy
- network_state_ovn
- storage_volume_api_filtering
- image_restrictions
- storage_zfs_export
- network_dns_records
- storage_zfs_reserve_space
- network_acl_log
- storage_zfs_blocksize
- metrics_cpu_seconds
- instance_snapshot_never
- certificate_token
- instance_nic_routed_neighbor_probe
- event_hub
- agent_nic_config
- projects_restricted_intercept
- metrics_authentication
- images_target_project
- cluster_migration_inconsistent_copy
- cluster_ovn_chassis
- container_syscall_intercept_sched_setscheduler
- storage_lvm_thinpool_metadata_size
- storage_volume_state_total
- instance_file_head
- instances_nic_host_name
- image_copy_profile
- container_syscall_intercept_sysinfo
- clustering_evacuation_mode
- resources_pci_vpd
- qemu_raw_conf
- storage_cephfs_fscache
- network_load_balancer
- vsock_api
- instance_ready_state
- network_bgp_holdtime
- storage_volumes_all_projects
- metrics_memory_oom_total
- storage_buckets
- storage_buckets_create_credentials
- metrics_cpu_effective_total
- projects_networks_restricted_access
- storage_buckets_local
- loki
- acme
- internal_metrics
- cluster_join_token_expiry
- remote_token_expiry
- init_preseed
- storage_volumes_created_at
- cpu_hotplug
- projects_networks_zones
- network_txqueuelen
- cluster_member_state
- instances_placement_scriptlet
- storage_pool_source_wipe
- zfs_block_mode
- instance_generation_id
- disk_io_cache
- amd_sev
- storage_pool_loop_resize
- migration_vm_live
- ovn_nic_nesting
- oidc
- network_ovn_l3only
- ovn_nic_acceleration_vdpa
- cluster_healing
- instances_state_total
- auth_user
- security_csm
- instances_rebuild
- numa_cpu_placement
- custom_volume_iso
- network_allocations
- zfs_delegate
- storage_api_remote_volume_snapshot_copy
- operations_get_query_all_projects
- metadata_configuration
- syslog_socket
- event_lifecycle_name_and_project
- instances_nic_limits_priority
- disk_initial_volume_configuration
- operation_wait
- image_restriction_privileged
- cluster_internal_custom_volume_copy
- disk_io_bus
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
auth_user_name: root
auth_user_method: unix
environment:
  addresses: []
  architectures:
  - x86_64
  - i686
  certificate: |
    -----BEGIN CERTIFICATE-----
    MIICATCCAYegAwIBAgIRAO8I7hQbb6XMlgl3DQyNf4wwCgYIKoZIzj0EAwMwMjEZ
    MBcGA1UEChMQTGludXggQ29udGFpbmVyczEVMBMGA1UEAwwMcm9vdEBpbi1zbHVn
    MB4XDTIzMTEwNDA5MDgzM1oXDTMzMTEwMTA5MDgzM1owMjEZMBcGA1UEChMQTGlu
    dXggQ29udGFpbmVyczEVMBMGA1UEAwwMcm9vdEBpbi1zbHVnMHYwEAYHKoZIzj0C
    AQYFK4EEACIDYgAE228mdo0GAEULOY+B3basecH7bFBXSTCFUWTJvnbjkjZWn4U1
    3nJTapjsfbwfVakyJcb4ybLbGgTsNxC8r5ShVDpDU5oDzxfsJMpYryQT2jp/eQpg
    AOvO4g0CNngzW0I5o2EwXzAOBgNVHQ8BAf8EBAMCBaAwEwYDVR0lBAwwCgYIKwYB
    BQUHAwEwDAYDVR0TAQH/BAIwADAqBgNVHREEIzAhggdpbi1zbHVnhwR/AAABhxAA
    AAAAAAAAAAAAAAAAAAABMAoGCCqGSM49BAMDA2gAMGUCMEKkRgMjduyrC6KnZeng
    CKl8WTdHCADKDOaITC6pGLVYVTz6RZC5iyDcCsfcVGVwUAIxAIS/LkzgUK5jo3MK
    msBJ7XUcVHcEUqsa8CRMsEeYHL8iI9KPJEgoiUwGibzTsMT93w==
    -----END CERTIFICATE-----
  certificate_fingerprint: df7c40166f9db4aaa2b6f9d2c4ba755c5a9778de54dc12ac1476b70edb78dcd9
  driver: lxc | qemu
  driver_version: 5.0.3 | 8.1.2
  firewall: nftables
  kernel: Linux
  kernel_architecture: x86_64
  kernel_features:
    idmapped_mounts: "true"
    netnsid_getifaddrs: "true"
    seccomp_listener: "true"
    seccomp_listener_continue: "true"
    uevent_injection: "true"
    unpriv_fscaps: "true"
  kernel_version: 5.15.0-88-generic
  lxc_features:
    cgroup2: "true"
    core_scheduling: "true"
    devpts_fd: "true"
    idmapped_mounts_v2: "true"
    mount_injection_file: "true"
    network_gateway_device_route: "true"
    network_ipvlan: "true"
    network_l2proxy: "true"
    network_phys_macvlan_mtu: "true"
    network_veth_router: "true"
    pidfd: "true"
    seccomp_allow_deny_syntax: "true"
    seccomp_notify: "true"
    seccomp_proxy_send_notify_fd: "true"
  os_name: Ubuntu
  os_version: "22.04"
  project: default
  server: incus
  server_clustered: false
  server_event_mode: full-mesh
  server_name: in-slug
  server_pid: 460
  server_version: "0.2"
  storage: dir
  storage_version: "1"
  storage_supported_drivers:
  - name: dir
    version: "1"
    remote: false

Issue description

Starting a container using incus 0.2 via the zabbly stable .deb packages creates high CPU wait for each container started. Unfortunately, I was unable to track down exactly which process was causing the high wait.

I'm able to reproduce this in VM and bare-metal environments, with both Ubuntu jammy and Debian bookworm as the host OS, and with arbitrary guests. The high CPU wait does not occur when launching a VM.
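
One generic way to try to narrow down which task is responsible is to list processes in uninterruptible sleep (the D state that top accounts as %wa). This is just a sketch using standard ps/awk, nothing incus- or liblxc-specific, and it may come up empty if the waiter is a kernel thread that only flips into D briefly:

root@in-slug:~# ps -eo pid,stat,comm | awk '$2 ~ /^D/'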

Steps to reproduce

Create a fresh VM for testing and install the curl dependency:

$ lxc launch --vm images:ubuntu/jammy/amd64 -c limits.cpu=2 -c limits.memory=8GB
Creating the instance
Instance name is: in-slug                     
Starting in-slug

$ lxc shell in-slug
root@in-slug:~# apt update
...
root@in-slug:~# apt install curl
...

Install incus via the zabbly packages

root@in-slug:~# mkdir -p /etc/apt/keyrings/
root@in-slug:~# curl -fsSL https://pkgs.zabbly.com/key.asc -o /etc/apt/keyrings/zabbly.asc
root@in-slug:~# sh -c 'cat <<EOF > /etc/apt/sources.list.d/zabbly-incus-stable.sources
Enabled: yes
Types: deb
URIs: https://pkgs.zabbly.com/incus/stable
Suites: $(. /etc/os-release && echo ${VERSION_CODENAME})
Components: main
Architectures: $(dpkg --print-architecture)
Signed-By: /etc/apt/keyrings/zabbly.asc

EOF'
root@in-slug:~# apt update
...
root@in-slug:~# apt install incus
...

root@in-slug:~# incus admin init
If this is your first time running Incus on this machine, you should also run: incus admin init

Would you like to use clustering? (yes/no) [default=no]: 
Do you want to configure a new storage pool? (yes/no) [default=yes]: 
Name of the new storage pool [default=default]: 
Would you like to create a new local network bridge? (yes/no) [default=yes]: no
Would you like to use an existing bridge or host interface? (yes/no) [default=no]: 
Would you like the server to be available over the network? (yes/no) [default=no]: 
Would you like stale cached images to be updated automatically? (yes/no) [default=yes]: 
Would you like a YAML "init" preseed to be printed? (yes/no) [default=no]: 

Without any containers running, CPU wait is 0.0

root@in-slug:~# top

top - 09:09:09 up 4 min,  0 users,  load average: 0.25, 0.15, 0.05
Tasks: 129 total,   1 running, 128 sleeping,   0 stopped,   0 zombie
%Cpu(s):  0.0 us,  3.1 sy,  0.0 ni, 96.9 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
MiB Mem :   7373.6 total,   6384.9 free,    193.4 used,    795.3 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.   6922.9 avail Mem 

Start a new container

root@in-slug:~# incus launch images:ubuntu/jammy/amd64
Creating the instance
Instance name is: polite-chigger              

The instance you are starting doesn't have any network attached to it.
  To create a new network, use: incus network create
  To attach a network to an instance, use: incus network attach

Starting polite-chigger

CPU wait is now 50% (one of the VM's two vCPUs is fully in iowait)

root@in-slug:~# top

top - 09:10:46 up 5 min,  0 users,  load average: 0.62, 0.26, 0.10
Tasks: 135 total,   1 running, 134 sleeping,   0 stopped,   0 zombie
%Cpu(s):  0.0 us,  0.2 sy,  0.0 ni, 49.6 id, 50.3 wa,  0.0 hi,  0.0 si,  0.0 st
MiB Mem :   7373.6 total,   5634.3 free,    241.3 used,   1498.0 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.   6848.4 avail Mem 
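
For a per-core view, mpstat from the sysstat package (not installed in the steps above, so this is an optional extra) breaks %iowait out per CPU and makes it easy to confirm that exactly one core is pinned in wait:

root@in-slug:~# apt install sysstat
root@in-slug:~# mpstat -P ALL 1 1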

Start another container and CPU wait goes to 100%

root@in-slug:~# incus launch images:debian/bookworm/amd64
Creating the instance
Instance name is: wired-gobbler               

The instance you are starting doesn't have any network attached to it.
  To create a new network, use: incus network create
  To attach a network to an instance, use: incus network attach

Starting wired-gobbler
root@in-slug:~# top

top - 09:12:34 up 7 min,  0 users,  load average: 0.88, 0.41, 0.16
Tasks: 167 total,   1 running, 166 sleeping,   0 stopped,   0 zombie
%Cpu(s):  0.2 us,  0.3 sy,  0.0 ni,  0.0 id, 99.5 wa,  0.0 hi,  0.0 si,  0.0 st
MiB Mem :   7373.6 total,   3810.1 free,    352.6 used,   3211.0 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.   6702.9 avail Mem 

Stopping the containers returns CPU wait to 0

root@in-slug:~# incus stop --all
root@in-slug:~# top

top - 09:14:53 up 9 min,  0 users,  load average: 0.24, 0.29, 0.15
Tasks: 128 total,   1 running, 127 sleeping,   0 stopped,   0 zombie
%Cpu(s):  3.0 us,  3.0 sy,  0.0 ni, 93.9 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
MiB Mem :   7373.6 total,   3934.8 free,    238.0 used,   3200.8 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.   6817.9 avail Mem 

Information to attach

stgraber commented 10 months ago

Reproduced this behavior here. So far I've confirmed that it doesn't appear to be caused by any one specific process in the container, and it's not caused by incusd itself either.

I suspect it may be an issue with liblxc and io_uring; I'm doing a build of the incus daily package without io_uring now to test that.
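
One quick way to check whether io_uring is in play is to look for the kernel worker threads it creates; on recent kernels these show up as iou-wrk-* (and iou-sqp-* when SQPOLL is used). A rough check, not specific to liblxc:

root@in-slug:~# ps -eLo pid,lwp,stat,comm | grep -E 'iou-(wrk|sqp)'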

stgraber commented 10 months ago

Confirmed that it was the issue. I've disabled io_uring in liblxc in our builds for now, and I'll re-assign this bug to LXC.
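
For anyone rebuilding liblxc themselves, io_uring support is a build-time switch. A minimal sketch, assuming an LXC 5.x meson tree where the option is named io-uring-event-loop (check meson_options.txt for your release):

$ git clone https://github.com/lxc/lxc && cd lxc
$ meson setup build -Dio-uring-event-loop=false
$ meson compile -C build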

stgraber commented 10 months ago

@brauner @mihalicyn

elburb commented 10 months ago

I can confirm that I no longer see the issue with the latest daily build (0.2-202311050411).

Thanks for all the hard work on incus, it's very much appreciated. For info, I currently have a two-node test cluster with incus running on physical servers with bookworm and ZFS. The cluster hosts a bunch of normal and nested containers, VMs, and a mix of different mounts and networking. Other than this little wrinkle, everything has so far worked fine and it's been a really smooth experience.

mihalicyn commented 10 months ago

I guess that Stéphane tested this with a recent kernel, so it should not be one of the well-known io_uring bugs that have already been fixed.

I'll take a look at that. @stgraber, please assign this to me ;-)

mihalicyn commented 10 months ago

@sesa-me thanks a lot for such a well-detailed report!