canonical / lxd

Powerful system container and virtual machine manager
https://canonical.com/lxd
GNU Affero General Public License v3.0

Can't destroy container because LXD/LXC can't remove a file #6998

Closed (StyXman closed this issue 4 years ago)

StyXman commented 4 years ago

Required information

[root@pb04 cloudian-ansible-baremetal]# lxc info
config:
  core.https_address: '[::]:5555'
  core.trust_password: true
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- macaroon_authentication
- network_sriov
- console
- restrict_devlxd
- migration_pre_copy
- infiniband
- maas_network
- devlxd_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- devlxd_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- candid_authentication
- backup_compression
- candid_config
- nvidia_runtime_config
- storage_api_volume_snapshots
- storage_unmapped
- projects
- candid_config_key
- network_vxlan_ttl
- container_incremental_copy
- usb_optional_vendorid
- snapshot_scheduling
- container_copy_project
- clustering_server_address
- clustering_image_replication
- container_protection_shift
- snapshot_expiry
- container_backup_override_pool
- snapshot_expiry_creation
- network_leases_location
- resources_cpu_socket
- resources_gpu
- resources_numa
- kernel_features
- id_map_current
- event_location
- storage_api_remote_volume_snapshots
- network_nat_address
- container_nic_routes
- rbac
- cluster_internal_copy
- seccomp_notify
- lxc_features
- container_nic_ipvlan
- network_vlan_sriov
- storage_cephfs
- container_nic_ipfilter
- resources_v2
- container_exec_user_group_cwd
- container_syscall_intercept
- container_disk_shift
- storage_shifted
- resources_infiniband
- daemon_storage
- instances
- image_types
- resources_disk_sata
- clustering_roles
- images_expiry
- resources_network_firmware
- backup_compression_algorithm
- ceph_data_pool_name
- container_syscall_intercept_mount
- compression_squashfs
- container_raw_mount
- container_nic_routed
- container_syscall_intercept_mount_fuse
- container_disk_ceph
- virtual-machines
- image_profiles
- clustering_architecture
- resources_disk_id
- storage_lvm_stripes
- vm_boot_priority
- unix_hotplug_devices
- api_filtering
- instance_nic_network
- clustering_sizing
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
environment:
  addresses:
  - 10.50.40.104:5555
  - 10.146.2.1:5555
  - 10.37.0.1:5555
  architectures:
  - x86_64
  - i686
  certificate: |
    -----BEGIN CERTIFICATE-----
[...]
    -----END CERTIFICATE-----
  certificate_fingerprint: 8513d033f753717fe5de34a753537c3840546c94d507cbff2595f201ee6c7b52
  driver: lxc
  driver_version: 3.2.1
  kernel: Linux
  kernel_architecture: x86_64
  kernel_features:
    netnsid_getifaddrs: "false"
    seccomp_listener: "false"
    seccomp_listener_continue: "false"
    shiftfs: "false"
    uevent_injection: "false"
    unpriv_fscaps: "true"
  kernel_version: 3.10.0-1062.4.3.el7.x86_64
  lxc_features:
    cgroup2: "false"
    mount_injection_file: "true"
    network_gateway_device_route: "true"
    network_ipvlan: "true"
    network_l2proxy: "true"
    network_phys_macvlan_mtu: "true"
    network_veth_router: "true"
    seccomp_notify: "true"
  project: default
  server: lxd
  server_clustered: false
  server_name: pb04
  server_pid: 4000
  server_version: "3.21"
  storage: dir
  storage_version: "1"

Issue description

[root@pb13 ~]# lxc list
+-----------------------------------+---------+------+------+-----------+-----------+
|               NAME                |  STATE  | IPV4 | IPV6 |   TYPE    | SNAPSHOTS |
+-----------------------------------+---------+------+------+-----------+-----------+
| huge-node04-dc1-demo3-cloudian-eu | STOPPED |      |      | CONTAINER | 0         |
+-----------------------------------+---------+------+------+-----------+-----------+

This container contains (!!!) a file that has the append-only attribute set. That attribute has the side effect of not letting even root remove the file, so lxc delete usually fails if you don't first unset it with chattr -a. I have Ansible playbooks to handle this, and they usually work... until they don't.
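As a minimal illustration of that behaviour, assuming a scratch file on an ext4 or XFS filesystem (file attributes are not supported on tmpfs) and a root shell; the path is just an example:

touch /root/hsh.log
chattr +a /root/hsh.log      # set the append-only attribute
rm -f /root/hsh.log          # fails with "Operation not permitted", even for root
lsattr /root/hsh.log         # the 'a' flag shows up in the attribute list
chattr -a /root/hsh.log      # clear the attribute
rm -f /root/hsh.log          # now the file can be removed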

[root@pb13 ~]# lxc delete huge-node04-dc1-demo3-cloudian-eu
Error: Failed to remove '/var/snap/lxd/common/lxd/storage-pools/cloudian/containers/huge-node04-dc1-demo3-cloudian-eu': unlinkat /var/snap/lxd/common/lxd/storage-pools/cloudian/containers/huge-node04-dc1-demo3-cloudian-eu/rootfs/var/log/hsh/hsh.log: operation not permitted
[root@pb13 ~]# chattr -a /var/snap/lxd/common/lxd/storage-pools/cloudian/containers/huge-node04-dc1-demo3-cloudian-eu/rootfs/var/log/hsh/hsh.log
chattr: No such file or directory while trying to stat /var/snap/lxd/common/lxd/storage-pools/cloudian/containers/huge-node04-dc1-demo3-cloudian-eu/rootfs/var/log/hsh/hsh.log

[root@pb13 ~]# namei -lx /var/snap/lxd/common/lxd/storage-pools/cloudian/containers/huge-node04-dc1-demo3-cloudian-eu/rootfs/var/log/hsh/hsh.log
f: /var/snap/lxd/common/lxd/storage-pools/cloudian/containers/huge-node04-dc1-demo3-cloudian-eu/rootfs/var/log/hsh/hsh.log
Dr-xr-xr-x root root /
drwxr-xr-x root root var
drwxr-xr-x root root snap
drwxr-xr-x root root lxd
drwxr-xr-x root root common
drwx--x--x root root lxd
drwx--x--x root root storage-pools
drwx--x--x root root cloudian
containers - No such file or directory

Steps to reproduce

I'm really not sure how I got here. I hit the same problem two weeks ago and I don't remember how I fixed it, sorry.

Information to attach

[root@pb13 ~]# lxc config show huge-node04-dc1-demo3-cloudian-eu --expanded
architecture: x86_64
config:
  boot.autostart: "1"
  image.architecture: x86_64
  image.description: Centos 7 x86_64 (20190325_07:08)
  image.name: centos-7-x86_64-default-20190325_07:08
  image.os: centos
  image.release: "7"
  image.serial: "20190325_07:08"
  image.variant: default
  limits.cpu: "4"
  limits.memory: 8GB
  raw.lxc: |
    # lxc.aa_profile = lxc-container-default-with-mounting  # TODO
    lxc.cgroup.devices.allow = c 10:137 rwm
    lxc.cgroup.devices.allow = b 7:* rwm
    lxc.cgroup.devices.allow = c 10:237 rwm
  security.privileged: "1"
  user.access_interface: provision
  user.cloudian.installer: "0"
  user.network-config: |+
    version: 1
    config:
    - name: provision
      type: physical
      subnets:
      - type: dhcp

    - name: eth0
      type: physical

    - name: eth1
      type: physical

    - name: ipmi
      type: physical

  user.user-data: |+
    #cloud-config

    runcmd:
    - sed -i 's/session.*required.*pam_loginuid.so/#session\trequired\tpam_loginuid.so/' /etc/pam.d/*
    - sed -i 's/session.*required.*pam_limits.so/#session\trequired\tpam_limits.so/' /etc/pam.d/*

    locale: en_US.UTF-8
    timezone: Europe/Amsterdam

    users:
    - name: root
    [...]

  volatile.base_image: 2d8190b364998ba6edfbcd08509ffce3433f8e84c864a394e9f5c305bacf52f8
  volatile.eth0.hwaddr: 00:16:3e:4a:10:73
  volatile.eth1.hwaddr: 00:16:3e:fd:6a:b9
  volatile.idmap.base: "0"
  volatile.idmap.current: '[]'
  volatile.idmap.next: '[]'
  volatile.ipmi.hwaddr: 00:16:3e:ad:a7:1a
  volatile.last_state.idmap: '[]'
  volatile.last_state.power: STOPPED
  volatile.provision.hwaddr: 00:16:3e:ec:01:5a
devices:
  eth0:
    name: eth0
    nictype: bridged
    parent: testbr0
    type: nic
  eth1:
    name: eth1
    nictype: bridged
    parent: testbr0
    type: nic
  ipmi:
    name: ipmi
    nictype: bridged
    parent: testbr0
    type: nic
  provision:
    name: provision
    nictype: bridged
    parent: testbr0
    type: nic
  root:
    path: /
    pool: cloudian
    type: disk
ephemeral: false
profiles:
- default
stateful: false
description: ""
location: none
metadata:
  context:
    driver: dir
    instance: huge-node04-dc1-demo3-cloudian-eu
    pool: cloudian
    project: default
  level: dbug
  message: DeleteInstance started
timestamp: "2020-03-09T06:41:09.868835768-07:00"
type: logging

location: none
metadata:
  context:
    driver: dir
    instance: huge-node04-dc1-demo3-cloudian-eu
    pool: cloudian
    project: default
    volName: huge-node04-dc1-demo3-cloudian-eu
  level: dbug
  message: Deleting instance volume
timestamp: "2020-03-09T06:41:09.869267335-07:00"
type: logging

location: none
metadata:
  context:
    driver: dir
    instance: huge-node04-dc1-demo3-cloudian-eu
    pool: cloudian
    project: default
  level: dbug
  message: DeleteInstance finished
timestamp: "2020-03-09T06:41:09.870833406-07:00"
type: logging

location: none
metadata:
  context: {}
  level: dbug
  message: 'Failure for task operation: 04a50941-9e9f-459e-99d6-6c26a6d4c21d: Failed
    to remove ''/var/snap/lxd/common/lxd/storage-pools/cloudian/containers/huge-node04-dc1-demo3-cloudian-eu'':
    unlinkat /var/snap/lxd/common/lxd/storage-pools/cloudian/containers/huge-node04-dc1-demo3-cloudian-eu/rootfs/var/log/hsh/hsh.log:
    operation not permitted'
timestamp: "2020-03-09T06:41:09.870859852-07:00"
type: logging
mdione-cloudian commented 4 years ago

On the same note, maybe I could run cleanup code for the blocking file if there were a pre-stop/pre-delete hook. Should I open a separate issue for that?

stgraber commented 4 years ago

We're pretty allergic to hooks, as making them behave consistently in a cluster and integrate properly with the read-only filesystem of the snap is a pain. Instead, we'd rather have LXD do the right thing from the start. In this case, we'll most likely want to use our own walk-based recursive removal logic and, when encountering that error, attempt to clear the attribute and retry the deletion.
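A rough shell sketch of that retry approach (not LXD's actual Go implementation; the helper name is hypothetical): attempt the recursive removal, and if it fails, recursively strip the append-only/immutable attributes and try once more.

remove_instance_dir() {
    local dir="$1"
    if ! rm -rf "$dir" 2>/dev/null; then
        # Clear the append-only (a) and immutable (i) attributes on the whole
        # tree, then retry the removal once.
        chattr -R -ai "$dir" 2>/dev/null || true
        rm -rf "$dir"
    fi
}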

mdione-cloudian commented 4 years ago

Then you should check which other attributes and alternative methods of limiting capabilities (AppArmor? SELinux? I really don't know) are available that could lead to this kind of issue.

stgraber commented 4 years ago

LXD itself normally runs unconfined at the LSM layer and its containers can't load policies to prevent themselves from getting deleted. So I can only think of the usual suspects:

stgraber commented 4 years ago

Confirmed that, as expected, this can only be a problem inside a privileged container, so we don't need to treat this as a security/DoS issue, at least.

mdione-cloudian commented 4 years ago

I can confirm that LXD 3.23 fixes this. You can close this issue.