canonical / lxd

Powerful system container and virtual machine manager
https://canonical.com/lxd
GNU Affero General Public License v3.0
4.33k stars 927 forks source link

lxc launch/copy causes ssh connections to other containers/VMs to drop #7501

Closed sergiodj closed 4 years ago

sergiodj commented 4 years ago

Required information

config: {}
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- macaroon_authentication
- network_sriov
- console
- restrict_devlxd
- migration_pre_copy
- infiniband
- maas_network
- devlxd_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- devlxd_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- candid_authentication
- backup_compression
- candid_config
- nvidia_runtime_config
- storage_api_volume_snapshots
- storage_unmapped
- projects
- candid_config_key
- network_vxlan_ttl
- container_incremental_copy
- usb_optional_vendorid
- snapshot_scheduling
- container_copy_project
- clustering_server_address
- clustering_image_replication
- container_protection_shift
- snapshot_expiry
- container_backup_override_pool
- snapshot_expiry_creation
- network_leases_location
- resources_cpu_socket
- resources_gpu
- resources_numa
- kernel_features
- id_map_current
- event_location
- storage_api_remote_volume_snapshots
- network_nat_address
- container_nic_routes
- rbac
- cluster_internal_copy
- seccomp_notify
- lxc_features
- container_nic_ipvlan
- network_vlan_sriov
- storage_cephfs
- container_nic_ipfilter
- resources_v2
- container_exec_user_group_cwd
- container_syscall_intercept
- container_disk_shift
- storage_shifted
- resources_infiniband
- daemon_storage
- instances
- image_types
- resources_disk_sata
- clustering_roles
- images_expiry
- resources_network_firmware
- backup_compression_algorithm
- ceph_data_pool_name
- container_syscall_intercept_mount
- compression_squashfs
- container_raw_mount
- container_nic_routed
- container_syscall_intercept_mount_fuse
- container_disk_ceph
- virtual-machines
- image_profiles
- clustering_architecture
- resources_disk_id
- storage_lvm_stripes
- vm_boot_priority
- unix_hotplug_devices
- api_filtering
- instance_nic_network
- clustering_sizing
- firewall_driver
- projects_limits
- container_syscall_intercept_hugetlbfs
- limits_hugepages
- container_nic_routed_gateway
- projects_restrictions
- custom_volume_snapshot_expiry
- volume_snapshot_scheduling
- trust_ca_certificates
- snapshot_disk_usage
- clustering_edit_roles
- container_nic_routed_host_address
- container_nic_ipvlan_gateway
- resources_usb_pci
- resources_cpu_threads_numa
- resources_cpu_core_die
- api_os
- container_nic_routed_host_table
- container_nic_ipvlan_host_table
- container_nic_ipvlan_mode
- resources_system
- images_push_relay
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
environment:
  addresses: []
  architectures:
  - x86_64
  - i686
  certificate: |
    -----BEGIN CERTIFICATE-----
    MIICBDCCAYmgAwIBAgIQU7VKkGmJVt+IESout3IkrDAKBggqhkjOPQQDAzA0MRww
    GgYDVQQKExNsaW51eGNvbnRhaW5lcnMub3JnMRQwEgYDVQQDDAtyb290QHBzaXF1
    ZTAeFw0yMDA0MjAxOTQ5MTBaFw0zMDA0MTgxOTQ5MTBaMDQxHDAaBgNVBAoTE2xp
    bnV4Y29udGFpbmVycy5vcmcxFDASBgNVBAMMC3Jvb3RAcHNpcXVlMHYwEAYHKoZI
    zj0CAQYFK4EEACIDYgAE586++wS8PbwSoWR6DddJ9K8njagWtKxBi2GYPCCPL6Lb
    mjEz83e+TKrraONb60yDUKeLHjwVpLld8anQZLRFqF2rxM9RnC83qOtadAtBVz16
    Z/4O8UxyTDERNrXsOTw7o2AwXjAOBgNVHQ8BAf8EBAMCBaAwEwYDVR0lBAwwCgYI
    KwYBBQUHAwEwDAYDVR0TAQH/BAIwADApBgNVHREEIjAgggZwc2lxdWWHBH8AAAGH
    EAAAAAAAAAAAAAAAAAAAAAEwCgYIKoZIzj0EAwMDaQAwZgIxAPNmlIPZagjhr1w4
    dOnobuW/q9/N8U5mS8wbrh+jLTKTqV6IMI2IX6TbkIntmo3LQAIxAPo8r9KDSiq6
    hXZNPkMOC2zJD3IfxmabGMI8bWvjhxiN9rZ4NXNBvbHFANp8hLkf3Q==
    -----END CERTIFICATE-----
  certificate_fingerprint: 35c4fab6a3999c653bc13ff38c764f3b98573d6f444ccb196934f43a7a93c9eb
  driver: lxc
  driver_version: 4.0.2
  firewall: xtables
  kernel: Linux
  kernel_architecture: x86_64
  kernel_features:
    netnsid_getifaddrs: "true"
    seccomp_listener: "true"
    seccomp_listener_continue: "true"
    shiftfs: "false"
    uevent_injection: "true"
    unpriv_fscaps: "true"
  kernel_version: 5.4.0-31-generic
  lxc_features:
    cgroup2: "true"
    mount_injection_file: "true"
    network_gateway_device_route: "true"
    network_ipvlan: "true"
    network_l2proxy: "true"
    network_phys_macvlan_mtu: "true"
    network_veth_router: "true"
    pidfd: "true"
    seccomp_notify: "true"
  os_name: Ubuntu
  os_version: "20.04"
  project: default
  server: lxd
  server_clustered: false
  server_name: psique
  server_pid: 2091969
  server_version: "4.1"
  storage: zfs
  storage_version: 0.8.3-1ubuntu12

Issue description

When I'm connected to one of my containers/VMs via ssh and I perform a `lxc launch` or a `lxc copy` & `lxc start`, I notice that the existing ssh connections get closed with either `Connection to 10.101.133.21 closed by remote host.` or `client_loop: send disconnect: Broken pipe`.

The way I work is to have a few "template" containers and VMs, and then copy them over to temporary instances, which means that I'm constantly facing this issue.

Steps to reproduce

  1. ssh into an existing container/VM
  2. Perform a `lxc launch` or a `lxc copy` & `lxc start`.

Information to attach

Not sure this is relevant, but here it is:

[1537367.231215] lxdbr0: port 6(veth8f7be573) entered blocking state
[1537367.231217] lxdbr0: port 6(veth8f7be573) entered disabled state
[1537367.231316] device veth8f7be573 entered promiscuous mode
[1537367.231354] lxdbr0: port 6(veth8f7be573) entered blocking state
[1537367.231355] lxdbr0: port 6(veth8f7be573) entered forwarding state
[1537367.254789] lxdbr0: port 6(veth8f7be573) entered disabled state
[1537367.448524] eth0: renamed from vethaf52ccb7
[1537367.490134] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[1537367.490185] lxdbr0: port 6(veth8f7be573) entered blocking state
[1537367.490187] lxdbr0: port 6(veth8f7be573) entered forwarding state
Name: test-bla
Location: none
Remote: unix://
Architecture: x86_64
Created: 2020/06/08 20:50 UTC
Status: Running
Type: container
Profiles: ubuntu
Pid: 3366044
Ips:
  eth0: inet    10.101.133.177  veth8f7be573
  eth0: inet6   fd42:7dc5:359f:4d1f:216:3eff:fe96:688a  veth8f7be573
  eth0: inet6   fe80::216:3eff:fe96:688a        veth8f7be573
  lo:   inet    127.0.0.1
  lo:   inet6   ::1
Resources:
  Processes: 15
  Disk usage:
    root: 48.66MB
  CPU usage:
    CPU usage (in seconds): 4
  Memory usage:
    Memory (current): 63.01MB
    Memory (peak): 92.23MB
  Network usage:
    eth0:
      Bytes received: 8.24kB
      Bytes sent: 3.18kB
      Packets received: 85
      Packets sent: 31
    lo:
      Bytes received: 0B
      Bytes sent: 0B
      Packets received: 0
      Packets sent: 0

Log:

lxc test-bla 20200608205058.451 WARN     cgfsng - cgroups/cgfsng.c:mkdir_eexist_on_last:1152 - File exists - Failed to create directory "/sys/fs/cgroup/cpuset//lxc.monitor.test-bla"
lxc test-bla 20200608205058.453 WARN     cgfsng - cgroups/cgfsng.c:mkdir_eexist_on_last:1152 - File exists - Failed to create directory "/sys/fs/cgroup/cpuset//lxc.payload.test-bla"

This is important. I use a very customized profile.

architecture: x86_64
config:
  boot.autostart: "false"
  image.architecture: amd64
  image.description: Ubuntu groovy amd64 (20200608_07:42)
  image.os: Ubuntu
  image.release: groovy
  image.serial: "20200608_07:42"
  image.type: squashfs
  raw.lxc: |-
    lxc.apparmor.profile = unconfined
    lxc.cgroup.devices.allow = a
    lxc.cap.drop =
    lxc.mount.auto = cgroup:rw proc:rw sys:ro
  security.nesting: "true"
  security.privileged: "true"
  user.user-data: "#cloud-config\nusers:\n  - name: sergio\n    ssh_authorized_keys:\n      - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAEAQsNHqfP60KNYEli9u/2PnSjRJ6IZQH2SFiszL07W75/tkF6Os/YmjyEk5Ns2dnr2fBpv+iJbFmDmBVdgbHeDFK3xgoI6o/VTemDcdHwObOKrJ0KS5oGnerPfDTQvuA/DySR2MYuAU7QkVpNVThJmvVnyuIZBvOnDWv9Lt16sN8viSn9p063fnodCdtEfH6KLmpLkaAU6NQw2ZycOPW65Qc+PSbD7XuvzatkbkoDT9Z3lTKEHOzd41e1m/736hIA8GSwDa7Pf0I0u0zgL+crfRks1C8MzpYPa97wMm0N98e8mrciHU6mR6L9wF/D0LLJBGlzP4P838oC++rz3NOx2ZrGdDmY3AjII7hlOd48D4+fwcoZHaUC3p3Byzieddfq7g3ZwNyseIZXNJFZLb9h59fiorBtqM1eQW3GiA7K5j5WhZViwd3m78q9AUxXEFq1zg4prkXjeDIxPH/h/fS41yF7fJjnjkvROVyc4grKuLP6+D/oTzc1UNdJBzbRIifJPFc8avXuRhTGxTEU1SYWgld5glnWg8IQe/x0E+m83XatvHvGteUPbrfIcIiWyeERlXt87MClWh2rO9vpQHdXf5RW0kZXO1vKYXNK/s6bXkQFAX18qAUHcAeRaxGDtzykm/JjQyITWX+uP8pdA7dYIhFb9HQqtfZxyTXdjneVGjkYEwfYJE6XVy7soDq1BcuZx4wQWtUGn0bfoTeOJ53/NeN4NLLwPhdNUziO/H7eQs06vIggNnuRSh793HOx+0bF3JVqoI4hw/sXe9c2CVnagja+wkJQCoGDwtX/3I2qDznL76WuuOxtOezhOILW/cwqXOzaF43MgG2jW0B8ThSiyiNeTlRSC7l5Dqf9sR1KskNal+jTdFF9YTMarWDnfkHSMva8imMXBRk5H0BiZdrDXTCo2Opf92u2mGUOluovKoDST+7780SGHMwaObc80i7ovG9tf3hP4K7JfdKmxnkkqxCSVs4jYyqOVngtr7q1tX3xFrF/u7DLNJceNGWqmzu7Nx1kWBg1UAxHfi/ASQucSHWfZ5FtVMucywlpoOx+s++5uzUkpM4wcSNU9GgcM8jvJGR0KqV0z/fZ26hJKn3i2mWLYcVA6tkuvm4e1r5LaDUQejlvmox9+UQDrhNXld/Pc/Xd8fnGRtV5OjbJM1FzsqASofjm0S361f8QKmqmFalY1Y+6+EVFh2vNhUlkrs2nmx2l782WczyH7yB9LN5iixKsI3mUCuvalPtdk/1nQWrk/0oZ0bk4qYspcyiPv7SkLJ/NbH3cIz41OdG9FO2+s+HdXSVlf9BwLEWaa4sMkPLEOoQI0VytcAGjQr4Muw3+E2iirMGNtkTpCbnkm7dJNuhN sergio@psique\n    sudo: ['ALL=(ALL) NOPASSWD:ALL']\n    groups: [root,sudo,staff]\n    shell: /bin/bash\ndisable_root: false\nruncmd:\n  - snap install --classic --edge git-ubuntu\n  - systemctl stop unattended-upgrades\n  - passwd -d root\n  - passwd -d sergio\n  - echo \"debconf debconf/priority select low\" | sudo debconf-set-selections\n  - DEBIAN_FRONTEND=noninteractive dpkg-reconfigure debconf\n  - DEBIAN_FRONTEND=noninteractive apt-get update -y\n  - DEBIAN_FRONTEND=noninteractive 
apt-get dist-upgrade -y\n  - DEBIAN_FRONTEND=noninteractive apt-get autoremove -y\n  - DEBIAN_FRONTEND=noninteractive apt-get autoclean -y\n  - systemctl disable unattended-upgrades\n  - rm -rf /root/.bashrc /root/.gnupg /root/.profile /root/.ssh/\n  - cp -rfp /home/sergio/.ssh /root/.ssh\n  - chown -R root:root /root/.ssh\n  - ln -s /home/sergio/.bashrc /root/.bashrc\n  - ln -s /home/sergio/.my_alias /root/.my_alias\n  - ln -s /home/sergio/.vimrc /root/.vimrc\n  - ln -s /home/sergio/.viminfo /root/.viminfo\n  - ln -s /home/sergio/.vim /root/.vim\n  - ln -s /home/sergio/.alias /root/.alias\n  - ln -s /home/sergio/.bash_profile /root/.bash_profile\n  - ln -s /home/sergio/.profile /root/.profile\n  - ln -s /home/sergio/.gnupg /root/.gnupg\n  - ln -s /home/sergio/.emacs.d/ /root/.emacs.d\n  - ln -s /home/sergio/emacs-lisp /root/emacs-lisp\n  - ssh-keygen -A\npackages:\n  - man\n  - manpages\n  - locales\n  - less\n  - vim\n  - jq\n  - uuid\n  - bash-completion\n  - sudo\n  - rsync\n  - ncurses-term\n  - iputils-arping\n  - iputils-ping\n  - iputils-tracepath\n  - traceroute\n  - mtr-tiny\n  - tcpdump\n  - dnsutils\n  - ssh-import-id\n  - openssh-server\n  - openssh-client\n  - build-essential\n  - devscripts\n  - git-buildpackage\n  - ubuntu-dev-tools\n  - linux-headers-generic\n  - gdb\n  - strace\n  - ltrace\n  - lsof\n  - xterm\n  - vim-nox\n  - emacs-nox\n  - elpa-debian-el\n  - xcscope-el\npower_state:\n  delay: 'now'\n  mode: reboot\n  timeout: 10\n  condition: True\nfinal_message: \"The system is finally up! 
Enjoy!\"\nwrite_files:\n  - path: /etc/ssh/sshd_config\n    content: |\n      Port 22\n      AddressFamily any\n      SyslogFacility AUTH\n      LogLevel INFO\n      PermitRootLogin yes\n      PubkeyAuthentication yes\n      PasswordAuthentication yes\n      ChallengeResponseAuthentication no\n      GSSAPIAuthentication no\n      HostbasedAuthentication no\n      PermitEmptyPasswords no\n      UsePAM yes\n      IgnoreUserKnownHosts yes\n      IgnoreRhosts yes\n      X11Forwarding yes\n      X11DisplayOffset 10\n      X11UseLocalhost yes\n      PermitTTY yes\n      PrintMotd no\n      TCPKeepAlive yes\n      PermitTunnel yes\n      ClientAliveInterval 5\n      Banner none\n      AcceptEnv LANG LC_* EDITOR PAGER SYSTEMD_EDITOR\n      Subsystem\tsftp /usr/lib/openssh/sftp-server\n  - path: /etc/ssh/ssh_config\n    content: |\n      Host *\n        ForwardAgent no\n        ForwardX11 no\n        PasswordAuthentication yes\n        CheckHostIP no\n        AddressFamily any\n        SendEnv LANG LC_* EDITOR PAGER\n        StrictHostKeyChecking no\n        HashKnownHosts yes\n        ServerAliveInterval 5\n  - path: /etc/sudoers\n    content: |\n        Defaults env_keep += \"LANG LANGUAGE LINGUAS LC_* _XKB_CHARSET\"\n        Defaults env_keep += \"HOME EDITOR SYSTEMD_EDITOR PAGER\"\n        Defaults env_keep += \"XMODIFIERS GTK_IM_MODULE QT_IM_MODULE QT_IM_SWITCHER\"\n        Defaults secure_path=\"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"\n        Defaults logfile=/var/log/sudo.log,loglinelen=0\n        Defaults !syslog, !pam_session\n        root ALL=(ALL) NOPASSWD: ALL\n        %wheel ALL=(ALL) NOPASSWD: ALL\n        %sudo ALL=(ALL) NOPASSWD: ALL\n        sergio ALL=(ALL) NOPASSWD: ALL\napt:\n  proxy: \"http://10.101.133.1:3142/\"\n  http_proxy: \"http://10.101.133.1:3142/\"\n  ftp_proxy: \"http://10.101.133.1:3142/\"\n  https_proxy: \"http://10.101.133.1:3142/\"\n  preserve_sources_list: false\n  primary:\n    - arches: [default]\n      uri: 
\"http://ca.archive.ubuntu.com/ubuntu\"\n  sources_list: |\n    deb $MIRROR $RELEASE main restricted universe multiverse\n    deb $MIRROR $RELEASE-updates main restricted universe multiverse\n    deb $MIRROR $RELEASE-proposed main restricted universe multiverse\n    # deb $MIRROR $RELEASE-backports main restricted universe multiverse\n    # deb $SECURITY $RELEASE-security universe multiverse\n    deb-src $MIRROR $RELEASE main restricted universe multiverse\n    deb-src $MIRROR $RELEASE-updates main restricted universe multiverse\n    deb-src $MIRROR $RELEASE-proposed main restricted universe multiverse\n    # deb-src $MIRROR $RELEASE-backports main restricted universe multiverse\n    # deb-src $SECURITY $RELEASE-security universe multiverse\n  conf: |\n    Acquire::http::Proxy \"http://10.101.133.1:3142/\";\n    Dpkg::Options {\n      \"--force-confdef\";\n      \"--force-confold\";\n    };\n  sources:\n    debug.list:\n      source: |\n        # debug symbols\n        # deb http://ddebs.ubuntu.com $RELEASE main restricted universe multiverse\n        # deb http://ddebs.ubuntu.com $RELEASE-updates main restricted universe multiverse\n        # deb http://ddebs.ubuntu.com $RELEASE-proposed main restricted universe multiverse\n      keyid: C8CAB6595FDFF622\n"
  volatile.base_image: a7b85f2d8c43ec47c9bd181d4ef58e2de199b55c89ed89174464c10c1002b078
  volatile.idmap.base: "0"
  volatile.idmap.current: '[]'
  volatile.idmap.next: '[]'
  volatile.last_state.idmap: '[]'
  volatile.last_state.power: RUNNING
  volatile.lxdbr0.host_name: veth8f7be573
  volatile.lxdbr0.hwaddr: 00:16:3e:96:68:8a
  volatile.lxdbr0.name: eth0
devices:
  homedir:
    path: /home
    source: /home
    type: disk
  lxdbr0:
    nictype: bridged
    parent: lxdbr0
    type: nic
  root:
    path: /
    pool: default
    type: disk
ephemeral: false
profiles:
- ubuntu
stateful: false
description: ""
t=2020-06-08T16:50:41-0400 lvl=info msg="Created container" ephemeral=false name=test-bla project=default
t=2020-06-08T16:50:45-0400 lvl=info msg="Starting container" action=start created=2020-06-08T16:50:41-0400 ephemeral=false name=test-bla project=default stateful=false used=1969-12-31T19:00:00-0500
t=2020-06-08T16:50:45-0400 lvl=info msg="Started container" action=start created=2020-06-08T16:50:41-0400 ephemeral=false name=test-bla project=default stateful=false used=1969-12-31T19:00:00-0500
t=2020-06-08T16:50:58-0400 lvl=info msg="Container initiated reboot" action=reboot created=2020-06-08T16:50:41-0400 ephemeral=false name=test-bla project=default stateful=false used=2020-06-08T16:50:45-0400
t=2020-06-08T16:50:58-0400 lvl=info msg="Shut down container" action=reboot created=2020-06-08T16:50:41-0400 ephemeral=false name=test-bla project=default stateful=false used=2020-06-08T16:50:45-0400
t=2020-06-08T16:50:58-0400 lvl=info msg="Starting container" action=start created=2020-06-08T16:50:41-0400 ephemeral=false name=test-bla project=default stateful=false used=2020-06-08T16:50:45-0400
t=2020-06-08T16:50:58-0400 lvl=info msg="Started container" action=start created=2020-06-08T16:50:41-0400 ephemeral=false name=test-bla project=default stateful=false used=2020-06-08T16:50:45-0400
stgraber commented 4 years ago

Setting a fixed mac address on your bridge usually helps with that.

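The Linux kernel somewhat annoyingly picks the lowest MAC address among a bridge's ports as the bridge's own address, so attaching a new instance's veth device can change the bridge's MAC address and disrupt existing connections.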

LXD doesn't do any network changes on instance startup so that's the most likely issue and affects people differently mostly based on luck.

Setting bridge.hwaddr to some fixed value should do the trick, e.g. `lxc network set lxdbr0 bridge.hwaddr 00:16:3e:00:00:01`.

sergiodj commented 4 years ago

Thanks, it seemed to work indeed.