laralar commented 6 years ago

Required information

Distribution: ubuntu
Distribution version: 18.04.1

The output of "lxc info" or if that fails:

root@node33:~# lxc info
config:
core.https_address: '[::]:8443'
core.trust_password: true
maas.api.key: rVFh7j9tUQ5x8aZ8NK:J9fETgjvLG5behn9br:D8vxhRFbGRx6zw4xtHMrXA77JL3gHxH9
maas.api.url: http://10.3.4.10:5240/MAAS
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- macaroon_authentication
- network_sriov
- console
- restrict_devlxd
- migration_pre_copy
- infiniband
- maas_network
- devlxd_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- devlxd_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- candid_authentication
- backup_compression
- candid_config
- nvidia_runtime_config
- storage_api_volume_snapshots
- storage_unmapped
- projects
- candid_config_key
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
environment:
addresses:
- 10.3.4.33:8443
- 10.3.5.233:8443
- 192.168.122.1:8443
architectures:
- x86_64
- i686
certificate: |
-----BEGIN CERTIFICATE-----
MIIFRzCCAy+gAwIBAgIRALyIp8eIXIP4p1Oc/AOIV9kwDQYJKoZIhvcNAQELBQAw
NDEcMBoGA1UEChMTbGludXhjb250YWluZXJzLm9yZzEUMBIGA1UEAwwLcm9vdEBu
b2RlMzMwHhcNMTgwNjEzMDQxMDQ0WhcNMjgwNjEwMDQxMDQ0WjA0MRwwGgYDVQQK
ExNsaW51eGNvbnRhaW5lcnMub3JnMRQwEgYDVQQDDAtyb290QG5vZGUzMzCCAiIw
DQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBANJRN2ESwgLOUMyMKJWsL62b1LtE
l+xczFN+RBQ2ctk0a08PXWbdbOSGfV9EgxyVlf/0MRl3Rrey/xvtc4KQix++n7b+
g6yYo6HNq4WV2S+BmNtpe293UKTqzizb0Ie9ynalnACsm027hXDsNjaRzbGLR+z9
gxqFj4tR9gLf3/NTSdXrFRpHTdpciIHIdwDgphkEgSVImX1rJvaaI9ODtWjAWxSq
UYslnLKW7nXXM5ep/Tf/VzLH+4BhNnvccqCLIxes8yJbUczbxe/wLPiEK+aewB/7
J/f0NfrbDWlfjWCu9BjxoFwjSznlLxC+G4BARh/7gxJAIDaDI9Xa1SX+tPCpU14q
tQqVcb48D/IBF+b5lVojCgXBAWGEJyvRJxY5dfqG2y4WRm3fHg+6szAtve6FEAhI
FAufpxF8u/esgpcITUZO134/tfrmFeMy8l+dE+3tD4AOWs46SsjEO8gsyPtj/3a3
O5gTffzmucKKmEDHd+3Q2UE09HYhgMtW2+QL6uHFU1WOnalR1I5KFqiEqUcs3VvG
R98JdLWzd2r8Q38ROVtlllL2TI+/f5QE8YsF2pVKjd8gcTmeUuEEzYf+eMyoeJuN
DLwSaxwyujpORsRqOlcLTAZ1hxkKcSkbj03GzjC/BvilAh7uXNjdCunagb7QMI55
DL3VKLJWJ0qAkWKDAgMBAAGjVDBSMA4GA1UdDwEB/wQEAwIFoDATBgNVHSUEDDAK
BggrBgEFBQcDATAMBgNVHRMBAf8EAjAAMB0GA1UdEQQWMBSCBm5vZGUzM4cECgME
IYcEwKh6ATANBgkqhkiG9w0BAQsFAAOCAgEAc3c4D6DxnUqmQdRm8H3S0gdJFU/5
JSDZiCobelXQ/YrAc9ODlbe7aI+5HZ0/U/JqNyApjXAuxafmT2T8YClopFovK5TG
Os8SpxDFEK2yYzq+XAt1Q4IXXbD2Ou1dvGgLnILmjhcCnYttXKFmAL+7yQLQA59G
usZtY6YVnLP9yZuPbb/9oKmqTn7mQcEMwHOPh+YLvMmcLlHACuZ3bRuPqZ1uqv2/
TWLXNhEjjLjZFoJBSCXU6/EoNtjHrVHcolE2ABTgexF8MiA+spLt5/2JxQuHRER1
sMPH+SpOPtMrWEQZCupr6wsJwV6CutJKJee6I9DmJvVvmR6vBVjd8QbWTcHiZPvD
Nb0h4irA0iWJ2rco7bENU8eVKhNt7mj60pTzIJz9bsUPUEkxRjVFPCzGqa4KZt1n
ohadH2PFUe7QRqz/mmT52EQ7nvG/Gd5e15cWQW1Mv7wTa8vmpO1gRLbQMz6BMP7b
kD87x7+6WepnrC/yK5tMMrogMkc+XnLtSEC7jF09rvlQlM0iQrFpXm2LkVqQEgFk
fxafvvBhD8PkwkIi0MtOohdwqYvXPC8jYWXpZguLdwUwd2SF5kgXmh/1sAXbV0aB
sDgRUry95PKDkxs3wIeycw8RQWy3AQ9gnQMnoeXKDGrd+auRJlhESyTcyqrg07o2
v+MMAXOAocNFz50=
-----END CERTIFICATE-----
certificate_fingerprint: 38779b656fd9a6945b75c68c15e02f9bcd005308c9eb1d5626f9aad4e3b1cfaa
driver: lxc
driver_version: 3.0.2
kernel: Linux
kernel_architecture: x86_64
kernel_version: 4.15.0-36-generic
server: lxd
server_pid: 4159
server_version: "3.6"
storage: zfs | btrfs
storage_version: 0.7.5-1ubuntu16.3 | 4.4
server_clustered: false
server_name: node33
project: default
root@node33:~#

Issue description

I have around 40 hosts running LXD.

I have been applying the latest updates to them regularly, but since I upgraded them to Ubuntu 18.04, from time to time a server gets stuck. I then have to log in to the remote console using iLO/iDRAC and reboot the server, since it doesn't even respond to keystrokes and a console login is not possible.

On the console there are no messages

In /var/log/syslog there is no relevant message; the only entries around the event are the last one before the server got stuck and the first one after the reboot.

For example:


Oct 22 17:17:01 node33 CRON[21497]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
Oct 22 18:17:01 node33 CRON[5452]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
Oct 22 18:33:39 node33 systemd[1]: Starting Daily apt download activities...
Oct 22 18:33:40 node33 systemd[1]: Started Daily apt download activities.
Oct 22 19:17:01 node33 CRON[48541]: (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
Oct 22 19:28:34 node33 snapd[42014]: storehelpers.go:398: cannot refresh: snap has no updates available: "core", "lxd"
Oct 22 19:28:34 node33 snapd[42014]: autorefresh.go:387: auto-refresh: all snaps are up-to-date
Oct 23 09:58:22 node33 systemd-modules-load[793]: Inserted module 'iscsi_tcp'
Oct 23 09:58:22 node33 systemd-modules-load[793]: Inserted module 'ib_iser'
Oct 23 09:58:22 node33 systemd[1]: Started Create list of required static device nodes for the current kernel.
Oct 23 09:58:22 node33 systemd[1]: Started Load Kernel Modules.
Oct 23 09:58:22 node33 systemd[1]: Started LVM2 metadata daemon.

There is nothing else installed or running on the hosts except LXD and a few containers on each host. It happens randomly (so far I haven't been able to identify the cause) and, if I am not mistaken, it started happening when I upgraded from 16.04 to 18.04.

At first I thought there was some kernel issue and there were some messages in the console regarding some clues about this. But now there is nothing.

What else could I try, and which logs should I look at, to get a clue about what is happening? I am not very used to exploring system/kernel logs, which is why I am asking for help.

LXD log doesn't show anything relevant either. These are the last messages of previous log file lxd.log.1 :

lvl=info msg="Done updating instance types" t=2018-10-22T01:54:31+0530
lvl=info msg="Updating images" t=2018-10-22T07:54:22+0530
lvl=info msg="Done updating images" t=2018-10-22T07:54:22+0530
lvl=info msg="Updating images" t=2018-10-22T13:54:22+0530
lvl=info msg="Done updating images" t=2018-10-22T13:54:22+0530

dmesg, if I am not mistaken, only shows information since the last boot.

The issue is happening on different hardware: HP ProLiant DL380p servers with 24GB to 48GB of RAM, Dell servers with 16GB to 128GB of RAM, and blade servers with 4GB of RAM.

It seems to happen more often on some servers that may be overloaded with regard to the memory assigned to the running LXD containers (after all the respective containers are up, there is hardly any memory left on the host). Could it be that swap is exhausted?

But last night it happened on a server that on average has a few GB free and where no major CPU-consuming container is running.

As I mentioned, I only have LXD containers on the hosts, nothing else. On some I have a few KVM guests, but the hang-ups happen mostly on servers where there are none of these.

Any help would be appreciated

Thanks

Steps to reproduce

I have not been able to reproduce it so far.

Information to attach

[ ] Any relevant kernel output (dmesg)
[ ] Container log (lxc info NAME --show-log)
[ ] Container configuration (lxc config show NAME --expanded)
[ ] Main daemon log (at /var/log/lxd/lxd.log or /var/snap/lxd/common/lxd/logs/lxd.log)
[ ] Output of the client with --debug
[ ] Output of the daemon with --debug (alternatively output of lxc monitor while reproducing the issue)

This is the only software that I have installed on the hosts:

pdsh -R ssh -w ubuntu@10.3.4.[33] sudo apt install -y zfsutils-linux iperf speedtest-cli \
     arp-scan cpu-checker qemu qemu-kvm libvirt-bin bridge-utils zram-config tree pdsh snapd ncdu ntp \
     ntpdate nfs-common criu
pdsh -R ssh -w ubuntu@10.3.4.[48] sudo apt purge -y lxd lxd-client
nohup pdsh -R ssh -w ubuntu@10.3.4.[48] sudo snap install lxd &

stgraber commented 6 years ago

Hi,

I'm going to close this as it's not an actual LXD issue.

I do however have one of my own servers running into this about once a week, exact same symptoms as you described and exact same frustration at not having anything printed on the console.

Can you file a bug at https://launchpad.net/ubuntu/+source/linux/+filebug and post the link to it here?

It's going to be a very hard one to track down but I will provide any help I can since I'm also affected.

Note that in my case, I'm nowhere near running out of memory on that system, CPU isn't busy, storage is NVME SSD without any reported issue and I've actually replaced the entire hardware (moved to another server) and still run into this.

stgraber commented 6 years ago

@laralar https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1799497

canonical / lxd

Help on server getting stuck #5197

Required information

Issue description

Steps to reproduce

Information to attach