weaveworks / ignite

Ignite a Firecracker microVM
https://ignite.readthedocs.org
Apache License 2.0
3.49k stars 222 forks source link

Expose Firecracker metrics over the Prometheus socket #84

Open luxas opened 5 years ago

luxas commented 5 years ago

This issue builds upon #83, but aims to port the Firecracker metrics, as per the spec, to Prometheus metrics:

Telemetry: Firecracker emits logs and metrics to the named pipes passed to the logging API. Any logs and metrics emitted while their respective pipes are full will be lost. Any such events will be signaled through the lost-logs and lost-metrics counters.

Inside of the container, these metrics are written to a FIFO at path /tmp/firecracker_metrics.fifo, and they look something like this:

$ cat /tmp/firecracker_metrics.fifo 
{"utc_timestamp_ms":1561983533965,"api_server":{"process_startup_time_us":0,"process_startup_time_cpu_us":0,"sync_outcome_fails":0,"sync_vmm_send_timeout_count":0},"block":{"activate_fails":0,"cfg_fails":0,"event_fails":0,"execute_fails":0,"invalid_reqs_count":0,"flush_count":0,"queue_event_count":0,"rate_limiter_event_count":0,"update_count":0,"update_fails":0,"read_count":0,"write_count":0},"get_api_requests":{"instance_info_count":0,"instance_info_fails":0,"machine_cfg_count":2,"machine_cfg_fails":0},"i8042":{"error_count":0,"missed_read_count":0,"missed_write_count":0,"read_count":0,"reset_count":0,"write_count":0},"logger":{"missed_metrics_count":0,"metrics_fails":0,"missed_log_count":0,"log_fails":0},"mmds":{"rx_accepted":0,"rx_accepted_err":0,"rx_accepted_unusual":0,"rx_bad_eth":0,"tx_bytes":0,"tx_errors":0,"tx_frames":0,"connections_created":0,"connections_destroyed":0},"net":{"activate_fails":0,"cfg_fails":0,"event_fails":0,"rx_queue_event_count":0,"rx_event_rate_limiter_count":0,"rx_tap_event_count":0,"rx_bytes_count":0,"rx_packets_count":0,"rx_fails":0,"tx_bytes_count":0,"tx_fails":0,"tx_packets_count":0,"tx_queue_event_count":0,"tx_rate_limiter_event_count":0,"tx_spoofed_mac_count":0},"patch_api_requests":{"drive_count":0,"drive_fails":0,"network_count":0,"network_fails":0},"put_api_requests":{"actions_count":1,"actions_fails":0,"boot_source_count":1,"boot_source_fails":0,"drive_count":1,"drive_fails":0,"logger_count":1,"logger_fails":0,"machine_cfg_count":1,"machine_cfg_fails":0,"network_count":1,"network_fails":0},"seccomp":{"num_faults":0},"vcpu":{"exit_io_in":0,"exit_io_out":0,"exit_mmio_read":0,"exit_mmio_write":0,"failures":0,"fitler_cpuid":0},"vmm":{"device_events":0,"panic_count":0},"uart":{"error_count":0,"flush_count":0,"missed_read_count":0,"missed_write_count":0,"read_count":0,"write_count":0},"memory":{"dirty_pages":0}}
{"utc_timestamp_ms":1561983593965,"api_server":{"process_startup_time_us":0,"process_startup_time_cpu_us":0,"sync_outcome_fails":0,"sync_vmm_send_timeout_count":0},"block":{"activate_fails":0,"cfg_fails":0,"event_fails":0,"execute_fails":0,"invalid_reqs_count":0,"flush_count":23,"queue_event_count":4507,"rate_limiter_event_count":0,"update_count":0,"update_fails":0,"read_count":32349184,"write_count":241479680},"get_api_requests":{"instance_info_count":0,"instance_info_fails":0,"machine_cfg_count":0,"machine_cfg_fails":0},"i8042":{"error_count":0,"missed_read_count":0,"missed_write_count":6,"read_count":362,"reset_count":0,"write_count":123},"logger":{"missed_metrics_count":0,"metrics_fails":0,"missed_log_count":0,"log_fails":0},"mmds":{"rx_accepted":0,"rx_accepted_err":0,"rx_accepted_unusual":0,"rx_bad_eth":0,"tx_bytes":0,"tx_errors":0,"tx_frames":0,"connections_created":0,"connections_destroyed":0},"net":{"activate_fails":0,"cfg_fails":0,"event_fails":0,"rx_queue_event_count":6,"rx_event_rate_limiter_count":0,"rx_tap_event_count":62047,"rx_bytes_count":6630,"rx_packets_count":41,"rx_fails":0,"tx_bytes_count":2576,"tx_fails":0,"tx_packets_count":18,"tx_queue_event_count":17,"tx_rate_limiter_event_count":0,"tx_spoofed_mac_count":0},"patch_api_requests":{"drive_count":0,"drive_fails":0,"network_count":0,"network_fails":0},"put_api_requests":{"actions_count":0,"actions_fails":0,"boot_source_count":0,"boot_source_fails":0,"drive_count":0,"drive_fails":0,"logger_count":0,"logger_fails":0,"machine_cfg_count":0,"machine_cfg_fails":0,"network_count":0,"network_fails":0},"seccomp":{"num_faults":0},"vcpu":{"exit_io_in":17077,"exit_io_out":21224,"exit_mmio_read":2835,"exit_mmio_write":2842,"failures":0,"fitler_cpuid":0},"vmm":{"device_events":66577,"panic_count":0},"uart":{"error_count":0,"flush_count":18553,"missed_read_count":0,"missed_write_count":0,"read_count":70,"write_count":18553},"memory":{"dirty_pages":0}}
luxas commented 5 years ago

The formatted metrics JSON looks like this:

{
    "utc_timestamp_ms": 1562009792168,
    "api_server": {
        "process_startup_time_us": 0,
        "process_startup_time_cpu_us": 0,
        "sync_outcome_fails": 0,
        "sync_vmm_send_timeout_count": 0
    },
    "block": {
        "activate_fails": 0,
        "cfg_fails": 0,
        "event_fails": 0,
        "execute_fails": 0,
        "invalid_reqs_count": 0,
        "flush_count": 2,
        "queue_event_count": 4,
        "rate_limiter_event_count": 0,
        "update_count": 0,
        "update_fails": 0,
        "read_bytes": 0,
        "write_bytes": 3072,
        "read_count": 0,
        "write_count": 3
    },
    "get_api_requests": {
        "instance_info_count": 0,
        "instance_info_fails": 0,
        "machine_cfg_count": 0,
        "machine_cfg_fails": 0
    },
    "i8042": {
        "error_count": 0,
        "missed_read_count": 0,
        "missed_write_count": 0,
        "read_count": 0,
        "reset_count": 0,
        "write_count": 0
    },
    "logger": {
        "missed_metrics_count": 0,
        "metrics_fails": 0,
        "missed_log_count": 0,
        "log_fails": 0
    },
    "mmds": {
        "rx_accepted": 0,
        "rx_accepted_err": 0,
        "rx_accepted_unusual": 0,
        "rx_bad_eth": 0,
        "rx_count": 0,
        "tx_bytes": 0,
        "tx_count": 0,
        "tx_errors": 0,
        "tx_frames": 0,
        "connections_created": 0,
        "connections_destroyed": 0
    },
    "net": {
        "activate_fails": 0,
        "cfg_fails": 0,
        "event_fails": 0,
        "rx_queue_event_count": 1,
        "rx_event_rate_limiter_count": 0,
        "rx_tap_event_count": 8,
        "rx_bytes_count": 1325,
        "rx_packets_count": 8,
        "rx_fails": 0,
        "rx_count": 24,
        "tx_bytes_count": 292,
        "tx_fails": 0,
        "tx_count": 8,
        "tx_packets_count": 4,
        "tx_queue_event_count": 4,
        "tx_rate_limiter_event_count": 0,
        "tx_spoofed_mac_count": 0
    },
    "patch_api_requests": {
        "drive_count": 0,
        "drive_fails": 0,
        "network_count": 0,
        "network_fails": 0,
        "machine_cfg_count": 0,
        "machine_cfg_fails": 0
    },
    "put_api_requests": {
        "actions_count": 0,
        "actions_fails": 0,
        "boot_source_count": 0,
        "boot_source_fails": 0,
        "drive_count": 0,
        "drive_fails": 0,
        "logger_count": 0,
        "logger_fails": 0,
        "machine_cfg_count": 0,
        "machine_cfg_fails": 0,
        "network_count": 0,
        "network_fails": 0
    },
    "rtc": {
        "error_count": 0,
        "missed_read_count": 0,
        "missed_write_count": 0
    },
    "seccomp": {
        "num_faults": 0
    },
    "vcpu": {
        "exit_io_in": 0,
        "exit_io_out": 0,
        "exit_mmio_read": 12,
        "exit_mmio_write": 12,
        "failures": 0,
        "fitler_cpuid": 0
    },
    "vmm": {
        "device_events": 17,
        "panic_count": 0
    },
    "uart": {
        "error_count": 0,
        "flush_count": 0,
        "missed_read_count": 0,
        "missed_write_count": 0,
        "read_count": 0,
        "write_count": 0
    },
    "memory": {
        "dirty_pages": 0
    }
}

Before looking at it, I had hoped to see a way to get VM CPU/memory usage, but that's not accessible here, at least.

Block read/write stats and rx/tx network stats seem useful, and maybe also dirty pages for memory.

luxas commented 4 years ago

Converting the Firecracker metrics to Prometheus format would be a wonderful contribution from the community.