microsoft / pai

Resource scheduling and cluster management for AI
https://openpai.readthedocs.io
MIT License
2.63k stars 548 forks source link

Why can't Grafana detect the node number? #931

Closed Beyyes closed 6 years ago

Beyyes commented 6 years ago

I have deployed a single-node PAI service, but the Detected Node Number in Grafana shows N/A, as shown in the picture below. How can I solve this problem?

image

fanyangCS commented 6 years ago

can you check if node exporter is up?

Beyyes commented 6 years ago

@fanyangCS How do I check whether node exporter is up?

Port 9100 can be reached. image

And this is the k8s daemonset: image

fanyangCS commented 6 years ago

it seems the webportal/grafana didn't get directed to the right address (Prometheus). Can you share the configuration yaml files?

Beyyes commented 6 years ago

I generated the cluster configuration YAML files using this quick-start.yaml.

# quick-start.yaml

# (Required) Please fill in the IP address of the server you would like to deploy PAI
machines:
  - 10.190.177.242

# (Required) Log-in info of all machines. System administrator should guarantee
# that the username/password pair is valid and has sudo privilege.
ssh-username: xxx   
ssh-password: xxx

# (Optional, default=22) Port number of ssh service on each machine.
#ssh-port: 22

# (Optional, default=DNS of the first machine) Cluster DNS.
#dns: <ip-of-dns>

# (Optional, default=10.254.0.0/16) IP range used by Kubernetes. Note that
# this IP range should NOT conflict with the current network.
#service-cluster-ip-range: <ip-range-for-k8s>

And these are the metrics from port 9100.

# HELP go_gc_duration_seconds A summary of the GC invocation durations.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 7.5201e-05
go_gc_duration_seconds{quantile="0.25"} 0.0001577
go_gc_duration_seconds{quantile="0.5"} 0.000197101
go_gc_duration_seconds{quantile="0.75"} 0.000376
go_gc_duration_seconds{quantile="1"} 0.00865964
go_gc_duration_seconds_sum 2.8585378710000002
go_gc_duration_seconds_count 4071
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 7
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.9.6"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 2.064096e+06
# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed.
# TYPE go_memstats_alloc_bytes_total counter
go_memstats_alloc_bytes_total 7.51268036e+09
# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table.
# TYPE go_memstats_buck_hash_sys_bytes gauge
go_memstats_buck_hash_sys_bytes 1.688016e+06
# HELP go_memstats_frees_total Total number of frees.
# TYPE go_memstats_frees_total counter
go_memstats_frees_total 4.3774816e+07
# HELP go_memstats_gc_cpu_fraction The fraction of this program's available CPU time used by the GC since the program started.
# TYPE go_memstats_gc_cpu_fraction gauge
go_memstats_gc_cpu_fraction 7.9334063037836e-05
# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata.
# TYPE go_memstats_gc_sys_bytes gauge
go_memstats_gc_sys_bytes 438272
# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use.
# TYPE go_memstats_heap_alloc_bytes gauge
go_memstats_heap_alloc_bytes 2.064096e+06
# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used.
# TYPE go_memstats_heap_idle_bytes gauge
go_memstats_heap_idle_bytes 3.375104e+06
# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use.
# TYPE go_memstats_heap_inuse_bytes gauge
go_memstats_heap_inuse_bytes 3.375104e+06
# HELP go_memstats_heap_objects Number of allocated objects.
# TYPE go_memstats_heap_objects gauge
go_memstats_heap_objects 11886
# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS.
# TYPE go_memstats_heap_released_bytes gauge
go_memstats_heap_released_bytes 0
# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system.
# TYPE go_memstats_heap_sys_bytes gauge
go_memstats_heap_sys_bytes 6.750208e+06
# HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection.
# TYPE go_memstats_last_gc_time_seconds gauge
go_memstats_last_gc_time_seconds 1.5320653196369507e+09
# HELP go_memstats_lookups_total Total number of pointer lookups.
# TYPE go_memstats_lookups_total counter
go_memstats_lookups_total 200107
# HELP go_memstats_mallocs_total Total number of mallocs.
# TYPE go_memstats_mallocs_total counter
go_memstats_mallocs_total 4.3786702e+07
# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures.
# TYPE go_memstats_mcache_inuse_bytes gauge
go_memstats_mcache_inuse_bytes 6944
# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system.
# TYPE go_memstats_mcache_sys_bytes gauge
go_memstats_mcache_sys_bytes 16384
# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures.
# TYPE go_memstats_mspan_inuse_bytes gauge
go_memstats_mspan_inuse_bytes 45296
# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system.
# TYPE go_memstats_mspan_sys_bytes gauge
go_memstats_mspan_sys_bytes 65536
# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place.
# TYPE go_memstats_next_gc_bytes gauge
go_memstats_next_gc_bytes 4.194304e+06
# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations.
# TYPE go_memstats_other_sys_bytes gauge
go_memstats_other_sys_bytes 1.042216e+06
# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator.
# TYPE go_memstats_stack_inuse_bytes gauge
go_memstats_stack_inuse_bytes 589824
# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator.
# TYPE go_memstats_stack_sys_bytes gauge
go_memstats_stack_sys_bytes 589824
# HELP go_memstats_sys_bytes Number of bytes obtained from system.
# TYPE go_memstats_sys_bytes gauge
go_memstats_sys_bytes 1.0590456e+07
# HELP go_threads Number of OS threads created.
# TYPE go_threads gauge
go_threads 11
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="0",mode="user"} 0
node_cpu_guest_seconds_total{cpu="1",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="1",mode="user"} 0
node_cpu_guest_seconds_total{cpu="2",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="2",mode="user"} 0
node_cpu_guest_seconds_total{cpu="3",mode="nice"} 0
node_cpu_guest_seconds_total{cpu="3",mode="user"} 0
# HELP node_cpu_seconds_total Seconds the cpus spent in each mode.
# TYPE node_cpu_seconds_total counter
node_cpu_seconds_total{cpu="0",mode="idle"} 4.92219748e+06
node_cpu_seconds_total{cpu="0",mode="iowait"} 517.22
node_cpu_seconds_total{cpu="0",mode="irq"} 0
node_cpu_seconds_total{cpu="0",mode="nice"} 495.76
node_cpu_seconds_total{cpu="0",mode="softirq"} 6757.58
node_cpu_seconds_total{cpu="0",mode="steal"} 0
node_cpu_seconds_total{cpu="0",mode="system"} 48754.81
node_cpu_seconds_total{cpu="0",mode="user"} 3.81528195e+06
node_cpu_seconds_total{cpu="1",mode="idle"} 4.91986948e+06
node_cpu_seconds_total{cpu="1",mode="iowait"} 429.08
node_cpu_seconds_total{cpu="1",mode="irq"} 0
node_cpu_seconds_total{cpu="1",mode="nice"} 503.6
node_cpu_seconds_total{cpu="1",mode="softirq"} 5218.27
node_cpu_seconds_total{cpu="1",mode="steal"} 0
node_cpu_seconds_total{cpu="1",mode="system"} 49014.24
node_cpu_seconds_total{cpu="1",mode="user"} 3.81716061e+06
node_cpu_seconds_total{cpu="2",mode="idle"} 4.91860991e+06
node_cpu_seconds_total{cpu="2",mode="iowait"} 503.96
node_cpu_seconds_total{cpu="2",mode="irq"} 0
node_cpu_seconds_total{cpu="2",mode="nice"} 507.89
node_cpu_seconds_total{cpu="2",mode="softirq"} 3571.47
node_cpu_seconds_total{cpu="2",mode="steal"} 0
node_cpu_seconds_total{cpu="2",mode="system"} 49144.03
node_cpu_seconds_total{cpu="2",mode="user"} 3.81910648e+06
node_cpu_seconds_total{cpu="3",mode="idle"} 4.92156675e+06
node_cpu_seconds_total{cpu="3",mode="iowait"} 439.57
node_cpu_seconds_total{cpu="3",mode="irq"} 0
node_cpu_seconds_total{cpu="3",mode="nice"} 538.19
node_cpu_seconds_total{cpu="3",mode="softirq"} 3328.01
node_cpu_seconds_total{cpu="3",mode="steal"} 0
node_cpu_seconds_total{cpu="3",mode="system"} 48970.28
node_cpu_seconds_total{cpu="3",mode="user"} 3.81701872e+06
# HELP node_disk_io_now The number of I/Os currently in progress.
# TYPE node_disk_io_now gauge
node_disk_io_now{device="sda"} 0
# HELP node_disk_io_time_seconds_total Total seconds spent doing I/Os.
# TYPE node_disk_io_time_seconds_total counter
node_disk_io_time_seconds_total{device="sda"} 5589.836
# HELP node_disk_io_time_weighted_seconds_total The weighted # of seconds spent doing I/Os. See https://www.kernel.org/doc/Documentation/iostats.txt.
# TYPE node_disk_io_time_weighted_seconds_total counter
node_disk_io_time_weighted_seconds_total{device="sda"} 27946.928
# HELP node_disk_read_bytes_total The total number of bytes read successfully.
# TYPE node_disk_read_bytes_total counter
node_disk_read_bytes_total{device="sda"} 1.23697901568e+11
# HELP node_disk_read_time_seconds_total The total number of milliseconds spent by all reads.
# TYPE node_disk_read_time_seconds_total counter
node_disk_read_time_seconds_total{device="sda"} 12804.596
# HELP node_disk_reads_completed_total The total number of reads completed successfully.
# TYPE node_disk_reads_completed_total counter
node_disk_reads_completed_total{device="sda"} 2.893499e+06
# HELP node_disk_reads_merged_total The total number of reads merged. See https://www.kernel.org/doc/Documentation/iostats.txt.
# TYPE node_disk_reads_merged_total counter
node_disk_reads_merged_total{device="sda"} 27303
# HELP node_disk_write_time_seconds_total This is the total number of seconds spent by all writes.
# TYPE node_disk_write_time_seconds_total counter
node_disk_write_time_seconds_total{device="sda"} 15112.972
# HELP node_disk_writes_completed_total The total number of writes completed successfully.
# TYPE node_disk_writes_completed_total counter
node_disk_writes_completed_total{device="sda"} 8.994577e+06
# HELP node_disk_writes_merged_total The number of writes merged. See https://www.kernel.org/doc/Documentation/iostats.txt.
# TYPE node_disk_writes_merged_total counter
node_disk_writes_merged_total{device="sda"} 1.1110421e+07
# HELP node_disk_written_bytes_total The total number of bytes written successfully.
# TYPE node_disk_written_bytes_total counter
node_disk_written_bytes_total{device="sda"} 1.475846590464e+12
# HELP node_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, and goversion from which node_exporter was built.
# TYPE node_exporter_build_info gauge
node_exporter_build_info{branch="HEAD",goversion="go1.9.6",revision="d42bd70f4363dced6b77d8fc311ea57b63387e4f",version="0.16.0"} 1
# HELP node_filefd_allocated File descriptor statistics: allocated.
# TYPE node_filefd_allocated gauge
node_filefd_allocated 7328
# HELP node_filefd_maximum File descriptor statistics: maximum.
# TYPE node_filefd_maximum gauge
node_filefd_maximum 1.634235e+06
# HELP node_filesystem_avail_bytes Filesystem space available to non-root users in bytes.
# TYPE node_filesystem_avail_bytes gauge
node_filesystem_avail_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/datastorage/prometheus"} 8.62972973056e+11
node_filesystem_avail_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hostname"} 8.62972973056e+11
node_filesystem_avail_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hosts"} 8.62972973056e+11
node_filesystem_avail_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/etc/resolv.conf"} 8.62972973056e+11
node_filesystem_avail_bytes{device="none",fstype="aufs",mountpoint="/"} 8.62972973056e+11
# HELP node_filesystem_device_error Whether an error occurred while getting statistics for the given device.
# TYPE node_filesystem_device_error gauge
node_filesystem_device_error{device="/dev/sda1",fstype="ext4",mountpoint="/datastorage/prometheus"} 0
node_filesystem_device_error{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hostname"} 0
node_filesystem_device_error{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hosts"} 0
node_filesystem_device_error{device="/dev/sda1",fstype="ext4",mountpoint="/etc/resolv.conf"} 0
node_filesystem_device_error{device="none",fstype="aufs",mountpoint="/"} 0
# HELP node_filesystem_files Filesystem total file nodes.
# TYPE node_filesystem_files gauge
node_filesystem_files{device="/dev/sda1",fstype="ext4",mountpoint="/datastorage/prometheus"} 6.5011712e+07
node_filesystem_files{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hostname"} 6.5011712e+07
node_filesystem_files{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hosts"} 6.5011712e+07
node_filesystem_files{device="/dev/sda1",fstype="ext4",mountpoint="/etc/resolv.conf"} 6.5011712e+07
node_filesystem_files{device="none",fstype="aufs",mountpoint="/"} 6.5011712e+07
# HELP node_filesystem_files_free Filesystem total free file nodes.
# TYPE node_filesystem_files_free gauge
node_filesystem_files_free{device="/dev/sda1",fstype="ext4",mountpoint="/datastorage/prometheus"} 6.4099836e+07
node_filesystem_files_free{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hostname"} 6.4099836e+07
node_filesystem_files_free{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hosts"} 6.4099836e+07
node_filesystem_files_free{device="/dev/sda1",fstype="ext4",mountpoint="/etc/resolv.conf"} 6.4099836e+07
node_filesystem_files_free{device="none",fstype="aufs",mountpoint="/"} 6.4099836e+07
# HELP node_filesystem_free_bytes Filesystem free space in bytes.
# TYPE node_filesystem_free_bytes gauge
node_filesystem_free_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/datastorage/prometheus"} 9.16247289856e+11
node_filesystem_free_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hostname"} 9.16247289856e+11
node_filesystem_free_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hosts"} 9.16247289856e+11
node_filesystem_free_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/etc/resolv.conf"} 9.16247289856e+11
node_filesystem_free_bytes{device="none",fstype="aufs",mountpoint="/"} 9.16247289856e+11
# HELP node_filesystem_readonly Filesystem read-only status.
# TYPE node_filesystem_readonly gauge
node_filesystem_readonly{device="/dev/sda1",fstype="ext4",mountpoint="/datastorage/prometheus"} 0
node_filesystem_readonly{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hostname"} 0
node_filesystem_readonly{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hosts"} 0
node_filesystem_readonly{device="/dev/sda1",fstype="ext4",mountpoint="/etc/resolv.conf"} 0
node_filesystem_readonly{device="none",fstype="aufs",mountpoint="/"} 0
# HELP node_filesystem_size_bytes Filesystem size in bytes.
# TYPE node_filesystem_size_bytes gauge
node_filesystem_size_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/datastorage/prometheus"} 1.048303710208e+12
node_filesystem_size_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hostname"} 1.048303710208e+12
node_filesystem_size_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/etc/hosts"} 1.048303710208e+12
node_filesystem_size_bytes{device="/dev/sda1",fstype="ext4",mountpoint="/etc/resolv.conf"} 1.048303710208e+12
node_filesystem_size_bytes{device="none",fstype="aufs",mountpoint="/"} 1.048303710208e+12
# HELP node_load1 1m load average.
# TYPE node_load1 gauge
node_load1 0.75
# HELP node_load15 15m load average.
# TYPE node_load15 gauge
node_load15 0.86
# HELP node_load5 5m load average.
# TYPE node_load5 gauge
node_load5 0.85
# HELP node_memory_Active_anon_bytes Memory information field Active_anon_bytes.
# TYPE node_memory_Active_anon_bytes gauge
node_memory_Active_anon_bytes 5.861322752e+09
# HELP node_memory_Active_bytes Memory information field Active_bytes.
# TYPE node_memory_Active_bytes gauge
node_memory_Active_bytes 8.517496832e+09
# HELP node_memory_Active_file_bytes Memory information field Active_file_bytes.
# TYPE node_memory_Active_file_bytes gauge
node_memory_Active_file_bytes 2.65617408e+09
# HELP node_memory_AnonHugePages_bytes Memory information field AnonHugePages_bytes.
# TYPE node_memory_AnonHugePages_bytes gauge
node_memory_AnonHugePages_bytes 4.64519168e+09
# HELP node_memory_AnonPages_bytes Memory information field AnonPages_bytes.
# TYPE node_memory_AnonPages_bytes gauge
node_memory_AnonPages_bytes 5.788647424e+09
# HELP node_memory_Bounce_bytes Memory information field Bounce_bytes.
# TYPE node_memory_Bounce_bytes gauge
node_memory_Bounce_bytes 0
# HELP node_memory_Buffers_bytes Memory information field Buffers_bytes.
# TYPE node_memory_Buffers_bytes gauge
node_memory_Buffers_bytes 6.38459904e+08
# HELP node_memory_Cached_bytes Memory information field Cached_bytes.
# TYPE node_memory_Cached_bytes gauge
node_memory_Cached_bytes 7.627014144e+09
# HELP node_memory_CmaFree_bytes Memory information field CmaFree_bytes.
# TYPE node_memory_CmaFree_bytes gauge
node_memory_CmaFree_bytes 0
# HELP node_memory_CmaTotal_bytes Memory information field CmaTotal_bytes.
# TYPE node_memory_CmaTotal_bytes gauge
node_memory_CmaTotal_bytes 0
# HELP node_memory_CommitLimit_bytes Memory information field CommitLimit_bytes.
# TYPE node_memory_CommitLimit_bytes gauge
node_memory_CommitLimit_bytes 8.408190976e+09
# HELP node_memory_Committed_AS_bytes Memory information field Committed_AS_bytes.
# TYPE node_memory_Committed_AS_bytes gauge
node_memory_Committed_AS_bytes 1.8053476352e+10
# HELP node_memory_DirectMap2M_bytes Memory information field DirectMap2M_bytes.
# TYPE node_memory_DirectMap2M_bytes gauge
node_memory_DirectMap2M_bytes 1.6940793856e+10
# HELP node_memory_DirectMap4k_bytes Memory information field DirectMap4k_bytes.
# TYPE node_memory_DirectMap4k_bytes gauge
node_memory_DirectMap4k_bytes 2.3691264e+08
# HELP node_memory_Dirty_bytes Memory information field Dirty_bytes.
# TYPE node_memory_Dirty_bytes gauge
node_memory_Dirty_bytes 204800
# HELP node_memory_HardwareCorrupted_bytes Memory information field HardwareCorrupted_bytes.
# TYPE node_memory_HardwareCorrupted_bytes gauge
node_memory_HardwareCorrupted_bytes 0
# HELP node_memory_HugePages_Free Memory information field HugePages_Free.
# TYPE node_memory_HugePages_Free gauge
node_memory_HugePages_Free 0
# HELP node_memory_HugePages_Rsvd Memory information field HugePages_Rsvd.
# TYPE node_memory_HugePages_Rsvd gauge
node_memory_HugePages_Rsvd 0
# HELP node_memory_HugePages_Surp Memory information field HugePages_Surp.
# TYPE node_memory_HugePages_Surp gauge
node_memory_HugePages_Surp 0
# HELP node_memory_HugePages_Total Memory information field HugePages_Total.
# TYPE node_memory_HugePages_Total gauge
node_memory_HugePages_Total 0
# HELP node_memory_Hugepagesize_bytes Memory information field Hugepagesize_bytes.
# TYPE node_memory_Hugepagesize_bytes gauge
node_memory_Hugepagesize_bytes 2.097152e+06
# HELP node_memory_Inactive_anon_bytes Memory information field Inactive_anon_bytes.
# TYPE node_memory_Inactive_anon_bytes gauge
node_memory_Inactive_anon_bytes 1.07945984e+08
# HELP node_memory_Inactive_bytes Memory information field Inactive_bytes.
# TYPE node_memory_Inactive_bytes gauge
node_memory_Inactive_bytes 5.53265152e+09
# HELP node_memory_Inactive_file_bytes Memory information field Inactive_file_bytes.
# TYPE node_memory_Inactive_file_bytes gauge
node_memory_Inactive_file_bytes 5.424705536e+09
# HELP node_memory_KernelStack_bytes Memory information field KernelStack_bytes.
# TYPE node_memory_KernelStack_bytes gauge
node_memory_KernelStack_bytes 3.1604736e+07
# HELP node_memory_Mapped_bytes Memory information field Mapped_bytes.
# TYPE node_memory_Mapped_bytes gauge
node_memory_Mapped_bytes 4.65915904e+08
# HELP node_memory_MemAvailable_bytes Memory information field MemAvailable_bytes.
# TYPE node_memory_MemAvailable_bytes gauge
node_memory_MemAvailable_bytes 1.0406473728e+10
# HELP node_memory_MemFree_bytes Memory information field MemFree_bytes.
# TYPE node_memory_MemFree_bytes gauge
node_memory_MemFree_bytes 1.737654272e+09
# HELP node_memory_MemTotal_bytes Memory information field MemTotal_bytes.
# TYPE node_memory_MemTotal_bytes gauge
node_memory_MemTotal_bytes 1.6816386048e+10
# HELP node_memory_Mlocked_bytes Memory information field Mlocked_bytes.
# TYPE node_memory_Mlocked_bytes gauge
node_memory_Mlocked_bytes 3.739648e+06
# HELP node_memory_NFS_Unstable_bytes Memory information field NFS_Unstable_bytes.
# TYPE node_memory_NFS_Unstable_bytes gauge
node_memory_NFS_Unstable_bytes 0
# HELP node_memory_PageTables_bytes Memory information field PageTables_bytes.
# TYPE node_memory_PageTables_bytes gauge
node_memory_PageTables_bytes 3.1895552e+07
# HELP node_memory_SReclaimable_bytes Memory information field SReclaimable_bytes.
# TYPE node_memory_SReclaimable_bytes gauge
node_memory_SReclaimable_bytes 6.499328e+08
# HELP node_memory_SUnreclaim_bytes Memory information field SUnreclaim_bytes.
# TYPE node_memory_SUnreclaim_bytes gauge
node_memory_SUnreclaim_bytes 1.77627136e+08
# HELP node_memory_Shmem_bytes Memory information field Shmem_bytes.
# TYPE node_memory_Shmem_bytes gauge
node_memory_Shmem_bytes 1.82112256e+08
# HELP node_memory_Slab_bytes Memory information field Slab_bytes.
# TYPE node_memory_Slab_bytes gauge
node_memory_Slab_bytes 8.27559936e+08
# HELP node_memory_SwapCached_bytes Memory information field SwapCached_bytes.
# TYPE node_memory_SwapCached_bytes gauge
node_memory_SwapCached_bytes 0
# HELP node_memory_SwapFree_bytes Memory information field SwapFree_bytes.
# TYPE node_memory_SwapFree_bytes gauge
node_memory_SwapFree_bytes 0
# HELP node_memory_SwapTotal_bytes Memory information field SwapTotal_bytes.
# TYPE node_memory_SwapTotal_bytes gauge
node_memory_SwapTotal_bytes 0
# HELP node_memory_Unevictable_bytes Memory information field Unevictable_bytes.
# TYPE node_memory_Unevictable_bytes gauge
node_memory_Unevictable_bytes 3.739648e+06
# HELP node_memory_VmallocChunk_bytes Memory information field VmallocChunk_bytes.
# TYPE node_memory_VmallocChunk_bytes gauge
node_memory_VmallocChunk_bytes 0
# HELP node_memory_VmallocTotal_bytes Memory information field VmallocTotal_bytes.
# TYPE node_memory_VmallocTotal_bytes gauge
node_memory_VmallocTotal_bytes 3.5184372087808e+13
# HELP node_memory_VmallocUsed_bytes Memory information field VmallocUsed_bytes.
# TYPE node_memory_VmallocUsed_bytes gauge
node_memory_VmallocUsed_bytes 0
# HELP node_memory_WritebackTmp_bytes Memory information field WritebackTmp_bytes.
# TYPE node_memory_WritebackTmp_bytes gauge
node_memory_WritebackTmp_bytes 0
# HELP node_memory_Writeback_bytes Memory information field Writeback_bytes.
# TYPE node_memory_Writeback_bytes gauge
node_memory_Writeback_bytes 0
# HELP node_netstat_Icmp6_InErrors Statistic Icmp6InErrors.
# TYPE node_netstat_Icmp6_InErrors untyped
node_netstat_Icmp6_InErrors 0
# HELP node_netstat_Icmp6_InMsgs Statistic Icmp6InMsgs.
# TYPE node_netstat_Icmp6_InMsgs untyped
node_netstat_Icmp6_InMsgs 4109
# HELP node_netstat_Icmp6_OutMsgs Statistic Icmp6OutMsgs.
# TYPE node_netstat_Icmp6_OutMsgs untyped
node_netstat_Icmp6_OutMsgs 363
# HELP node_netstat_Icmp_InErrors Statistic IcmpInErrors.
# TYPE node_netstat_Icmp_InErrors untyped
node_netstat_Icmp_InErrors 2.496608e+06
# HELP node_netstat_Icmp_InMsgs Statistic IcmpInMsgs.
# TYPE node_netstat_Icmp_InMsgs untyped
node_netstat_Icmp_InMsgs 5.908502e+06
# HELP node_netstat_Icmp_OutMsgs Statistic IcmpOutMsgs.
# TYPE node_netstat_Icmp_OutMsgs untyped
node_netstat_Icmp_OutMsgs 6.190969e+06
# HELP node_netstat_Ip6_InOctets Statistic Ip6InOctets.
# TYPE node_netstat_Ip6_InOctets untyped
node_netstat_Ip6_InOctets 3.226324775e+09
# HELP node_netstat_Ip6_OutOctets Statistic Ip6OutOctets.
# TYPE node_netstat_Ip6_OutOctets untyped
node_netstat_Ip6_OutOctets 26240
# HELP node_netstat_IpExt_InOctets Statistic IpExtInOctets.
# TYPE node_netstat_IpExt_InOctets untyped
node_netstat_IpExt_InOctets 4.42394781902e+11
# HELP node_netstat_IpExt_OutOctets Statistic IpExtOutOctets.
# TYPE node_netstat_IpExt_OutOctets untyped
node_netstat_IpExt_OutOctets 4.67805939982e+11
# HELP node_netstat_Ip_Forwarding Statistic IpForwarding.
# TYPE node_netstat_Ip_Forwarding untyped
node_netstat_Ip_Forwarding 1
# HELP node_netstat_TcpExt_ListenDrops Statistic TcpExtListenDrops.
# TYPE node_netstat_TcpExt_ListenDrops untyped
node_netstat_TcpExt_ListenDrops 0
# HELP node_netstat_TcpExt_ListenOverflows Statistic TcpExtListenOverflows.
# TYPE node_netstat_TcpExt_ListenOverflows untyped
node_netstat_TcpExt_ListenOverflows 0
# HELP node_netstat_TcpExt_SyncookiesFailed Statistic TcpExtSyncookiesFailed.
# TYPE node_netstat_TcpExt_SyncookiesFailed untyped
node_netstat_TcpExt_SyncookiesFailed 0
# HELP node_netstat_TcpExt_SyncookiesRecv Statistic TcpExtSyncookiesRecv.
# TYPE node_netstat_TcpExt_SyncookiesRecv untyped
node_netstat_TcpExt_SyncookiesRecv 0
# HELP node_netstat_TcpExt_SyncookiesSent Statistic TcpExtSyncookiesSent.
# TYPE node_netstat_TcpExt_SyncookiesSent untyped
node_netstat_TcpExt_SyncookiesSent 0
# HELP node_netstat_Tcp_ActiveOpens Statistic TcpActiveOpens.
# TYPE node_netstat_Tcp_ActiveOpens untyped
node_netstat_Tcp_ActiveOpens 3.1971469e+07
# HELP node_netstat_Tcp_CurrEstab Statistic TcpCurrEstab.
# TYPE node_netstat_Tcp_CurrEstab untyped
node_netstat_Tcp_CurrEstab 269
# HELP node_netstat_Tcp_InErrs Statistic TcpInErrs.
# TYPE node_netstat_Tcp_InErrs untyped
node_netstat_Tcp_InErrs 0
# HELP node_netstat_Tcp_PassiveOpens Statistic TcpPassiveOpens.
# TYPE node_netstat_Tcp_PassiveOpens untyped
node_netstat_Tcp_PassiveOpens 2.378727e+06
# HELP node_netstat_Tcp_RetransSegs Statistic TcpRetransSegs.
# TYPE node_netstat_Tcp_RetransSegs untyped
node_netstat_Tcp_RetransSegs 7.506293e+06
# HELP node_netstat_Udp6_InDatagrams Statistic Udp6InDatagrams.
# TYPE node_netstat_Udp6_InDatagrams untyped
node_netstat_Udp6_InDatagrams 0
# HELP node_netstat_Udp6_InErrors Statistic Udp6InErrors.
# TYPE node_netstat_Udp6_InErrors untyped
node_netstat_Udp6_InErrors 4
# HELP node_netstat_Udp6_NoPorts Statistic Udp6NoPorts.
# TYPE node_netstat_Udp6_NoPorts untyped
node_netstat_Udp6_NoPorts 4
# HELP node_netstat_Udp6_OutDatagrams Statistic Udp6OutDatagrams.
# TYPE node_netstat_Udp6_OutDatagrams untyped
node_netstat_Udp6_OutDatagrams 4
# HELP node_netstat_UdpLite6_InErrors Statistic UdpLite6InErrors.
# TYPE node_netstat_UdpLite6_InErrors untyped
node_netstat_UdpLite6_InErrors 0
# HELP node_netstat_UdpLite_InErrors Statistic UdpLiteInErrors.
# TYPE node_netstat_UdpLite_InErrors untyped
node_netstat_UdpLite_InErrors 0
# HELP node_netstat_Udp_InDatagrams Statistic UdpInDatagrams.
# TYPE node_netstat_Udp_InDatagrams untyped
node_netstat_Udp_InDatagrams 3.365287e+06
# HELP node_netstat_Udp_InErrors Statistic UdpInErrors.
# TYPE node_netstat_Udp_InErrors untyped
node_netstat_Udp_InErrors 0
# HELP node_netstat_Udp_NoPorts Statistic UdpNoPorts.
# TYPE node_netstat_Udp_NoPorts untyped
node_netstat_Udp_NoPorts 3786
# HELP node_netstat_Udp_OutDatagrams Statistic UdpOutDatagrams.
# TYPE node_netstat_Udp_OutDatagrams untyped
node_netstat_Udp_OutDatagrams 896283
# HELP node_scrape_collector_duration_seconds node_exporter: Duration of a collector scrape.
# TYPE node_scrape_collector_duration_seconds gauge
node_scrape_collector_duration_seconds{collector="cpu"} 0.001076214
node_scrape_collector_duration_seconds{collector="diskstats"} 0.003316543
node_scrape_collector_duration_seconds{collector="filefd"} 9.7401e-05
node_scrape_collector_duration_seconds{collector="filesystem"} 0.001775923
node_scrape_collector_duration_seconds{collector="loadavg"} 0.000258103
node_scrape_collector_duration_seconds{collector="meminfo"} 0.000404105
node_scrape_collector_duration_seconds{collector="netstat"} 0.002575634
node_scrape_collector_duration_seconds{collector="textfile"} 0.000178502
# HELP node_scrape_collector_success node_exporter: Whether a collector succeeded.
# TYPE node_scrape_collector_success gauge
node_scrape_collector_success{collector="cpu"} 1
node_scrape_collector_success{collector="diskstats"} 1
node_scrape_collector_success{collector="filefd"} 1
node_scrape_collector_success{collector="filesystem"} 1
node_scrape_collector_success{collector="loadavg"} 1
node_scrape_collector_success{collector="meminfo"} 1
node_scrape_collector_success{collector="netstat"} 1
node_scrape_collector_success{collector="textfile"} 1
# HELP node_textfile_mtime_seconds Unixtime mtime of textfiles successfully read.
# TYPE node_textfile_mtime_seconds gauge
node_textfile_mtime_seconds{file="job_exporter.prom"} 1.532065302e+09
# HELP node_textfile_scrape_error 1 if there was an error opening or reading a file, 0 otherwise
# TYPE node_textfile_scrape_error gauge
node_textfile_scrape_error 0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 82.21
# HELP process_max_fds Maximum number of open file descriptors.
# TYPE process_max_fds gauge
process_max_fds 1.048576e+06
# HELP process_open_fds Number of open file descriptors.
# TYPE process_open_fds gauge
process_open_fds 8
# HELP process_resident_memory_bytes Resident memory size in bytes.
# TYPE process_resident_memory_bytes gauge
process_resident_memory_bytes 2.3105536e+07
# HELP process_start_time_seconds Start time of the process since unix epoch in seconds.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1.53200400052e+09
# HELP process_virtual_memory_bytes Virtual memory size in bytes.
# TYPE process_virtual_memory_bytes gauge
process_virtual_memory_bytes 2.98745856e+08
# HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served.
# TYPE promhttp_metric_handler_requests_in_flight gauge
promhttp_metric_handler_requests_in_flight 1
# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code.
# TYPE promhttp_metric_handler_requests_total counter
promhttp_metric_handler_requests_total{code="200"} 0
promhttp_metric_handler_requests_total{code="500"} 0
promhttp_metric_handler_requests_total{code="503"} 0

And this is the pod definition of grafana (in JSON).

{
  "kind": "Pod",
  "apiVersion": "v1",
  "metadata": {
    "name": "grafana-86974c6b5d-mxqzp",
    "generateName": "grafana-86974c6b5d-",
    "namespace": "default",
    "selfLink": "/api/v1/namespaces/default/pods/grafana-86974c6b5d-mxqzp",
    "uid": "7bafa31d-8b50-11e8-af86-00155d64411c",
    "resourceVersion": "1743",
    "creationTimestamp": "2018-07-19T12:37:22Z",
    "labels": {
      "k8s-app": "grafana",
      "pod-template-hash": "4253072618",
      "task": "monitor"
    },
    "ownerReferences": [
      {
        "apiVersion": "extensions/v1beta1",
        "kind": "ReplicaSet",
        "name": "grafana-86974c6b5d",
        "uid": "7bab4cc8-8b50-11e8-af86-00155d64411c",
        "controller": true,
        "blockOwnerDeletion": true
      }
    ]
  },
  "spec": {
    "volumes": [
      {
        "name": "grafana-confg-volume",
        "configMap": {
          "name": "grafana-configuration",
          "defaultMode": 420
        }
      }
    ],
    "containers": [
      {
        "name": "grafana",
        "image": "docker.io/openpai/grafana:latest",
        "ports": [
          {
            "hostPort": 3000,
            "containerPort": 3000,
            "protocol": "TCP"
          }
        ],
        "env": [
          {
            "name": "GRAFANA_URL",
            "value": "http://10.190.177.242:3000"
          },
          {
            "name": "GF_AUTH_ANONYMOUS_ENABLED",
            "value": "true"
          }
        ],
        "resources": {},
        "volumeMounts": [
          {
            "name": "grafana-confg-volume",
            "mountPath": "/grafana-configuration"
          }
        ],
        "terminationMessagePath": "/dev/termination-log",
        "terminationMessagePolicy": "File",
        "imagePullPolicy": "Always"
      }
    ],
    "restartPolicy": "Always",
    "terminationGracePeriodSeconds": 30,
    "dnsPolicy": "ClusterFirst",
    "nodeSelector": {
      "grafana": "true"
    },
    "nodeName": "10.190.177.242",
    "hostNetwork": true,
    "hostPID": true,
    "securityContext": {},
    "imagePullSecrets": [
      {
        "name": "pai-secret"
      }
    ],
    "schedulerName": "default-scheduler"
  },
  "status": {
    "phase": "Running",
    "conditions": [
      {
        "type": "Initialized",
        "status": "True",
        "lastProbeTime": null,
        "lastTransitionTime": "2018-07-19T12:37:22Z"
      },
      {
        "type": "Ready",
        "status": "True",
        "lastProbeTime": null,
        "lastTransitionTime": "2018-07-19T12:41:40Z"
      },
      {
        "type": "PodScheduled",
        "status": "True",
        "lastProbeTime": null,
        "lastTransitionTime": "2018-07-19T12:37:22Z"
      }
    ],
    "hostIP": "10.190.177.242",
    "podIP": "10.190.177.242",
    "startTime": "2018-07-19T12:37:22Z",
    "containerStatuses": [
      {
        "name": "grafana",
        "state": {
          "running": {
            "startedAt": "2018-07-19T12:41:40Z"
          }
        },
        "lastState": {},
        "ready": true,
        "restartCount": 0,
        "image": "openpai/grafana:latest",
        "imageID": "docker-pullable://openpai/grafana@sha256:c6008793ee0bb0b0845fdef03d96a9e8f089885b152a070bff6aa48228aa2eff",
        "containerID": "docker://04476e128a957421b04f7870923dae39e2dd799291e6e71c77e659b9b202d95d"
      }
    ],
    "qosClass": "BestEffort"
  }
}

And this is the pod definition (JSON) of prometheus-deployment.

{
  "kind": "Pod",
  "apiVersion": "v1",
  "metadata": {
    "name": "prometheus-deployment-565c654c7b-9b9cl",
    "generateName": "prometheus-deployment-565c654c7b-",
    "namespace": "default",
    "selfLink": "/api/v1/namespaces/default/pods/prometheus-deployment-565c654c7b-9b9cl",
    "uid": "7a5eda10-8b50-11e8-af86-00155d64411c",
    "resourceVersion": "1642",
    "creationTimestamp": "2018-07-19T12:37:20Z",
    "labels": {
      "app": "prometheus",
      "pod-template-hash": "1217210736"
    },
    "ownerReferences": [
      {
        "apiVersion": "extensions/v1beta1",
        "kind": "ReplicaSet",
        "name": "prometheus-deployment-565c654c7b",
        "uid": "7a5e56b9-8b50-11e8-af86-00155d64411c",
        "controller": true,
        "blockOwnerDeletion": true
      }
    ]
  },
  "spec": {
    "volumes": [
      {
        "name": "config-volume",
        "configMap": {
          "name": "prometheus-configmap",
          "defaultMode": 420
        }
      }
    ],
    "containers": [
      {
        "name": "prometheus",
        "image": "prom/prometheus:v2.1.0",
        "args": [
          "--config.file=/etc/prometheus/prometheus.yml",
          "--web.listen-address=0.0.0.0:9091"
        ],
        "ports": [
          {
            "name": "web",
            "hostPort": 9091,
            "containerPort": 9091,
            "protocol": "TCP"
          }
        ],
        "resources": {
          "limits": {
            "memory": "10Gi"
          },
          "requests": {
            "memory": "10Gi"
          }
        },
        "volumeMounts": [
          {
            "name": "config-volume",
            "mountPath": "/etc/prometheus"
          }
        ],
        "terminationMessagePath": "/dev/termination-log",
        "terminationMessagePolicy": "File",
        "imagePullPolicy": "IfNotPresent"
      }
    ],
    "restartPolicy": "Always",
    "terminationGracePeriodSeconds": 30,
    "dnsPolicy": "ClusterFirst",
    "nodeSelector": {
      "prometheus": "true"
    },
    "nodeName": "10.190.177.242",
    "hostNetwork": true,
    "hostPID": true,
    "securityContext": {},
    "schedulerName": "default-scheduler"
  },
  "status": {
    "phase": "Running",
    "conditions": [
      {
        "type": "Initialized",
        "status": "True",
        "lastProbeTime": null,
        "lastTransitionTime": "2018-07-19T12:37:20Z"
      },
      {
        "type": "Ready",
        "status": "True",
        "lastProbeTime": null,
        "lastTransitionTime": "2018-07-19T12:40:14Z"
      },
      {
        "type": "PodScheduled",
        "status": "True",
        "lastProbeTime": null,
        "lastTransitionTime": "2018-07-19T12:37:20Z"
      }
    ],
    "hostIP": "10.190.177.242",
    "podIP": "10.190.177.242",
    "startTime": "2018-07-19T12:37:20Z",
    "containerStatuses": [
      {
        "name": "prometheus",
        "state": {
          "running": {
            "startedAt": "2018-07-19T12:40:13Z"
          }
        },
        "lastState": {},
        "ready": true,
        "restartCount": 0,
        "image": "prom/prometheus:v2.1.0",
        "imageID": "docker-pullable://prom/prometheus@sha256:7b987901dbc44d17a88e7bda42dbbbb743c161e3152662959acd9f35aeefb9a3",
        "containerID": "docker://37e72fa116d1181597aa70661ae6596ecf680eac8c4df805c63ba6d75d784e02"
      }
    ],
    "qosClass": "Burstable"
  }
}

And this is the pod definition (JSON) of node-exporter.

{
  "kind": "Pod",
  "apiVersion": "v1",
  "metadata": {
    "name": "node-exporter-z8h64",
    "generateName": "node-exporter-",
    "namespace": "default",
    "selfLink": "/api/v1/namespaces/default/pods/node-exporter-z8h64",
    "uid": "7a259793-8b50-11e8-af86-00155d64411c",
    "resourceVersion": "1904",
    "creationTimestamp": "2018-07-19T12:37:19Z",
    "labels": {
      "app": "node-exporter",
      "controller-revision-hash": "273467173",
      "pod-template-generation": "1"
    },
    "ownerReferences": [
      {
        "apiVersion": "extensions/v1beta1",
        "kind": "DaemonSet",
        "name": "node-exporter",
        "uid": "7a23b99f-8b50-11e8-af86-00155d64411c",
        "controller": true,
        "blockOwnerDeletion": true
      }
    ]
  },
  "spec": {
    "volumes": [
      {
        "name": "docker-bin",
        "hostPath": {
          "path": "/bin/docker",
          "type": ""
        }
      },
      {
        "name": "docker-socket",
        "hostPath": {
          "path": "/var/run/docker.sock",
          "type": ""
        }
      },
      {
        "name": "docker-cred-volume",
        "configMap": {
          "name": "docker-credentials",
          "defaultMode": 420
        }
      },
      {
        "name": "device-mount",
        "hostPath": {
          "path": "/dev",
          "type": ""
        }
      },
      {
        "name": "driver-path",
        "hostPath": {
          "path": "/var/drivers",
          "type": ""
        }
      },
      {
        "name": "collector-mount",
        "hostPath": {
          "path": "/datastorage/prometheus",
          "type": ""
        }
      },
      {
        "name": "rootfs",
        "hostPath": {
          "path": "/",
          "type": ""
        }
      },
      {
        "name": "var-run",
        "hostPath": {
          "path": "/var/run",
          "type": ""
        }
      },
      {
        "name": "sys",
        "hostPath": {
          "path": "/sys",
          "type": ""
        }
      },
      {
        "name": "docker",
        "hostPath": {
          "path": "/var/lib/docker",
          "type": ""
        }
      }
    ],
    "containers": [
      {
        "name": "node-exporter",
        "image": "prom/node-exporter:v0.16.0",
        "args": [
          "--collector.textfile.directory=/datastorage/prometheus",
          "--no-collector.arp",
          "--no-collector.bcache",
          "--no-collector.bonding",
          "--no-collector.conntrack",
          "--no-collector.edac",
          "--no-collector.entropy",
          "--no-collector.hwmon",
          "--no-collector.infiniband",
          "--no-collector.ipvs",
          "--no-collector.mdadm",
          "--no-collector.netdev",
          "--no-collector.nfs",
          "--no-collector.nfsd",
          "--no-collector.sockstat",
          "--no-collector.stat",
          "--no-collector.time",
          "--no-collector.timex",
          "--no-collector.uname",
          "--no-collector.vmstat",
          "--no-collector.wifi",
          "--no-collector.xfs",
          "--no-collector.zfs"
        ],
        "ports": [
          {
            "name": "scrape",
            "hostPort": 9100,
            "containerPort": 9100,
            "protocol": "TCP"
          }
        ],
        "resources": {
          "limits": {
            "memory": "1Gi"
          },
          "requests": {
            "memory": "1Gi"
          }
        },
        "volumeMounts": [
          {
            "name": "collector-mount",
            "mountPath": "/datastorage/prometheus"
          }
        ],
        "readinessProbe": {
          "httpGet": {
            "path": "/metrics",
            "port": 9100,
            "scheme": "HTTP"
          },
          "initialDelaySeconds": 30,
          "timeoutSeconds": 1,
          "periodSeconds": 30,
          "successThreshold": 1,
          "failureThreshold": 3
        },
        "terminationMessagePath": "/dev/termination-log",
        "terminationMessagePolicy": "File",
        "imagePullPolicy": "Always"
      },
      {
        "name": "gpu-exporter",
        "image": "docker.io/openpai/gpu-exporter:latest",
        "resources": {
          "limits": {
            "memory": "1Gi"
          },
          "requests": {
            "memory": "1Gi"
          }
        },
        "volumeMounts": [
          {
            "name": "docker-cred-volume",
            "mountPath": "/root/.docker"
          },
          {
            "name": "docker-bin",
            "mountPath": "/bin/docker"
          },
          {
            "name": "docker-socket",
            "mountPath": "/var/run/docker.sock"
          },
          {
            "name": "docker",
            "mountPath": "/var/lib/docker"
          },
          {
            "name": "device-mount",
            "mountPath": "/dev"
          },
          {
            "name": "driver-path",
            "mountPath": "/var/drivers"
          },
          {
            "name": "collector-mount",
            "mountPath": "/datastorage/prometheus"
          }
        ],
        "readinessProbe": {
          "exec": {
            "command": [
              "python",
              "/usr/local/healthy_check.py"
            ]
          },
          "initialDelaySeconds": 30,
          "timeoutSeconds": 1,
          "periodSeconds": 30,
          "successThreshold": 1,
          "failureThreshold": 3
        },
        "terminationMessagePath": "/dev/termination-log",
        "terminationMessagePolicy": "File",
        "imagePullPolicy": "Always",
        "securityContext": {
          "privileged": true
        }
      }
    ],
    "restartPolicy": "Always",
    "terminationGracePeriodSeconds": 30,
    "dnsPolicy": "ClusterFirst",
    "nodeSelector": {
      "node-exporter": "true"
    },
    "nodeName": "10.190.177.242",
    "hostNetwork": true,
    "hostPID": true,
    "securityContext": {},
    "imagePullSecrets": [
      {
        "name": "pai-secret"
      }
    ],
    "schedulerName": "default-scheduler",
    "tolerations": [
      {
        "key": "node.kubernetes.io/not-ready",
        "operator": "Exists",
        "effect": "NoExecute"
      },
      {
        "key": "node.kubernetes.io/unreachable",
        "operator": "Exists",
        "effect": "NoExecute"
      },
      {
        "key": "node.kubernetes.io/disk-pressure",
        "operator": "Exists",
        "effect": "NoSchedule"
      },
      {
        "key": "node.kubernetes.io/memory-pressure",
        "operator": "Exists",
        "effect": "NoSchedule"
      }
    ]
  },
  "status": {
    "phase": "Running",
    "conditions": [
      {
        "type": "Initialized",
        "status": "True",
        "lastProbeTime": null,
        "lastTransitionTime": "2018-07-19T12:37:19Z"
      },
      {
        "type": "Ready",
        "status": "True",
        "lastProbeTime": null,
        "lastTransitionTime": "2018-07-19T12:43:53Z"
      },
      {
        "type": "PodScheduled",
        "status": "True",
        "lastProbeTime": null,
        "lastTransitionTime": "2018-07-19T12:43:17Z"
      }
    ],
    "hostIP": "10.190.177.242",
    "podIP": "10.190.177.242",
    "startTime": "2018-07-19T12:37:19Z",
    "containerStatuses": [
      {
        "name": "gpu-exporter",
        "state": {
          "running": {
            "startedAt": "2018-07-19T12:43:17Z"
          }
        },
        "lastState": {},
        "ready": true,
        "restartCount": 0,
        "image": "openpai/gpu-exporter:latest",
        "imageID": "docker-pullable://openpai/gpu-exporter@sha256:29b7386e031ca914c104b403235e257599643af6ffb935206779ce8733429b93",
        "containerID": "docker://f58a89e49aa4c420d00e56b9453e4f566efcef5b715789689c52fc12525d3e86"
      },
      {
        "name": "node-exporter",
        "state": {
          "running": {
            "startedAt": "2018-07-19T12:40:01Z"
          }
        },
        "lastState": {},
        "ready": true,
        "restartCount": 0,
        "image": "prom/node-exporter:v0.16.0",
        "imageID": "docker-pullable://prom/node-exporter@sha256:55302581333c43d540db0e144cf9e7735423117a733cdec27716d87254221086",
        "containerID": "docker://afc8449cc9df4933e57f6671dc9e4b1ed029b919213ab81091a739c460b5c33e"
      }
    ],
    "qosClass": "Burstable"
  }
}
xudifsd commented 6 years ago

It seems #882 disabled the uname collector (the `--no-collector.uname` argument in the node-exporter pod above). I didn't know it was used by Grafana. I will fix this by adding it back.

xudifsd commented 6 years ago

@Beyyes can you try deploying it again?

Beyyes commented 6 years ago

@xudifsd Yes, it works.

But is it normal that no data points are shown after I submit a job?

image

xudifsd commented 6 years ago

Is your job long-running, or is it a short-lived job? The exporter collects job metrics every 30s, so if the job does not last more than 30s, it will not have any data points.