XuehaiPan / nvitop

An interactive NVIDIA-GPU process viewer and beyond, the one-stop solution for GPU process management.
https://nvitop.readthedocs.io
Apache License 2.0

[Question] Suggest changing the 0.25s at api/device.py line 2125 to 1s #91

Closed hui-zhao-1 closed 1 year ago

hui-zhao-1 commented 1 year ago

Required prerequisites

Questions

I have recently been testing against the master branch and still see monitoring data going missing. I suspect the 0.25s at api/device.py line 2125 is too small and causes samples to be dropped.
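To make the hypothesis concrete, here is a toy simulation (illustrative only, not nvitop code). It assumes the driver emits a utilization sample every 1/6s to 1s, as man nvidia-smi describes (see the maintainer's reply below), while the collector polls once per second but only sees samples newer than `lookback` seconds:

import random

# Toy model: the driver emits a utilization sample every 1/6s..1s
# (assumption taken from `man nvidia-smi`); the collector polls once per
# second but only sees samples newer than `lookback` seconds.
def missed_polls(lookback: float, poll_period: float = 1.0,
                 horizon: float = 3600.0, seed: int = 0) -> float:
    """Fraction of polls whose (t - lookback, t] window holds no sample."""
    rng = random.Random(seed)
    samples = []
    t = rng.uniform(0.0, 1.0)
    while t < horizon:
        samples.append(t)
        t += rng.uniform(1.0 / 6.0, 1.0)

    misses = polls = 0
    t = poll_period
    while t < horizon:
        if not any(t - lookback < s <= t for s in samples):
            misses += 1
        polls += 1
        t += poll_period
    return misses / polls

for lookback in (0.25, 0.5, 0.75, 1.0):
    print(f'lookback={lookback:.2f}s -> {missed_polls(lookback):.0%} polls empty')

Under these assumptions, a 1s window always contains at least one driver sample, while a 0.25s window is frequently empty, which matches the branch comparison reported below.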

Test environment: Ubuntu 20.04.6 LTS (Focal Fossa)

nvidia-smi
Fri Aug 18 09:20:13 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.10              Driver Version: 535.86.10    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce GTX 1660 Ti     Off | 00000000:01:00.0 Off |                  N/A |
| 28%   47C    P2              40W / 120W |     75MiB /  6144MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A   1155688      C   ./nvitop-test                                72MiB |
+---------------------------------------------------------------------------------------+
pip list |grep nvidia-ml-py
nvidia-ml-py       12.535.77

The detailed test method and results are as follows:

hui-zhao-1 commented 1 year ago

After forking the code, I created the following four branches:

- https://github.com/2581543189/nvitop/tree/250ms
- https://github.com/2581543189/nvitop/tree/500ms
- https://github.com/2581543189/nvitop/tree/750ms
- https://github.com/2581543189/nvitop/tree/1s

On the test machine I created four conda environments, installed one of the four nvitop branches in each, and collected the per-process metrics from all four environments with Prometheus for 12 consecutive hours. The results:

[screenshots: Prometheus per-process monitoring graphs, one per branch]

As the graphs show, https://github.com/2581543189/nvitop/tree/1s is the branch that loses almost no data.

hui-zhao-1 commented 1 year ago

The GPU program used in the test is as follows:

#include <stdio.h>
#include <thread>
#include <chrono>
#include <iostream>
#include <string>   // std::string
#include <ctime>    // time, localtime, strftime
#include <cstdlib>  // srand, rand
#include <cuda_runtime.h>

// nvcc nvitop-test.cu -o nvitop-test -std=c++11

void sleep(int milliseconds) {
        std::cout << "start sleep()" << milliseconds << " ms" << std::endl;
        auto start = std::chrono::high_resolution_clock::now();
        std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds));
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> elapsed = end - start;
        std::cout << "stop sleep(): " << elapsed.count() << " ms" << std::endl;
}

void initialData(float* ip, int size) {
        // generate different seed for random number
        time_t t;
        srand((unsigned)time(&t));
        for (int i = 0; i < size; i++) {
                ip[i] = (float)(rand() & 0xFF) / 10.0f;
        }
}

__global__ void testMaxFlopsKernel(float* pData, long nRepeats, float v1, float v2)
{
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        float s = pData[tid], s2 = 10.0f - s, s3 = 9.0f - s, s4 = 9.0f - s2;
        for (long i = 0; i < nRepeats; i++)
        {
                s = v1 - s * v2;
                s2 = v1 - s * v2;
                s3 = v1 - s2 * v2;
                s4 = v1 - s3 * v2;
        }
        pData[tid] = ((s + s2) + (s3 + s4));
}

const std::string currentDateTime() {
        time_t     now = time(0);
        struct tm  tstruct;
        char       buf[80];
        tstruct = *localtime(&now);
        // Visit http://en.cppreference.com/w/cpp/chrono/c/strftime
        // for more information about date/time format
        strftime(buf, sizeof(buf), "%Y-%m-%d.%X", &tstruct);

        return buf;
}

int main(int argc, char** argv) {
        // set up device
        int dev = 0;
        cudaSetDevice(dev);

        // set up data size of vectors
        int nElem = 1;
        printf("Vector size %d\n", nElem);
        long nRepeats = 1000000000;
        printf("nRepeats %ld\n", nRepeats);

        // malloc host memory
        size_t nBytes = nElem * sizeof(float);
        float* h_pData;
        h_pData = (float*)malloc(nBytes);

        // initialize data at host side
        initialData(h_pData, nElem);

        // malloc device global memory
        float* d_pData;
        cudaMalloc((float**)&d_pData, nBytes);

        // transfer data from host to device
        cudaMemcpy(d_pData, h_pData, nBytes, cudaMemcpyHostToDevice);

        // invoke kernel at host side
        dim3 block(1, 1, 1);
        dim3 grid(1, 1, 1);

        // use long long: 10000000000 does not fit in a 32-bit int
        for (long long index = 0; index <= 10000000000LL; index++) {

                std::cout << "----------------------------------------------- "<< std::endl;
                std::cout << "start:" << currentDateTime() << std::endl;
                // std::cout << "start testMaxFlopsKernel()" << std::endl;
                auto start = std::chrono::steady_clock::now();
                testMaxFlopsKernel << < grid, block >> > (d_pData, nRepeats, 1.0f, 2.0f);
                cudaMemcpy(h_pData, d_pData, nBytes, cudaMemcpyDeviceToHost);
                auto end = std::chrono::steady_clock::now();
                auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
                double time = ms.count();
                std::cout << "end:" << currentDateTime() << std::endl;
                std::cout << "during: " << time << " ms" << std::endl;
                sleep(5000);
        }
        cudaFree(d_pData);
        free(h_pData);
        return(0);
}

The Prometheus collection code is at the links below; a sketch of this kind of exporter follows the list.

- https://github.com/2581543189/nvitop/blob/250ms/nvitop/prometheus/cli.py
- https://github.com/2581543189/nvitop/blob/500ms/nvitop/prometheus/cli.py
- https://github.com/2581543189/nvitop/blob/750ms/nvitop/prometheus/cli.py
- https://github.com/2581543189/nvitop/blob/1s/nvitop/prometheus/cli.py
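For reference, an exporter of this kind reduces to a polling loop like the sketch below. The metric name is made up for illustration (the real ones are in the linked cli.py files), and it assumes prometheus_client together with nvitop's Device/GpuProcess API:

import time

from prometheus_client import Gauge, start_http_server
from nvitop import NA, Device

# Illustrative metric name only; the real exporters live in the linked
# cli.py files.
SM_UTIL = Gauge('nvitop_process_sm_utilization',
                'Per-process SM utilization (%)', ['pid'])

def main(poll_interval: float = 1.0) -> None:
    start_http_server(8000)  # Prometheus scrapes http://host:8000/metrics
    device = Device(0)
    while True:
        for pid, process in device.processes().items():
            util = process.gpu_sm_utilization()
            if util is not NA:  # NVML may have no fresh sample to report
                SM_UTIL.labels(pid=str(pid)).set(util)
        time.sleep(poll_interval)

if __name__ == '__main__':
    main()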

XuehaiPan commented 1 year ago

According to man nvidia-smi:

GPU utilization is sampled over a period between 1/6s and 1s, and the reported value is the average over that period.

[screenshot: man nvidia-smi, GPU utilization section]

nvidia-smi pmon uses the timestamp of the previous sample as its reference. Its default sampling period is 1s, and the output is the average over that period.

[screenshot: man nvidia-smi, pmon section]

I will make the corresponding change in the next PR.
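For illustration, assuming the 0.25s at api/device.py line 2125 is the lookback timestamp passed to NVML's per-process utilization query, the change amounts to widening that window to cover the driver's worst-case 1s sampling period. A minimal sketch with nvidia-ml-py (pynvml):

import time

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

# NVML returns only the samples newer than the timestamp we pass in
# (microseconds since the epoch). Looking back a full second keeps at
# least one driver sample in the window, since samples arrive at most
# ~1s apart; a 0.25s lookback often finds no sample at all.
last_seen = time.time_ns() // 1000 - 1_000_000  # now - 1s, in microseconds

try:
    samples = pynvml.nvmlDeviceGetProcessUtilization(handle, last_seen)
except pynvml.NVMLError_NotFound:  # no sample in the window
    samples = []

for sample in samples:
    print(sample.pid, sample.smUtil, sample.memUtil)

pynvml.nvmlShutdown()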