Closed AlexUnwinder closed 2 months ago
I used the following console app to reproduce that:
// GPUWaitGlitch.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include <iostream>
#include <vector>
#include "Windows.h"
#include "PresentMonAPI.h"
#define MAX_FRAMES 1024
#define MAX_STEPS 10000
int main()
{
STARTUPINFO si;
ZeroMemory(&si, sizeof(STARTUPINFO));
si.cb = sizeof(STARTUPINFO);
PROCESS_INFORMATION pi;
ZeroMemory(&pi, sizeof(PROCESS_INFORMATION));
char szCmdLine[] = "Text3D_D3D9.exe";
if (CreateProcess(NULL, szCmdLine, NULL, NULL, TRUE, CREATE_NO_WINDOW, NULL, NULL, &si, &pi))
{
CloseHandle(pi.hProcess);
CloseHandle(pi.hThread);
PM_SESSION_HANDLE session;
if (pmOpenSession(&session) == PM_STATUS_SUCCESS)
{
if (pmStartTrackingProcess(session, pi.dwProcessId) == PM_STATUS_SUCCESS)
{
PM_QUERY_ELEMENT queryElements[]
{
{ PM_METRIC_CPU_BUSY , PM_STAT_NONE, 0, 0 },
{ PM_METRIC_CPU_WAIT , PM_STAT_NONE, 0, 0 },
{ PM_METRIC_GPU_LATENCY , PM_STAT_NONE, 0, 0 },
{ PM_METRIC_GPU_BUSY , PM_STAT_NONE, 0, 0 },
{ PM_METRIC_GPU_WAIT , PM_STAT_NONE, 0, 0 },
};
PM_FRAME_QUERY_HANDLE frameQuery;
uint32_t dwBlobSize = 0;
if (pmRegisterFrameQuery(session, &frameQuery, queryElements, _countof(queryElements), &dwBlobSize) == PM_STATUS_SUCCESS)
{
LPBYTE lpBlob = new BYTE[dwBlobSize * MAX_FRAMES];
for (DWORD dwStep=0; dwStep<MAX_STEPS; dwStep++)
{
uint32_t dwFrames = MAX_FRAMES;
do
{
PM_STATUS err;
err = pmConsumeFrames(frameQuery, pi.dwProcessId, lpBlob, &dwFrames);
if ((err == PM_STATUS_SUCCESS) ||
(err == PM_STATUS_NO_DATA))
{
if (dwFrames)
{
for (DWORD dwFrame=0; dwFrame<dwFrames; dwFrame++)
{
LPBYTE lpFrame = lpBlob + dwFrame * dwBlobSize;
double cpuBusy = *(double* )(lpFrame + queryElements[0].dataOffset);
double cpuWait = *(double* )(lpFrame + queryElements[1].dataOffset);
double gpuLatency = *(double* )(lpFrame + queryElements[2].dataOffset);
double gpuBusy = *(double* )(lpFrame + queryElements[3].dataOffset);
double gpuWait = *(double* )(lpFrame + queryElements[4].dataOffset);
std::cout << "Frametime : " << cpuBusy + gpuBusy << " ms, GPU wait : " << gpuWait << " ms\n";
}
}
}
else
std::cout << "Failed to consume frames!\n";
}
while (dwFrames == MAX_FRAMES);
Sleep(100);
}
delete[] lpBlob;
pmFreeFrameQuery(frameQuery);
}
else
std::cout << "Failed to register frame query!\n";
pmStopTrackingProcess(session, pi.dwProcessId);
}
else
std::cout << "Failed to start process tracking!\n";
pmCloseSession(session);
}
else
std::cout << "Failed to open session!\n";
}
else
std::cout << "Failed to spawn process!\n";
}
Thanks for reporting the issue and the sample source. We'll take a look at it.
I started attaching PresentMon 2.0 to RTSS and ran into a weird issue with the GPUWait/GPULatency metrics reporting abnormally high values under certain conditions. Inside my application I have both PresentMon service and PresentMon console application based data importing modes. When I import those metrics via the console PresentMon 2.0, everything works as expected. However, an attempt to read GPUWait/GPULatency via the PresentMon 2.0 service and PresentMonAPI2.dll directly sometimes returns bogus data in GPUWait/GPULatency, mostly under high-framerate scenarios. I've created a tiny console application which just spawns the Text3D sample from the old D3D9 SDK; it runs at a 3-5K FPS framerate and triggers the issue pretty quickly on my system:
Considering that in the error case it reports abnormally high values for both GPUWait and GPULatency, and both of them are calculated using a GPUStart timestamp difference, I'd assume that the wrong value is possibly reported when this timestamp is uninitialized/zeroed for some reason. What really puzzles me is that I cannot reproduce this with PresentMon's own overlay, which also seems to communicate with the service via PresentMonAPI2.dll. Perhaps in raw mode some additional filtering should still be applied to the data prior to displaying it?