nzinfo / gpuocelot

Automatically exported from code.google.com/p/gpuocelot
0 stars 0 forks source link

CUDA API Trace Generator #24

Closed GoogleCodeExporter closed 9 years ago

GoogleCodeExporter commented 9 years ago
Describe the New Feature:
Add an implementation of the CUDA runtime that records a trace of every
call made.

Which milestone does the feature belong to?
0.9.0

Which branch does the new feature go in?
Trunk

Original issue reported on code.google.com by gregory....@gatech.edu on 5 Sep 2009 at 7:13

GoogleCodeExporter commented 9 years ago
Added code for generating a trace file.  Here is a sample:

cudaRegisterFatBinary pointer = 0x8295480 time = 0.121332
cudaRegisterFunction handle = 0 hostFunction = 0x8052a90 deviceFunction =
_Z10testKernelI8RGBA32_2EvPT_S2_i deviceName = _Z10testKernelI8RGBA32_2EvPT_S2_i
threadLimit = -1 time = 0.00250602
cudaRegisterFunction handle = 0 hostFunction = 0x8052b70 deviceFunction =
_Z10testKernelI6RGBA32EvPT_S2_i deviceName = _Z10testKernelI6RGBA32EvPT_S2_i
threadLimit = -1 time = 9.29832e-06
cudaRegisterFunction handle = 0 hostFunction = 0x8052c50 deviceFunction =
_Z10testKernelI5RGB32EvPT_S2_i deviceName = _Z10testKernelI5RGB32EvPT_S2_i
threadLimit = -1 time = 8.58307e-06
cudaRegisterFunction handle = 0 hostFunction = 0x8052d30 deviceFunction =
_Z10testKernelI4LA32EvPT_S2_i deviceName = _Z10testKernelI4LA32EvPT_S2_i
threadLimit = -1 time = 8.34465e-06
cudaRegisterFunction handle = 0 hostFunction = 0x8052e10 deviceFunction =
_Z10testKernelIjEvPT_S1_i deviceName = _Z10testKernelIjEvPT_S1_i
threadLimit = -1 time = 8.34465e-06
cudaRegisterFunction handle = 0 hostFunction = 0x8052ef0 deviceFunction =
_Z10testKernelI5RGBA8EvPT_S2_i deviceName = _Z10testKernelI5RGBA8EvPT_S2_i
threadLimit = -1 time = 7.86781e-06
cudaRegisterFunction handle = 0 hostFunction = 0x8052fd0 deviceFunction =
_Z10testKernelI17RGBA32_misalignedEvPT_S2_i deviceName =
_Z10testKernelI17RGBA32_misalignedEvPT_S2_i threadLimit = -1 time = 8.34465e-06
cudaRegisterFunction handle = 0 hostFunction = 0x80530b0 deviceFunction =
_Z10testKernelI16RGB32_misalignedEvPT_S2_i deviceName =
_Z10testKernelI16RGB32_misalignedEvPT_S2_i threadLimit = -1 time = 8.58307e-06
cudaRegisterFunction handle = 0 hostFunction = 0x8053190 deviceFunction =
_Z10testKernelI15LA32_misalignedEvPT_S2_i deviceName =
_Z10testKernelI15LA32_misalignedEvPT_S2_i threadLimit = -1 time = 8.10623e-06
cudaRegisterFunction handle = 0 hostFunction = 0x8053270 deviceFunction =
_Z10testKernelI16RGBA8_misalignedEvPT_S2_i deviceName =
_Z10testKernelI16RGBA8_misalignedEvPT_S2_i threadLimit = -1 time = 8.34465e-06
cudaRegisterFunction handle = 0 hostFunction = 0x8053350 deviceFunction =
_Z10testKernelItEvPT_S1_i deviceName = _Z10testKernelItEvPT_S1_i
threadLimit = -1 time = 7.86781e-06
cudaRegisterFunction handle = 0 hostFunction = 0x8053430 deviceFunction =
_Z10testKernelIhEvPT_S1_i deviceName = _Z10testKernelIhEvPT_S1_i
threadLimit = -1 time = 1.35899e-05
cudaGetDeviceCount devices = 2 time = 4.29153e-06
cudaGetDeviceProperties pointer = 0xbfe318dc device = 0 time = 1.09673e-05
cudaGetDeviceProperties pointer = 0xbfe318dc device = 1 time = 4.76837e-06
cudaSetDevice device = 0 time = 4.52995e-06
cudaMalloc pointer = 0xb77e0010 size = 500000 time = 5.14984e-05
cudaMalloc pointer = 0xb7765010 size = 500000 time = 2.57492e-05
cudaMemcpy destination = 0xb77e0010 source = 0xb78d6008 size = 500000 kind = 1 time = 0.00121546
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000796795
cudaThreadSynchronize time = 2.14577e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.57356e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 3.69549e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.66893e-05
cudaSetupArgument size = 4 arg0 = 32 arg1 = -95 arg2 = 7 arg3 = 0 offset = 8 time = 1.09673e-05
cudaLaunch entry = 0x8053430 time = 2.69885
cudaGetLastError error = 0 time = 3.33786e-06
cudaThreadSynchronize time = 1.90735e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000905752
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000460148
cudaThreadSynchronize time = 1.90735e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.38419e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.09673e-05
cudaSetupArgument size = 4 arg0 = -112 arg1 = -48 arg2 = 3 arg3 = 0 offset = 8 time = 1.21593e-05
cudaLaunch entry = 0x8053350 time = 1.43816
cudaGetLastError error = 0 time = 3.33786e-06
cudaThreadSynchronize time = 1.66893e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000684261
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000485182
cudaThreadSynchronize time = 2.14577e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.3365e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.14441e-05
cudaSetupArgument size = 4 arg0 = 72 arg1 = -24 arg2 = 1 arg3 = 0 offset = 8 time = 9.77516e-06
cudaLaunch entry = 0x8053270 time = 2.113
cudaGetLastError error = 0 time = 3.33786e-06
cudaThreadSynchronize time = 1.90735e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000694752
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000464678
cudaThreadSynchronize time = 2.38419e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 9.77516e-06
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.38419e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.09673e-05
cudaSetupArgument size = 4 arg0 = 36 arg1 = -12 arg2 = 0 arg3 = 0 offset = 8 time = 9.77516e-06
cudaLaunch entry = 0x8053190 time = 0.70875
cudaGetLastError error = 0 time = 3.33786e-06
cudaThreadSynchronize time = 2.14577e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000729322
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000462055
cudaThreadSynchronize time = 2.38419e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.00136e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.40803e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.12057e-05
cudaSetupArgument size = 4 arg0 = -62 arg1 = -94 arg2 = 0 arg3 = 0 offset = 8 time = 9.53674e-06
cudaLaunch entry = 0x80530b0 time = 0.67153
cudaGetLastError error = 0 time = 3.33786e-06
cudaThreadSynchronize time = 1.90735e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000722647
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000483751
cudaThreadSynchronize time = 2.14577e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.31266e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.14441e-05
cudaSetupArgument size = 4 arg0 = 18 arg1 = 122 arg2 = 0 arg3 = 0 offset = 8 time = 1.00136e-05
cudaLaunch entry = 0x8052fd0 time = 0.656292
cudaGetLastError error = 0 time = 3.33786e-06
cudaThreadSynchronize time = 2.14577e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000687599
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000513077
cudaThreadSynchronize time = 2.14577e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.00136e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.28882e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.07288e-05
cudaSetupArgument size = 4 arg0 = 72 arg1 = -24 arg2 = 1 arg3 = 0 offset = 8 time = 9.77516e-06
cudaLaunch entry = 0x8052ef0 time = 0.93977
cudaGetLastError error = 0 time = 3.09944e-06
cudaThreadSynchronize time = 2.14577e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.00067997
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000460386
cudaThreadSynchronize time = 1.90735e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.12057e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.3365e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.07288e-05
cudaSetupArgument size = 4 arg0 = 72 arg1 = -24 arg2 = 1 arg3 = 0 offset = 8 time = 9.77516e-06
cudaLaunch entry = 0x8052e10 time = 0.80118
cudaGetLastError error = 0 time = 2.86102e-06
cudaThreadSynchronize time = 1.66893e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000792742
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000489473
cudaThreadSynchronize time = 2.14577e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 9.77516e-06
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.31266e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.14441e-05
cudaSetupArgument size = 4 arg0 = 36 arg1 = -12 arg2 = 0 arg3 = 0 offset = 8 time = 9.77516e-06
cudaLaunch entry = 0x8052d30 time = 0.518435
cudaGetLastError error = 0 time = 3.33786e-06
cudaThreadSynchronize time = 2.14577e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.00070715
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000479698
cudaThreadSynchronize time = 2.14577e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.38419e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.12057e-05
cudaSetupArgument size = 4 arg0 = 18 arg1 = 122 arg2 = 0 arg3 = 0 offset = 8 time = 1.00136e-05
cudaLaunch entry = 0x8052c50 time = 0.405424
cudaGetLastError error = 0 time = 3.33786e-06
cudaThreadSynchronize time = 1.90735e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000710249
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000466108
cudaThreadSynchronize time = 2.14577e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 9.77516e-06
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.38419e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.07288e-05
cudaSetupArgument size = 4 arg0 = 18 arg1 = 122 arg2 = 0 arg3 = 0 offset = 8 time = 1.00136e-05
cudaLaunch entry = 0x8052b70 time = 0.359554
cudaGetLastError error = 0 time = 4.29153e-06
cudaThreadSynchronize time = 2.38419e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000850439
cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.00043273
cudaThreadSynchronize time = 2.14577e-06
cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.40803e-05
cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.09673e-05
cudaSetupArgument size = 4 arg0 = 9 arg1 = 61 arg2 = 0 arg3 = 0 offset = 8 time = 1.00136e-05
cudaLaunch entry = 0x8052a90 time = 0.334061
cudaGetLastError error = 0 time = 3.33786e-06
cudaThreadSynchronize time = 1.90735e-06
cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000709295
cudaFree pointer = 0xb77e0010 time = 0.000252247
cudaFree pointer = 0xb7765010 time = 0.000199795
cudaUnregisterFatBinary handle = 0 time = 0.0116422

Original comment by gregory....@gatech.edu on 7 Sep 2009 at 7:59