Closed jwang323 closed 9 years ago
From gregory....@gatech.edu on September 07, 2009 12:59:30
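Added code for generating a trace file. Here is a sample; each entry is the name of a CUDA runtime call followed by its `name = value` arguments and ends with a `time` field: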
cudaRegisterFatBinary pointer = 0x8295480 time = 0.121332 cudaRegisterFunction handle = 0 hostFunction = 0x8052a90 deviceFunction = _Z10testKernelI8RGBA32_2EvPT_S2_i deviceName = _Z10testKernelI8RGBA32_2EvPT_S2_i threadLimit = -1 time = 0.00250602 cudaRegisterFunction handle = 0 hostFunction = 0x8052b70 deviceFunction = _Z10testKernelI6RGBA32EvPT_S2_i deviceName = _Z10testKernelI6RGBA32EvPT_S2_i threadLimit = -1 time = 9.29832e-06 cudaRegisterFunction handle = 0 hostFunction = 0x8052c50 deviceFunction = _Z10testKernelI5RGB32EvPT_S2_i deviceName = _Z10testKernelI5RGB32EvPT_S2_i threadLimit = -1 time = 8.58307e-06 cudaRegisterFunction handle = 0 hostFunction = 0x8052d30 deviceFunction = _Z10testKernelI4LA32EvPT_S2_i deviceName = _Z10testKernelI4LA32EvPT_S2_i threadLimit = -1 time = 8.34465e-06 cudaRegisterFunction handle = 0 hostFunction = 0x8052e10 deviceFunction = _Z10testKernelIjEvPT_S1_i deviceName = _Z10testKernelIjEvPT_S1_i threadLimit = -1 time = 8.34465e-06 cudaRegisterFunction handle = 0 hostFunction = 0x8052ef0 deviceFunction = _Z10testKernelI5RGBA8EvPT_S2_i deviceName = _Z10testKernelI5RGBA8EvPT_S2_i threadLimit = -1 time = 7.86781e-06 cudaRegisterFunction handle = 0 hostFunction = 0x8052fd0 deviceFunction = _Z10testKernelI17RGBA32_misalignedEvPT_S2_i deviceName = _Z10testKernelI17RGBA32_misalignedEvPT_S2_i threadLimit = -1 time = 8.34465e-06 cudaRegisterFunction handle = 0 hostFunction = 0x80530b0 deviceFunction = _Z10testKernelI16RGB32_misalignedEvPT_S2_i deviceName = _Z10testKernelI16RGB32_misalignedEvPT_S2_i threadLimit = -1 time = 8.58307e-06 cudaRegisterFunction handle = 0 hostFunction = 0x8053190 deviceFunction = _Z10testKernelI15LA32_misalignedEvPT_S2_i deviceName = _Z10testKernelI15LA32_misalignedEvPT_S2_i threadLimit = -1 time = 8.10623e-06 cudaRegisterFunction handle = 0 hostFunction = 0x8053270 deviceFunction = _Z10testKernelI16RGBA8_misalignedEvPT_S2_i deviceName = _Z10testKernelI16RGBA8_misalignedEvPT_S2_i threadLimit = -1 time = 8.34465e-06 
cudaRegisterFunction handle = 0 hostFunction = 0x8053350 deviceFunction = _Z10testKernelItEvPT_S1_i deviceName = _Z10testKernelItEvPT_S1_i threadLimit = -1 time = 7.86781e-06 cudaRegisterFunction handle = 0 hostFunction = 0x8053430 deviceFunction = _Z10testKernelIhEvPT_S1_i deviceName = _Z10testKernelIhEvPT_S1_i threadLimit = -1 time = 1.35899e-05 cudaGetDeviceCount devices = 2 time = 4.29153e-06 cudaGetDeviceProperties pointer = 0xbfe318dc device = 0 time = 1.09673e-05 cudaGetDeviceProperties pointer = 0xbfe318dc device = 1 time = 4.76837e-06 cudaSetDevice device = 0 time = 4.52995e-06 cudaMalloc pointer = 0xb77e0010 size = 500000 time = 5.14984e-05 cudaMalloc pointer = 0xb7765010 size = 500000 time = 2.57492e-05 cudaMemcpy destination = 0xb77e0010 source = 0xb78d6008 size = 500000 kind = 1 time = 0.00121546 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000796795 cudaThreadSynchronize time = 2.14577e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.57356e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 3.69549e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.66893e-05 cudaSetupArgument size = 4 arg0 = 32 arg1 = -95 arg2 = 7 arg3 = 0 offset = 8 time = 1.09673e-05 cudaLaunch entry = 0x8053430 time = 2.69885 cudaGetLastError error = 0 time = 3.33786e-06 cudaThreadSynchronize time = 1.90735e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000905752 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000460148 cudaThreadSynchronize time = 1.90735e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.38419e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 
offset = 4 time = 1.09673e-05 cudaSetupArgument size = 4 arg0 = -112 arg1 = -48 arg2 = 3 arg3 = 0 offset = 8 time = 1.21593e-05 cudaLaunch entry = 0x8053350 time = 1.43816 cudaGetLastError error = 0 time = 3.33786e-06 cudaThreadSynchronize time = 1.66893e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000684261 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000485182 cudaThreadSynchronize time = 2.14577e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.3365e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.14441e-05 cudaSetupArgument size = 4 arg0 = 72 arg1 = -24 arg2 = 1 arg3 = 0 offset = 8 time = 9.77516e-06 cudaLaunch entry = 0x8053270 time = 2.113 cudaGetLastError error = 0 time = 3.33786e-06 cudaThreadSynchronize time = 1.90735e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000694752 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000464678 cudaThreadSynchronize time = 2.38419e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 9.77516e-06 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.38419e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.09673e-05 cudaSetupArgument size = 4 arg0 = 36 arg1 = -12 arg2 = 0 arg3 = 0 offset = 8 time = 9.77516e-06 cudaLaunch entry = 0x8053190 time = 0.70875 cudaGetLastError error = 0 time = 3.33786e-06 cudaThreadSynchronize time = 2.14577e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000729322 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000462055 cudaThreadSynchronize time = 
2.38419e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.00136e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.40803e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.12057e-05 cudaSetupArgument size = 4 arg0 = -62 arg1 = -94 arg2 = 0 arg3 = 0 offset = 8 time = 9.53674e-06 cudaLaunch entry = 0x80530b0 time = 0.67153 cudaGetLastError error = 0 time = 3.33786e-06 cudaThreadSynchronize time = 1.90735e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000722647 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000483751 cudaThreadSynchronize time = 2.14577e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blo...
From gregory....@gatech.edu on September 07, 2009 12:59:30
...ckX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.31266e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.14441e-05 cudaSetupArgument size = 4 arg0 = 18 arg1 = 122 arg2 = 0 arg3 = 0 offset = 8 time = 1.00136e-05 cudaLaunch entry = 0x8052fd0 time = 0.656292 cudaGetLastError error = 0 time = 3.33786e-06 cudaThreadSynchronize time = 2.14577e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000687599 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000513077 cudaThreadSynchronize time = 2.14577e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.00136e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.28882e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.07288e-05 cudaSetupArgument size = 4 arg0 = 72 arg1 = -24 arg2 = 1 arg3 = 0 offset = 8 time = 9.77516e-06 cudaLaunch entry = 0x8052ef0 time = 0.93977 cudaGetLastError error = 0 time = 3.09944e-06 cudaThreadSynchronize time = 2.14577e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.00067997 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000460386 cudaThreadSynchronize time = 1.90735e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.12057e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.3365e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.07288e-05 cudaSetupArgument size = 4 arg0 = 72 arg1 = -24 arg2 = 1 arg3 = 0 offset = 8 time = 9.77516e-06 cudaLaunch entry = 0x8052e10 time = 0.80118 cudaGetLastError error = 0 time = 2.86102e-06 
cudaThreadSynchronize time = 1.66893e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000792742 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000489473 cudaThreadSynchronize time = 2.14577e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 9.77516e-06 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.31266e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.14441e-05 cudaSetupArgument size = 4 arg0 = 36 arg1 = -12 arg2 = 0 arg3 = 0 offset = 8 time = 9.77516e-06 cudaLaunch entry = 0x8052d30 time = 0.518435 cudaGetLastError error = 0 time = 3.33786e-06 cudaThreadSynchronize time = 2.14577e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.00070715 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000479698 cudaThreadSynchronize time = 2.14577e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.38419e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.12057e-05 cudaSetupArgument size = 4 arg0 = 18 arg1 = 122 arg2 = 0 arg3 = 0 offset = 8 time = 1.00136e-05 cudaLaunch entry = 0x8052c50 time = 0.405424 cudaGetLastError error = 0 time = 3.33786e-06 cudaThreadSynchronize time = 1.90735e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000710249 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.000466108 cudaThreadSynchronize time = 2.14577e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 9.77516e-06 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time 
= 2.38419e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.07288e-05 cudaSetupArgument size = 4 arg0 = 18 arg1 = 122 arg2 = 0 arg3 = 0 offset = 8 time = 1.00136e-05 cudaLaunch entry = 0x8052b70 time = 0.359554 cudaGetLastError error = 0 time = 4.29153e-06 cudaThreadSynchronize time = 2.38419e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000850439 cudaMemset devPtr = 0xb7765010 value = 0 bytes = 500000 time = 0.00043273 cudaThreadSynchronize time = 2.14577e-06 cudaConfigureCall gridX = 64 gridY = 1 gridZ = 1 blockX = 256 blockY = 1 blockZ = 1 sharedMem = 0 stream = 0 time = 1.0252e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 80 arg2 = 118 arg3 = -73 offset = 0 time = 2.40803e-05 cudaSetupArgument size = 4 arg0 = 16 arg1 = 0 arg2 = 126 arg3 = -73 offset = 4 time = 1.09673e-05 cudaSetupArgument size = 4 arg0 = 9 arg1 = 61 arg2 = 0 arg3 = 0 offset = 8 time = 1.00136e-05 cudaLaunch entry = 0x8052a90 time = 0.334061 cudaGetLastError error = 0 time = 3.33786e-06 cudaThreadSynchronize time = 1.90735e-06 cudaMemcpy destination = 0xb785b008 source = 0xb7765010 size = 500000 kind = 2 time = 0.000709295 cudaFree pointer = 0xb77e0010 time = 0.000252247 cudaFree pointer = 0xb7765010 time = 0.000199795 cudaUnregisterFatBinary handle = 0 time = 0.0116422
Status: Fixed
From gregory....@gatech.edu on September 05, 2009 15:13:20
Describe the New Feature: Add an implementation of the CUDA runtime that records a trace of every call made.
Which milestone does the feature belong to? 0.9.0
Which branch does the new feature go in? Trunk
Original issue: http://code.google.com/p/gpuocelot/issues/detail?id=24