Closed GoogleCodeExporter closed 9 years ago
Essentially I'd like to see a toy example of linking in CUBLAS and have it run
against the ocelot emulator is some provable fashion. Maybe having a
deliberate out of bounds memory assignment and having ocelot catch it or
something. I'd want to see the command line that compiles and links said test
application.
Original comment by mcbromb...@gmail.com
on 21 Oct 2010 at 4:49
[deleted comment]
Try giving this simple example a go:
--- begin source code ---
/*!
\file simplerCUFFTdriver.cpp
\author Andrew Kerr <arkerr@gatech.edu>
\brief implements the simplerCUFFT application using the CUDA runtime API
*/
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>
/*!
computes the N-point DFT of signal X and stores in Y
*/
bool host_dft(cuComplex *Y, cuComplex *X, float scale, int N) {
const float Pi = 3.14159265358979f;
for (int n = 0; n < N; n++) {
cuComplex X_n = {0, 0};
for (int k = 0; k < N; k++) {
float w = -2.0f * Pi / (float)N * (float)n * (float)k;
cuComplex e = {(float)cos(w), (float)sin(w)};
cuComplex x_k = X[k];
X_n.x += e.x * x_k.x - e.y * x_k.y;
X_n.y += e.y * x_k.x + e.x * x_k.y;
}
X_n.x *= scale;
X_n.y *= scale;
Y[n] = X_n;
}
return true;
}
/*!
computes the N-point DFT of signal X and stores in Y using CUDA's FFT library
*/
bool cuda_dft(cuComplex *Y, cuComplex *X, float scale, int N) {
size_t bytes = (size_t)N * sizeof(cuComplex);
cuComplex *Y_gpu, *X_gpu;
cudaMalloc((void **)&Y_gpu, bytes);
cudaMalloc((void **)&X_gpu, bytes);
cudaMemcpy(Y_gpu, Y, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(X_gpu, X, bytes, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan1d(&plan, N, CUFFT_C2C, 1);
cufftExecC2C(plan, X_gpu, Y_gpu, CUFFT_FORWARD);
cufftDestroy(plan);
cudaMemcpy(Y, Y_gpu, bytes, cudaMemcpyDeviceToHost);
cudaFree(Y_gpu);
cudaFree(X_gpu);
for (int n = 0; n < N; n++) {
Y[n].x *= scale;
Y[n].y *= scale;
}
return true;
}
int main(int argc, char *arg[]) {
int N = 32;
size_t bytes = (size_t)N * sizeof(cuComplex);
cuComplex *X, *Y_ref, *Y_exp;
X = (cuComplex *)malloc(bytes);
Y_ref = (cuComplex *)malloc(bytes);
Y_exp = (cuComplex *)malloc(bytes);
//
// initialize data
//
srand(2009);
for (int n = 0; n < N; n++) {
X[n].x = (float)(rand() % 1000) / 500.0f - 1.0f;
X[n].y = (float)((rand()+7) % 1000) / 500.0f - 1.0f;
Y_ref[n].x = 0;
Y_ref[n].y = 0;
Y_exp[n].x = 0;
Y_exp[n].y = 0;
}
//
// perform DFTs
//
host_dft(Y_ref, X, 1.0f, N);
cuda_dft(Y_exp, X, 1.0f, N);
//
// compare results
//
int errors = 0;
for (int n = 0; (errors < 10) && n < N; n++) {
cuComplex e = Y_ref[n], g = Y_exp[n];
cuComplex error = {e.x - g.x, e.y - g.y};
float mag_sq_error = error.x * error.x + error.y * error.y;
if (mag_sq_error >= 0.000001f) {
// error on element n
++errors;
printf("[n=%d]: expected (%f+%fi), got (%f+%fi)\n", n, e.x, e.y, g.x, g.y);
}
}
printf("DFT %s\n", (errors ? "FAILED" : "passed"));
//
// cleanup
//
free(X);
free(Y_ref);
free(Y_exp);
return 0;
}
--- end source code -----
To compile this example, make sure that ocelot is installed correctly:
normal@phenom:~/checkout/gpuocelot/tests/cuda2.2/tests/simplerCUFFT$
OcelotConfig -v
1.2.603
Compile the application, link against ocelot and cufft:
normal@phenom:~/checkout/gpuocelot/tests/cuda2.2/tests/simplerCUFFT$ g++
simplerCUFFT.cpp -o simplercufft -g -lcufft `OcelotConfig -l` -I
/usr/local/cuda/include/ -L /usr/local/cuda/lib64/
Run the application:
normal@phenom:~/checkout/gpuocelot/tests/cuda2.2/tests/simplerCUFFT$
./simplercufft
DFT passed
Verify that you are running on the emulator. Make sure that the debugger is
enabled in configure.ocelot and that the emulator is selected.
--- begin configure.ocelot ---
{
ocelot: "ocelot",
trace: {
memoryChecker: false,
raceDetector: {
enabled: false,
ignoreIrrelevantWrites: true
},
debugger: {
enabled: true,
kernelFilter: "",
alwaysAttach: true
}
},
cuda: {
implementation: CudaRuntime,
runtimeApiTrace: "trace/CudaAPI.trace"
},
executive: {
devices: [ emulated ],
optimizationLevel: none,
workerThreadLimit: 2
}
}
--- end configure.ocelot -----
Run the program again:
normal@phenom:~/checkout/gpuocelot/tests/cuda2.2/tests/simplerCUFFT$
./simplercufft
Step through the program using the emulator:
--- begin emulator output ---
normal@phenom:~/checkout/gpuocelot/tests/cuda2.2/tests/simplerCUFFT$
./simplercufft
(ocelot-dbg) Attaching debugger to kernel
'_Z20fftSmemRadix2_kernelP6float2S0_jf'
(ocelot-dbg) s
(0) - mov.u64 %r0, s_data
(ocelot-dbg) s
(1) - ld.param.f32 %r1,
[__cudaparm__Z20fftSmemRadix2_kernelP6float2S0_jf___val_paramphaseBase + 20]
(ocelot-dbg) s
(2) - ld.param.u32 %r2,
[__cudaparm__Z20fftSmemRadix2_kernelP6float2S0_jf_vectorSize + 16]
(ocelot-dbg) s
(3) - cvt.u64.u32 %r3, %tid.x
(ocelot-dbg) s
(4) - mov.s32 %r4, %tid.x
(ocelot-dbg) p a
(5) - mul.wide.u32 %r5, %r4, 8
(6) - cvt.u32.u64 %r6, %r3
(7) - mov.u32 %r7, %ctaid.x
(8) - mul.lo.u32 %r8, %r7, %r2
(9) - cvt.u64.u32 %r9, %r8
(10) - mul.wide.u32 %r10, %r8, 8
(11) - ld.param.u64 %r11,
[__cudaparm__Z20fftSmemRadix2_kernelP6float2S0_jf___val_paramd_Src + 8]
(12) - add.u64 %r12, %r11, %r10
(13) - cvt.u64.u32 %r13, %r6
(14) - mul.wide.u32 %r14, %r6, 8
(ocelot-dbg) p r
THREAD 0 THREAD 1 THREAD 2 THREAD 3 THREAD 4
R0 0 0 0 0
0
R1 be490fdb be490fdb be490fdb be490fdb
be490fdb
R2 20 20 20 20
20
R3 0 1 2 3
4
R4 0 1 2 3
4
R5 0 0 0 0
0
R6 0 0 0 0
0
R7 0 0 0 0
0
R8 0 0 0 0
0
R9 0 0 0 0
0
(ocelot-dbg)
--- end emulator output ----
Original comment by gregory....@gatech.edu
on 19 Nov 2010 at 1:46
Original issue reported on code.google.com by
mcbromb...@gmail.com
on 20 Oct 2010 at 10:53