Closed Da1sypetals closed 3 months ago
Also, when I deliberately triggered an out-of-range access, the program did not terminate; it just produced wrong results. The code is:
After many attempts this problem persists.
#include <cstdio>
#include <muda/container.h>
#include <muda/muda.h>
namespace md = muda;
// Reproducer from the report: adds two random float vectors element-wise on
// the device through md::ParallelFor, but stores each sum 10 elements past the
// invocation's own index in order to probe out-of-range handling.
// NOTE(review): std::generate and std::rand are used below, yet the snippet
// includes neither <algorithm> nor <cstdlib>; it presumably compiles only
// through transitive includes -- confirm.
int main() {
constexpr int N = 1024;
// Host-side and device-side vectors, each constructed with N floats.
md::HostVector<float> hA(N), hB(N), hC(N);
md::DeviceVector<float> dA(N), dB(N), dC(N);
// initialize A and B using random numbers
// (std::rand() divided by RAND_MAX as float gives values in [0, 1])
auto rand = [] { return std::rand() / (float)RAND_MAX; };
std::generate(hA.begin(), hA.end(), rand);
std::generate(hB.begin(), hB.end(), rand);
// copy A and B to device
dA = hA;
dB = hB;
// NOTE: neither of the next two variables is referenced again; the launch
// below is given the literal 256 instead of threadsPerBlock.
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
// presumably 256 is the block size used for the launch -- TODO confirm
// against the muda documentation
md::ParallelFor(256)
.kernel_name("add")
.apply(
N,
[dC = dC.viewer(), // | this is a capture list |
dA = dA.cviewer(), // | map from device_vector to a viewer |
dB = dB.cviewer()] // | which is the most muda-style part! |
__device__(int i) mutable // place "mutable" to make dC modifiable
{
// safe parallel for will cover the range [0, N)
// i just goes from 0 to N-1
// Intentional out-of-range probe: for i >= N - 10 the index i + 10 is >= N,
// i.e. past the N elements dC was constructed with.
dC(i + 10) = dA(i) + dB(i);
// Prints dC(i), which this invocation did not write (it wrote dC(i + 10));
// nothing visible here orders this read after the invocation that stores to
// index i. NOTE(review): this may be why the output shows zeros rather than
// the sums -- confirm.
md::print("%f\n", dC(i));
})
.wait(); // wait the kernel to finish
// copy C back to host
hC = dC;
}
The printed results are ALL zeros, like
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
// omitted
The problem occurred when I tried to create a simple example following the Zhihu tutorial:
main.cu:
// main.cu -- minimal muda example: element-wise addition of two random float
// vectors on the device, printing every result from inside the kernel.
#include <algorithm>  // std::generate
#include <cstdio>
#include <cstdlib>    // std::rand, RAND_MAX
#include <muda/container.h>
#include <muda/muda.h>

// NOTE(review): these two macros are defined AFTER the muda headers have been
// included, so they cannot influence how those headers were preprocessed.
// Presumably they are meant as muda configuration switches; if so they would
// have to come before the includes or be set by the build system -- confirm
// against the muda documentation.
#define MUDA_CHECK_ON 1
#define MUDA_WITH_COMPUTE_GRAPH 0

namespace md = muda;

int main() {
    constexpr int N = 1024;

    // Host-side and device-side vectors, each constructed with N floats.
    md::HostVector<float>   hA(N), hB(N), hC(N);
    md::DeviceVector<float> dA(N), dB(N), dC(N);

    // initialize A and B using random numbers in [0, 1]
    auto rand = [] { return std::rand() / (float)RAND_MAX; };
    std::generate(hA.begin(), hA.end(), rand);
    std::generate(hB.begin(), hB.end(), rand);

    // copy A and B to device
    dA = hA;
    dB = hB;

    // NOTE: neither variable is referenced again (nvcc reports warning #177-D
    // for blocksPerGrid); the launch below is given the literal 256.
    int threadsPerBlock = 256;
    int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

    md::ParallelFor(256)
        .apply(
            N,
            [dC = dC.viewer(),   // | this is a capture list               |
             dA = dA.cviewer(),  // | map from device_vector to a viewer   |
             dB = dB.cviewer()]  // | which is the most muda-style part!   |
            __device__(int i) mutable  // place "mutable" to make dC modifiable
            {
                // safe parallel for will cover the range [0, N)
                // i just goes from 0 to N-1
                dC(i) = dA(i) + dB(i);
                md::print("%f\n", dC(i));
            })
        .wait();  // wait the kernel to finish

    // copy C back to host
    hC = dC;
}
CMakeLists.txt:
# Minimal project that builds main.cu against the muda sources vendored in
# extern/muda.
cmake_minimum_required(VERSION 3.15...3.26)
project(template LANGUAGES CXX CUDA)

set(CMAKE_CXX_STANDARD 17)

# disable muda examples
# (the cache entry is forced before add_subdirectory so that muda's own
# CMakeLists sees it)
set(MUDA_BUILD_EXAMPLE OFF CACHE BOOL "" FORCE)
add_subdirectory(extern/muda)

add_executable(main main.cu)
set_target_properties(main PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(main PUBLIC muda)
error:
[ 0%] Built target muda
[ 33%] Building CUDA object CMakeFiles/main.dir/main.cu.o
/mnt/a/dev/startup/extern/muda/src/muda/buffer/details/buffer_view.inl(26): warning #20040-D: a __host__ function("muda::BufferViewT<IsConst, T>::BufferViewT") redeclared with __host__ __device__, hence treated as a __host__ __device__ function
/mnt/a/dev/startup/extern/muda/src/muda/buffer/details/buffer_2d_view.inl(45): warning #20040-D: a __host__ function("muda::Buffer2DViewT<IsConst, T>::as_const") redeclared with __host__ __device__, hence treated as a __host__ __device__ function
/mnt/a/dev/startup/main.cu(25): warning #177-D: variable "blocksPerGrid" was declared but never referenced
/mnt/a/dev/startup/extern/muda/src/muda/print.h(36): error: the global scope has no "printf"
::printf(fmt, print_check(print_convert(std::forward<Args>(arg)))...);
^
1 error detected in the compilation of "/mnt/a/dev/startup/main.cu".
make[2]: *** [CMakeFiles/main.dir/build.make:77: CMakeFiles/main.dir/main.cu.o] Error 2
make[1]: *** [CMakeFiles/Makefile2:100: CMakeFiles/main.dir/all] Error 2
make: *** [Makefile:91: all] Error 2
The problem was resolved after I manually added
#include <cstdio>
at the top of src/muda/print.h
. But I don't think this should be the solution. I wonder whether this is a configuration mistake on my part. If so, how should I configure a minimal project? If not, what should be done to correct this? Please do not refer me to the solidsim example, since it is too sophisticated to serve as a starting project template. Thanks a lot in advance.
I think you are right, I forgot to add the
The problem occurred when I tried to create a simple example following the Zhihu tutorial:
main.cu:
CMakeLists.txt:
error:
The problem was resolved after I manually added
#include <cstdio>
at the top of src/muda/print.h
. But I don't think this should be the solution. I wonder whether this is a configuration mistake on my part. If so, how should I configure a minimal project? If not, what should be done to correct this? Please do not refer me to the solidsim example, since it is too sophisticated to serve as a starting project template. Thanks a lot in advance.