MuGdxy / muda

μ-Cuda, COVER THE LAST MILE OF CUDA. With features: intellisense-friendly, structured launch, automatic cuda graph generation and updating.
https://mugdxy.github.io/muda-doc/
Apache License 2.0
149 stars 7 forks source link

Problem #52

Closed Da1sypetals closed 3 months ago

Da1sypetals commented 3 months ago

The problem occurred when I tried to create a simple example following the Zhihu tutorial:

main.cu:

#include <cstdio>
#include <muda/container.h>
#include <muda/muda.h>

#define MUDA_CHECK_ON 1 // NOTE(review): defined AFTER the muda #includes above, so those headers were already preprocessed without seeing it; presumably it must precede them (or come from the build system) to take effect - confirm
#define MUDA_WITH_COMPUTE_GRAPH 0 // NOTE(review): same ordering caveat as MUDA_CHECK_ON

namespace md = muda; // short alias used throughout main()

int main() { // element-wise vector add (dC = dA + dB) launched through md::ParallelFor
    constexpr int N = 1024;
    md::HostVector<float> hA(N), hB(N), hC(N);
    md::DeviceVector<float> dA(N), dB(N), dC(N);

    // initialize A and B with pseudo-random floats in [0, 1] (std::rand() / RAND_MAX)
    auto rand = [] { return std::rand() / (float)RAND_MAX; };
    std::generate(hA.begin(), hA.end(), rand);
    std::generate(hB.begin(), hB.end(), rand);

    // copy A and B to device
    dA = hA;
    dB = hB;

    int threadsPerBlock = 256; // only feeds blocksPerGrid; the launch below hard-codes 256
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; // never referenced afterwards (compiler warning #177-D)

    md::ParallelFor(256) // presumably 256 is the block size - confirm against the muda docs
        .apply(
            N,
            [dC = dC.viewer(),        // | this is a capture list              |
             dA = dA.cviewer(),       // | map from device_vector to a viewer  |
             dB = dB.cviewer()]       // | which is the most muda-style part!  |
            __device__(int i) mutable // place "mutable" to make dC modifiable
            {
                // safe parallel for will cover the range [0, N)
                // i just goes from 0 to N-1
                dC(i) = dA(i) + dB(i);
                md::print("%f\n", dC(i)); // device-side print of the element just written
            })
        .wait(); // wait for the kernel to finish

    // copy C back to host
    hC = dC;
}

CMakeLists.txt:

# Minimal build script: one CUDA executable (main.cu) linked against a vendored copy of muda.
# Requires CMake >= 3.15; policies are set as of 3.26 when a newer CMake is used.
cmake_minimum_required(VERSION 3.15...3.26)
# Enable both the C++ and the CUDA toolchains for this project.
project(template LANGUAGES CXX CUDA)
# NOTE(review): this sets the standard for CXX sources only; main.cu is compiled as CUDA,
# whose standard is controlled by CMAKE_CUDA_STANDARD. Confirm whether the muda target
# already propagates the language standard it needs.
set(CMAKE_CXX_STANDARD 17)

# disable muda examples
# (FORCE overwrites any value already stored in the CMake cache)
set(MUDA_BUILD_EXAMPLE OFF CACHE BOOL "" FORCE)

# muda is vendored under extern/muda; its own CMakeLists.txt provides the `muda` target.
add_subdirectory(extern/muda)

add_executable(main main.cu)

# Compile device code as relocatable device code (separate compilation + device link step).
set_target_properties(main PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# Link against muda and inherit the usage requirements (include paths, flags) it exports.
target_link_libraries(main PUBLIC muda)

error:

[  0%] Built target muda
[ 33%] Building CUDA object CMakeFiles/main.dir/main.cu.o
/mnt/a/dev/startup/extern/muda/src/muda/buffer/details/buffer_view.inl(26): warning #20040-D: a __host__ function("muda::BufferViewT<IsConst, T>::BufferViewT") redeclared with __host__ __device__, hence treated as a __host__ __device__ function

/mnt/a/dev/startup/extern/muda/src/muda/buffer/details/buffer_2d_view.inl(45): warning #20040-D: a __host__ function("muda::Buffer2DViewT<IsConst, T>::as_const") redeclared with __host__ __device__, hence treated as a __host__ __device__ function

/mnt/a/dev/startup/main.cu(25): warning #177-D: variable "blocksPerGrid" was declared but never referenced

/mnt/a/dev/startup/extern/muda/src/muda/print.h(36): error: the global scope has no "printf"
      ::printf(fmt, print_check(print_convert(std::forward<Args>(arg)))...);
        ^

1 error detected in the compilation of "/mnt/a/dev/startup/main.cu".
make[2]: *** [CMakeFiles/main.dir/build.make:77: CMakeFiles/main.dir/main.cu.o] Error 2
make[1]: *** [CMakeFiles/Makefile2:100: CMakeFiles/main.dir/all] Error 2
make: *** [Makefile:91: all] Error 2

The problem was resolved after I manually added #include <cstdio> at the top of src/muda/print.h, but I don't think that should be the proper solution. Is this a configuration mistake on my side? If so, how should I configure a minimal project? If not, what should be done to correct it? Please do not refer me to the solidsim example, since it is too sophisticated to serve as a starting project template. Thanks a lot in advance.

Da1sypetals commented 3 months ago

Also, when I tried to trigger an out-of-range access, the program did not terminate; it just produced wrong results. The code is:


After many attempts this problem persists.

#include <cstdio>
#include <muda/container.h>
#include <muda/muda.h>

namespace md = muda;

// Reproduction for the out-of-range report: same vector add as the first listing,
// except that the kernel writes to dC(i + 10) instead of dC(i).
int main() {
    constexpr int N = 1024;
    md::HostVector<float> hA(N), hB(N), hC(N);
    md::DeviceVector<float> dA(N), dB(N), dC(N);

    // initialize A and B with pseudo-random floats in [0, 1] (std::rand() / RAND_MAX)
    auto rand = [] { return std::rand() / (float)RAND_MAX; };
    std::generate(hA.begin(), hA.end(), rand);
    std::generate(hB.begin(), hB.end(), rand);

    // copy A and B to device
    dA = hA;
    dB = hB;

    // NOTE(review): both locals are unused; the launch below hard-codes 256.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    md::ParallelFor(256) // presumably 256 is the block size - confirm against the muda docs
        .kernel_name("add") // presumably a label used in muda diagnostics - confirm
        .apply(
            N,
            [dC = dC.viewer(),        // | this is a capture list              |
             dA = dA.cviewer(),       // | map from device_vector to a viewer  |
             dB = dB.cviewer()]       // | which is the most muda-style part!  |
            __device__(int i) mutable // place "mutable" to make dC modifiable
            {
                // safe parallel for will cover the range [0, N)
                // i just goes from 0 to N-1
                // Intentional out-of-range write: dC holds N elements, so for
                // i >= N - 10 the index i + 10 falls outside [0, N).
                dC(i + 10) = dA(i) + dB(i);
                // Reads dC(i), NOT the element written on the line above; dC(i) is
                // only ever written by the iteration with index i - 10, and nothing
                // here orders that write before this read.
                md::print("%f\n", dC(i));
            })
        .wait(); // wait for the kernel to finish

    // copy C back to host
    hC = dC;
}

The printed results are ALL zeros, for example:

0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
// omitted
MuGdxy commented 3 months ago

The problem occurred when I tried to create a simple example following the Zhihu tutorial:

main.cu:

#include <cstdio>
#include <muda/container.h>
#include <muda/muda.h>

#define MUDA_CHECK_ON 1 // NOTE(review): defined AFTER the muda #includes above, so those headers were already preprocessed without seeing it; presumably it must precede them (or come from the build system) to take effect - confirm
#define MUDA_WITH_COMPUTE_GRAPH 0 // NOTE(review): same ordering caveat as MUDA_CHECK_ON

namespace md = muda; // short alias used throughout main()

int main() { // element-wise vector add (dC = dA + dB) launched through md::ParallelFor
    constexpr int N = 1024;
    md::HostVector<float> hA(N), hB(N), hC(N);
    md::DeviceVector<float> dA(N), dB(N), dC(N);

    // initialize A and B with pseudo-random floats in [0, 1] (std::rand() / RAND_MAX)
    auto rand = [] { return std::rand() / (float)RAND_MAX; };
    std::generate(hA.begin(), hA.end(), rand);
    std::generate(hB.begin(), hB.end(), rand);

    // copy A and B to device
    dA = hA;
    dB = hB;

    int threadsPerBlock = 256; // only feeds blocksPerGrid; the launch below hard-codes 256
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; // never referenced afterwards (compiler warning #177-D)

    md::ParallelFor(256) // presumably 256 is the block size - confirm against the muda docs
        .apply(
            N,
            [dC = dC.viewer(),        // | this is a capture list              |
             dA = dA.cviewer(),       // | map from device_vector to a viewer  |
             dB = dB.cviewer()]       // | which is the most muda-style part!  |
            __device__(int i) mutable // place "mutable" to make dC modifiable
            {
                // safe parallel for will cover the range [0, N)
                // i just goes from 0 to N-1
                dC(i) = dA(i) + dB(i);
                md::print("%f\n", dC(i)); // device-side print of the element just written
            })
        .wait(); // wait for the kernel to finish

    // copy C back to host
    hC = dC;
}

CMakeLists.txt:

# Minimal build script: one CUDA executable (main.cu) linked against a vendored copy of muda.
# Requires CMake >= 3.15; policies are set as of 3.26 when a newer CMake is used.
cmake_minimum_required(VERSION 3.15...3.26)
# Enable both the C++ and the CUDA toolchains for this project.
project(template LANGUAGES CXX CUDA)
# NOTE(review): this sets the standard for CXX sources only; main.cu is compiled as CUDA,
# whose standard is controlled by CMAKE_CUDA_STANDARD. Confirm whether the muda target
# already propagates the language standard it needs.
set(CMAKE_CXX_STANDARD 17)

# disable muda examples
# (FORCE overwrites any value already stored in the CMake cache)
set(MUDA_BUILD_EXAMPLE OFF CACHE BOOL "" FORCE)

# muda is vendored under extern/muda; its own CMakeLists.txt provides the `muda` target.
add_subdirectory(extern/muda)

add_executable(main main.cu)

# Compile device code as relocatable device code (separate compilation + device link step).
set_target_properties(main PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# Link against muda and inherit the usage requirements (include paths, flags) it exports.
target_link_libraries(main PUBLIC muda)

error:

[  0%] Built target muda
[ 33%] Building CUDA object CMakeFiles/main.dir/main.cu.o
/mnt/a/dev/startup/extern/muda/src/muda/buffer/details/buffer_view.inl(26): warning #20040-D: a __host__ function("muda::BufferViewT<IsConst, T>::BufferViewT") redeclared with __host__ __device__, hence treated as a __host__ __device__ function

/mnt/a/dev/startup/extern/muda/src/muda/buffer/details/buffer_2d_view.inl(45): warning #20040-D: a __host__ function("muda::Buffer2DViewT<IsConst, T>::as_const") redeclared with __host__ __device__, hence treated as a __host__ __device__ function

/mnt/a/dev/startup/main.cu(25): warning #177-D: variable "blocksPerGrid" was declared but never referenced

/mnt/a/dev/startup/extern/muda/src/muda/print.h(36): error: the global scope has no "printf"
      ::printf(fmt, print_check(print_convert(std::forward<Args>(arg)))...);
        ^

1 error detected in the compilation of "/mnt/a/dev/startup/main.cu".
make[2]: *** [CMakeFiles/main.dir/build.make:77: CMakeFiles/main.dir/main.cu.o] Error 2
make[1]: *** [CMakeFiles/Makefile2:100: CMakeFiles/main.dir/all] Error 2
make: *** [Makefile:91: all] Error 2

The problem was resolved after I manually added #include <cstdio> at the top of src/muda/print.h, but I don't think that should be the proper solution. Is this a configuration mistake on my side? If so, how should I configure a minimal project? If not, what should be done to correct it? Please do not refer me to the solidsim example, since it is too sophisticated to serve as a starting project template. Thanks a lot in advance.


I think you are right: I forgot to add the #include <cstdio> to <muda/print.h>. Can you add this and raise a PR?