flatironinstitute / finufft

Non-uniform fast Fourier transform library of types 1,2,3 in dimensions 1,2,3

Why is my GPU version slower than CPU version #385

Closed. qinshucheng closed this issue 6 months ago.

ahbarnett commented 11 months ago

Can you give us a reproducible example (or all your problem parameters, N, M, eps, etc) and your setup (type of GPU, type of CPU, etc)? It could be you are running too small a problem.

qinshucheng commented 11 months ago

Here is my setup:
CPU: Intel(R) Xeon(R) Gold 6230R @ 2.10GHz, 52 cores
GPU: GeForce RTX 3090, 24GB

Everything runs on a CentOS 7 system.

Dimension of data: [3458 36 24] <-> [256 256 24]

Time elapsed: finufft (CPU): 0.119 s; cufinufft: 0.387 s (including the time spent transferring data: HostToDevice 0.005 s and DeviceToHost 0.002 s).

The code:

    int64_t m_iSamples = 3458;
    int64_t m_iSpokes = 36;
    int64_t m_iROSize = 256;
    int64_t m_iPESize = 256;
    int64_t m_iSPESize = 24;
    int64_t m_iCHSize = 1;

    std::shared_ptr<CFloat[]> KspRank(new CFloat[m_iSamples * m_iSpokes * m_iSPESize * m_iCHSize]);  // array form so delete[] is used on destruction
    CFloat* pKspSrc = KspRank.get();

    std::shared_ptr<CFloat[]> ImgRank(new CFloat[m_iROSize * m_iPESize * m_iSPESize * m_iCHSize]);
    CFloat* pImgDes = ImgRank.get();
    
    std::shared_ptr<float[]> kTraj(new float[m_iSamples * m_iSpokes * 2]);
    float* m_iKtraj = kTraj.get();

    // My pre-saved data
    ReadBin(sFile + "ksp_3458x36x24.bin", pKspSrc, m_iSamples * m_iSpokes * m_iSPESize);
    ReadBin(sFile + "traj_3458x36x2.bin", m_iKtraj, m_iSamples * m_iSpokes * 2); //[-pi, pi)

    for (int64_t ii = 1;ii < m_iCHSize;++ii)
    {
        memcpy(pKspSrc + ii * m_iSamples * m_iSpokes * m_iSPESize, pKspSrc, sizeof(CFloat) * m_iSamples * m_iSpokes * m_iSPESize);
    }

    int iNumThread = std::min<int>((int)m_iSPESize * m_iCHSize, omp_get_max_threads());

    // CPU
    boost::posix_time::ptime tC = boost::posix_time::microsec_clock::local_time();
    finufft_opts m_optsB;
    finufft_default_opts(&m_optsB);
    m_optsB.upsampfac = 2.0;
    m_optsB.nthreads = iNumThread;
    finufftf2d1many(m_iSPESize * m_iCHSize, m_iSamples * m_iSpokes, m_iKtraj, m_iKtraj + m_iSamples * m_iSpokes, pKspSrc,
        1, 1e-6, m_iROSize, m_iPESize, pImgDes, &m_optsB);

    boost::posix_time::time_duration durationC = boost::posix_time::microsec_clock::local_time() - tC;
    std::cout << "finufftf cpu    Elapsed    time=    " << durationC << std::endl;

    // GPU
    float* m_d_x, * m_d_y;
    cuFloatComplex* m_d_c, * m_d_fk;
    cufinufftf_plan m_dplan;
    cudaMalloc(&m_d_x, m_iSamples * m_iSpokes * sizeof(float));
    cudaMalloc(&m_d_y, m_iSamples * m_iSpokes * sizeof(float));
    cudaMalloc(&m_d_c, m_iSamples * m_iSpokes * m_iSPESize * m_iCHSize * sizeof(cuFloatComplex));
    cudaMalloc(&m_d_fk, m_iROSize * m_iPESize * m_iSPESize * m_iCHSize * sizeof(cuFloatComplex));

    cufinufft_opts m_gpuopts;
    cufinufft_default_opts(&m_gpuopts);

    m_gpuopts.gpu_method = 1;
    m_gpuopts.gpu_binsizex = 32;
    m_gpuopts.gpu_binsizey = 32;

    boost::posix_time::ptime t1 = boost::posix_time::microsec_clock::local_time();
    cudaMemcpy(m_d_x, m_iKtraj, m_iSamples * m_iSpokes * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(m_d_y, m_iKtraj + m_iSamples * m_iSpokes, m_iSamples * m_iSpokes * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(m_d_c, pKspSrc, m_iSamples * m_iSpokes * m_iSPESize * m_iCHSize * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);

    int64_t nmodes[3];
    nmodes[0] = m_iROSize;
    nmodes[1] = m_iPESize;
    nmodes[2] = 1;
    int ier;
    boost::posix_time::time_duration duration1 = boost::posix_time::microsec_clock::local_time() - t1;
    std::cout << "cufinufftf HostToDevice    Elapsed    time=    " << duration1 << std::endl;

    boost::posix_time::ptime t2 = boost::posix_time::microsec_clock::local_time();
    ier = cufinufftf_makeplan(1, 2, nmodes, 1, m_iSPESize * m_iCHSize, 1e-6, &m_dplan, &m_gpuopts);

    ier = cufinufftf_setpts(m_dplan, m_iSamples * m_iSpokes, m_d_x, m_d_y, NULL, 0, NULL, NULL, NULL);

    ier = cufinufftf_execute(m_dplan, m_d_c, m_d_fk);

    ier = cufinufftf_destroy(m_dplan);
    boost::posix_time::time_duration duration2 = boost::posix_time::microsec_clock::local_time() - t2;
    std::cout << "cufinufftf running    Elapsed    time=    " << duration2 << std::endl;

    boost::posix_time::ptime t3 = boost::posix_time::microsec_clock::local_time();
    cudaMemcpy(pImgDes, m_d_fk, m_iROSize * m_iPESize * m_iSPESize* m_iCHSize * sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
    boost::posix_time::time_duration duration3 = boost::posix_time::microsec_clock::local_time() - t3;
    std::cout << "cufinufftf DeviceToHost    Elapsed    time=    " << duration3 << std::endl;

ahbarnett commented 11 months ago

Thanks! I did not attempt to digest your code in detail. The main facts are: what are N (i.e. N1, N2), M (# of NU pts), and ntransforms? From your 2d1 call it looks like ntrans=24, M = 36*3458 = 124488, N1=N2=256, and eps=1e-6.

Indeed on my 8-core Ryzen2 laptop I get similar CPU perf to your report (testing random data):

test/finufft2dmany_test 24 256 256 124488 1e-6
test 2d1 many vs repeated single: ------------------------------------
ntr=24: 124488 NU pts to (256,256) modes in 0.128 s 2.34e+07 NU pts/s
one mode: rel err in F[94,66] of trans#23 is 1.06e-07

I just tested the GPU for the same problem on an A6000 (roughly comparable to an RTX 3090): we actually see that method=1 (GM-sort) is twice as fast as method=2 (SM), so I recommend the former. Note that at 2.3e8 NU pts/s it is 10x the above CPU throughput (after planning):

./cufinufft2dmany_test 1 1 256 256 24 0 124488 1e-6 1e-5 d
#modes = 65536, #inputs = 24, #NUpts = 124488
[time  ] dummy warmup call to CUFFT 0.00114 s
[time  ] cufinufft plan: 0.00991 s
[time  ] cufinufft setNUpts: 0.000204 s
[time  ] cufinufft exec: 0.0129 s
[time  ] cufinufft destroy: 0.000194 s
[gpu   ] 23th data one mode: rel err in F[94,66] is 6.46e-07
[totaltime] 2.32e+04 us, speed 1.29e+08 NUpts/s
(exec-only thoughput: 2.32e+08 NU pts/s)

./cufinufft2dmany_test 2 1 256 256 24 0 124488 1e-6 1e-5 d
#modes = 65536, #inputs = 24, #NUpts = 124488
[time  ] dummy warmup call to CUFFT 0.00115 s
[time  ] cufinufft plan: 0.000575 s
[time  ] cufinufft setNUpts: 0.000282 s
[time  ] cufinufft exec: 0.0241 s
[time  ] cufinufft destroy: 0.000121 s
[gpu   ] 23th data one mode: rel err in F[94,66] is 6.46e-07
[totaltime] 2.5e+04 us, speed 1.19e+08 NUpts/s
(exec-only thoughput: 1.24e+08 NU pts/s)

So, the first task is for you to run and report the above tests on your hardware (e.g., build with FINUFFT_BUILD_TESTS=ON in cmake).

The second task is for you to see why you're not getting the GPU throughput that the tester reports. Is all your time taken in planning or in setpts?
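
A minimal sketch of how to time each stage separately (assuming the same device arrays, nmodes, and opts you already set up in your code; the function and variable names here are just illustrative, and the cudaDeviceSynchronize() before each clock read keeps the GPU work attributed to the right stage):

    #include <chrono>
    #include <cstdint>
    #include <iostream>
    #include <cuda_runtime.h>
    #include <cuComplex.h>
    #include <cufinufft.h>

    // Seconds elapsed since t0, after waiting for all queued GPU work to finish.
    static double secsSince(std::chrono::steady_clock::time_point t0) {
        cudaDeviceSynchronize();
        return std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count();
    }

    void timeCufinufftStages(int ntrans, int64_t M, float* d_x, float* d_y,
                             cuFloatComplex* d_c, cuFloatComplex* d_fk,
                             int64_t nmodes[3], cufinufft_opts* opts) {
        cufinufftf_plan plan;

        auto t = std::chrono::steady_clock::now();
        cufinufftf_makeplan(1, 2, nmodes, 1, ntrans, 1e-6, &plan, opts);
        std::cout << "makeplan: " << secsSince(t) << " s\n";

        t = std::chrono::steady_clock::now();
        cufinufftf_setpts(plan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL);
        std::cout << "setpts:   " << secsSince(t) << " s\n";

        t = std::chrono::steady_clock::now();
        cufinufftf_execute(plan, d_c, d_fk);
        std::cout << "exec:     " << secsSince(t) << " s\n";

        t = std::chrono::steady_clock::now();
        cufinufftf_destroy(plan);
        std::cout << "destroy:  " << secsSince(t) << " s\n";
    }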

FWIW a fancy desktop (16-core Xeon Gold 6244) gets less than twice the laptop speed:

OMP_NUM_THREADS=16 ./finufft2dmany_test 24 256 256 124488 1e-6
test 2d1 many vs repeated single: ------------------------------------
ntr=24: 124488 NU pts to (256,256) modes in 0.0743 s 4.02e+07 NU pts/s
one mode: rel err in F[94,66] of trans#23 is 1.15e-07

Sometimes on the CPU it's better to limit the threads to one per physical core, or even fewer.

PS you can improve all the above timings a bit (maybe 40-70% faster) by going to single-precision. I just checked the GPU and it gets about 4e8 NU pts/s in single-prec. You should aim for this in your MRI application.

The upshot is that you should get a 5-10x speedup over a decent CPU with a decent GPU.

Does this help? Best, Alex

ahbarnett commented 11 months ago

If you only want 3 digits of accuracy, and you pack together 96 slices instead of 24, you can get a throughput of a billion NU pts/sec:

(base) ahb@ccmlin019 /mnt/home/ahb/numerics/finufft/build/test/cuda> ./cufinufft2dmany_test 1 1 256 256 96 0 124488 1e-3 1e-2 f
#modes = 65536, #inputs = 96, #NUpts = 124488
[time  ] dummy warmup call to CUFFT  0.00174 s
[time  ] cufinufft plan:         0.000557 s
[time  ] cufinufft setNUpts:         0.000191 s
[time  ] cufinufft exec:         0.0113 s
[time  ] cufinufft destroy:      0.000108 s
[gpu   ] 95th data one mode: rel err in F[94,66] is 0.00014
[totaltime] 1.22e+04 us, speed 9.83e+08 NUpts/s
                    (exec-only thoughput: 1.06e+09 NU pts/s)

Notice the setup time is now negligible vs the exec time. This is what you should be seeing (ignoring H2D, D2H, which you have already measured). Let us know what you find. Best, Alex
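
As a rough sketch (assuming your k-space trajectory stays fixed across iterations, and with n_iters a hypothetical iteration count), the pattern to aim for is to plan and set the NU points once, and only call execute inside the loop:

    // Sketch only: d_x, d_y, d_c, d_fk, nmodes, M, ntrans, opts as in your code above.
    cufinufftf_plan plan;
    cufinufftf_makeplan(1, 2, nmodes, 1, ntrans, 1e-6, &plan, &opts);
    cufinufftf_setpts(plan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL);   // trajectory fixed: do once

    for (int iter = 0; iter < n_iters; ++iter) {
        // ... update d_c on the device ...
        cufinufftf_execute(plan, d_c, d_fk);    // only the exec cost is paid per iteration
        // ... consume d_fk on the device; avoid D2H copies inside the loop if possible ...
    }

    cufinufftf_destroy(plan);   // pay the (small) destroy cost once at the end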

qinshucheng commented 11 months ago

Thanks a lot! It helps!

Actually I got similar results from those tests. It seems the most time-consuming steps on my setup are the warmup call to CUFFT and the planning, which I did not time separately the first time.

I ran those tests on the same setup mentioned above.

test/finufft2dmany_test 24 256 256 124488 1e-6
test 2d1 many vs repeated single: ------------------------------------
ntr=24: 124488 NU pts to (256,256) modes in 0.0617 s    4.84e+07 NU pts/s
one mode: rel err in F[94,66] of trans#23 is 1.06e-07
./cufinufft2dmany_test 1 1 256 256 24 0 124488 1e-6 1e-5 d
#modes = 65536, #inputs = 24, #NUpts = 124488
[time  ] dummy warmup call to CUFFT      0.164 s
[time  ] cufinufft plan:                 0.0357 s
[time  ] cufinufft setNUpts:             0.000312 s
[time  ] cufinufft exec:                 0.0136 s
[time  ] cufinufft destroy:              0.00375 s
[gpu   ] 23th data one mode: rel err in F[94,66] is 6.46e-07
[totaltime] 5.34e+04 us, speed 5.6e+07 NUpts/s
(exec-only thoughput: 2.19e+08 NU pts/s)
./cufinufft2dmany_test 2 1 256 256 24 0 124488 1e-6 1e-5 d
#modes = 65536, #inputs = 24, #NUpts = 124488
[time  ] dummy warmup call to CUFFT      0.165 s
[time  ] cufinufft plan:                 0.0355 s
[time  ] cufinufft setNUpts:             0.000328 s
[time  ] cufinufft exec:                 0.0261 s
[time  ] cufinufft destroy:              0.00375 s
[gpu   ] 23th data one mode: rel err in F[94,66] is 6.46e-07
[totaltime] 6.57e+04 us, speed 4.55e+07 NUpts/s
(exec-only thoughput: 1.14e+08 NU pts/s)

Method=1 (GM-sort) is indeed about twice as fast as method=2 (SM).

./cufinufft2dmany_test 1 1 256 256 24 0 124488 1e-6 1e-5 f
#modes = 65536, #inputs = 24, #NUpts = 124488
[time  ] dummy warmup call to CUFFT      0.228 s
[time  ] cufinufft plan:                 0.0981 s
[time  ] cufinufft setNUpts:             0.000168 s
[time  ] cufinufft exec:                 0.00743 s
[time  ] cufinufft destroy:              0.00424 s
[gpu   ] 23th data one mode: rel err in F[94,66] is 9.74e-06
[totaltime] 1.1e+05 us, speed 2.72e+07 NUpts/s
(exec-only thoughput: 4.02e+08 NU pts/s)

And single precision is about 2x faster.

./cufinufft2dmany_test 1 1 256 256 96 0 124488 1e-3 1e-2 f
#modes = 65536, #inputs = 96, #NUpts = 124488
[time  ] dummy warmup call to CUFFT      0.229 s
[time  ] cufinufft plan:                 0.0981 s
[time  ] cufinufft setNUpts:             0.000184 s
[time  ] cufinufft exec:                 0.0113 s
[time  ] cufinufft destroy:              0.00431 s
[gpu   ] 95th data one mode: rel err in F[94,66] is 0.00014
[totaltime] 1.14e+05 us, speed 1.05e+08 NUpts/s
(exec-only thoughput: 1.05e+09 NU pts/s)

When I use 96 slices instead of 24, the exec time is nearly the same as your result.

So the main issue is the time spent on the warmup call to CUFFT and on planning, which confuses me. However, these two steps only need to be done once in my application, so it won't be a serious problem.

Thanks again!

ahbarnett commented 11 months ago

Yeah, it looks like your dummy (warmup) call to CUFFT takes 0.2 s, which is very slow. I seem to remember old V100s having this problem. Could it be a CUDA version issue? Maybe search around regarding CUFFT warmup time. In any case, just insert such a dummy call at the start of your code. Your GPU plan times are also 10x longer than mine, but as you say, you only have to plan once since the k-space points stay fixed as you iterate (I think!). Your execute times are the same as mine, as you say - great! Let me know if you're able to achieve this speed in your MRI application (apart from H2D, D2H, which is inevitable, unless you keep the whole iteration on the GPU, which is the best way!). Cheers, Alex
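
A minimal sketch of such a one-time warmup (using only standard CUDA runtime and cuFFT calls; the tiny throwaway plan absorbs the context and library initialization cost):

    #include <cuda_runtime.h>
    #include <cufft.h>

    // Call once at application startup, before any timed GPU work.
    void warmupGpu() {
        cudaFree(0);                                   // force CUDA context creation
        cufftHandle h;
        if (cufftPlan1d(&h, 64, CUFFT_C2C, 1) == CUFFT_SUCCESS)
            cufftDestroy(h);                           // tiny plan triggers cuFFT initialization
        cudaDeviceSynchronize();
    }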
