Open jaredhoberock opened 9 years ago
Reproducer:
#include <thrust/version.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#define STENCIL_SIZE 8000000
#define STENCIL_TRUE_LOCATIONS 32
struct is_true
{
__host__ __device__
bool operator()(const bool x)
{
return x;
}
};
int main()
{
//allocate stencil
thrust::host_vector<bool> hv(STENCIL_SIZE,false);
size_t stencil_locations[STENCIL_TRUE_LOCATIONS] = {467508,
1326441,
1541662,
1679866,
2234773,
2387355,
2653762,
3159732,
3226888,
3828014,
3887644,
3909417,
3924245,
4042273,
4150580,
4233776,
4425058,
4446033,
4484984,
4565655,
4629464,
4703190,
4836990,
4903165,
4910365,
5328271,
5483482,
5755194,
5781566,
5966710,
5987753,
7870669};
for (size_t i=0;i<STENCIL_TRUE_LOCATIONS;i++)
hv[stencil_locations[i]] = true;
//copy stencil to GPU
thrust::device_vector<bool> dv = hv;
//FIRST METHOD: inclusive_scan / upper_bound
thrust::device_vector<size_t> dscanned(STENCIL_SIZE);
thrust::inclusive_scan(dv.begin(), dv.end(), dscanned.begin());
thrust::counting_iterator<size_t> count_it(0);
//dindices will have the locations of the 'true' values of the stencil
thrust::device_vector<size_t> dindices1(STENCIL_TRUE_LOCATIONS);
thrust::upper_bound(dscanned.begin(),dscanned.end(),count_it,count_it+STENCIL_TRUE_LOCATIONS,dindices1.begin());
//copy back to host
thrust::host_vector<size_t> hindices1 = dindices1;
//SECOND METHOD: copy_if
thrust::device_vector<size_t> dindices2(STENCIL_TRUE_LOCATIONS);
thrust::copy_if(count_it, count_it+STENCIL_SIZE, dv.begin(), dindices2.begin(), is_true());
//copy back to host
thrust::host_vector<size_t> hindices2 = dindices2;
for (size_t i=0;i<STENCIL_TRUE_LOCATIONS;i++)
{
printf("i: %2u stencil_location: %8u hindices1: %8u hindices2: %8u\n",i,stencil_locations[i],hindices1[i],hindices2[i]);
}
printf("done\n");
return 0;
}
A slightly modified version of the above that prints hindices1-hindices2
, to make it easy to tell if there's a failure (last column != 0 is a failure):
#include <thrust/version.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#define STENCIL_SIZE 8000000
#define STENCIL_TRUE_LOCATIONS 32
struct is_true
{
__host__ __device__
bool operator()(const bool x)
{
return x;
}
};
int main()
{
//allocate stencil
thrust::host_vector<bool> hv(STENCIL_SIZE,false);
size_t stencil_locations[STENCIL_TRUE_LOCATIONS] = {467508,
1326441,
1541662,
1679866,
2234773,
2387355,
2653762,
3159732,
3226888,
3828014,
3887644,
3909417,
3924245,
4042273,
4150580,
4233776,
4425058,
4446033,
4484984,
4565655,
4629464,
4703190,
4836990,
4903165,
4910365,
5328271,
5483482,
5755194,
5781566,
5966710,
5987753,
7870669};
for (size_t i=0;i<STENCIL_TRUE_LOCATIONS;i++)
hv[stencil_locations[i]] = true;
//copy stencil to GPU
thrust::device_vector<bool> dv = hv;
//FIRST METHOD: inclusive_scan / upper_bound
thrust::device_vector<size_t> dscanned(STENCIL_SIZE);
thrust::inclusive_scan(dv.begin(), dv.end(), dscanned.begin());
thrust::counting_iterator<size_t> count_it(0);
//dindices will have the locations of the 'true' values of the stencil
thrust::device_vector<size_t> dindices1(STENCIL_TRUE_LOCATIONS);
thrust::upper_bound(dscanned.begin(),dscanned.end(),count_it,count_it+STENCIL_TRUE_LOCATIONS,dindices1.begin());
//copy back to host
thrust::host_vector<size_t> hindices1 = dindices1;
//SECOND METHOD: copy_if
thrust::device_vector<size_t> dindices2(STENCIL_TRUE_LOCATIONS);
thrust::copy_if(count_it, count_it+STENCIL_SIZE, dv.begin(), dindices2.begin(), is_true());
//copy back to host
thrust::host_vector<size_t> hindices2 = dindices2;
for (size_t i=0;i<STENCIL_TRUE_LOCATIONS;i++)
{
printf("i: %2u stencil_location: %8u hindices1: %8u hindices2: %8u diff: %8u\n",i,stencil_locations[i],hindices1[i],hindices2[i],hindices1[i]-hindices2[i]);
}
printf("done\n");
return 0;
}
Amusingly, even more broken in CUDA 9.0:
[00:46:04]:wash@chimaera:/home/wash/development/nvidia/bugs/thrust_github_701__inclusive_scan_produces_wrong_result_for_char:0:$ ./thrust_github_701.cuda80
i: 0 stencil_location: 467508 hindices1: 467508 hindices2: 467508 diff: 0
i: 1 stencil_location: 1326441 hindices1: 1326441 hindices2: 1326441 diff: 0
i: 2 stencil_location: 1541662 hindices1: 1541662 hindices2: 1541662 diff: 0
i: 3 stencil_location: 1679866 hindices1: 1679866 hindices2: 1679866 diff: 0
i: 4 stencil_location: 2234773 hindices1: 2234773 hindices2: 2234773 diff: 0
i: 5 stencil_location: 2387355 hindices1: 2387355 hindices2: 2387355 diff: 0
i: 6 stencil_location: 2653762 hindices1: 2653762 hindices2: 2653762 diff: 0
i: 7 stencil_location: 3159732 hindices1: 3159732 hindices2: 3159732 diff: 0
i: 8 stencil_location: 3226888 hindices1: 3226888 hindices2: 3226888 diff: 0
i: 9 stencil_location: 3828014 hindices1: 3828014 hindices2: 3828014 diff: 0
i: 10 stencil_location: 3887644 hindices1: 3887644 hindices2: 3887644 diff: 0
i: 11 stencil_location: 3909417 hindices1: 4042273 hindices2: 3909417 diff: 132856
i: 12 stencil_location: 3924245 hindices1: 4150580 hindices2: 3924245 diff: 226335
i: 13 stencil_location: 4042273 hindices1: 4233776 hindices2: 4042273 diff: 191503
i: 14 stencil_location: 4150580 hindices1: 4425058 hindices2: 4150580 diff: 274478
i: 15 stencil_location: 4233776 hindices1: 4446033 hindices2: 4233776 diff: 212257
i: 16 stencil_location: 4425058 hindices1: 4484984 hindices2: 4425058 diff: 59926
i: 17 stencil_location: 4446033 hindices1: 4703190 hindices2: 4446033 diff: 257157
i: 18 stencil_location: 4484984 hindices1: 4836990 hindices2: 4484984 diff: 352006
i: 19 stencil_location: 4565655 hindices1: 4903165 hindices2: 4565655 diff: 337510
i: 20 stencil_location: 4629464 hindices1: 5328271 hindices2: 4629464 diff: 698807
i: 21 stencil_location: 4703190 hindices1: 5483482 hindices2: 4703190 diff: 780292
i: 22 stencil_location: 4836990 hindices1: 5755194 hindices2: 4836990 diff: 918204
i: 23 stencil_location: 4903165 hindices1: 5781566 hindices2: 4903165 diff: 878401
i: 24 stencil_location: 4910365 hindices1: 5966710 hindices2: 4910365 diff: 1056345
i: 25 stencil_location: 5328271 hindices1: 5987753 hindices2: 5328271 diff: 659482
i: 26 stencil_location: 5483482 hindices1: 8000000 hindices2: 5483482 diff: 2516518
i: 27 stencil_location: 5755194 hindices1: 8000000 hindices2: 5755194 diff: 2244806
i: 28 stencil_location: 5781566 hindices1: 8000000 hindices2: 5781566 diff: 2218434
i: 29 stencil_location: 5966710 hindices1: 8000000 hindices2: 5966710 diff: 2033290
i: 30 stencil_location: 5987753 hindices1: 8000000 hindices2: 5987753 diff: 2012247
i: 31 stencil_location: 7870669 hindices1: 8000000 hindices2: 7870669 diff: 129331
done
[00:46:38]:wash@chimaera:/home/wash/development/nvidia/bugs/thrust_github_701__inclusive_scan_produces_wrong_result_for_char:0:$ ./thrust_github_701.cuda90
i: 0 stencil_location: 467508 hindices1: 467508 hindices2: 467508 diff: 0
i: 1 stencil_location: 1326441 hindices1: 8000000 hindices2: 1326441 diff: 6673559
i: 2 stencil_location: 1541662 hindices1: 8000000 hindices2: 1541662 diff: 6458338
i: 3 stencil_location: 1679866 hindices1: 8000000 hindices2: 1679866 diff: 6320134
i: 4 stencil_location: 2234773 hindices1: 8000000 hindices2: 2234773 diff: 5765227
i: 5 stencil_location: 2387355 hindices1: 8000000 hindices2: 2387355 diff: 5612645
i: 6 stencil_location: 2653762 hindices1: 8000000 hindices2: 2653762 diff: 5346238
i: 7 stencil_location: 3159732 hindices1: 8000000 hindices2: 3159732 diff: 4840268
i: 8 stencil_location: 3226888 hindices1: 8000000 hindices2: 3226888 diff: 4773112
i: 9 stencil_location: 3828014 hindices1: 8000000 hindices2: 3828014 diff: 4171986
i: 10 stencil_location: 3887644 hindices1: 8000000 hindices2: 3887644 diff: 4112356
i: 11 stencil_location: 3909417 hindices1: 8000000 hindices2: 3909417 diff: 4090583
i: 12 stencil_location: 3924245 hindices1: 8000000 hindices2: 3924245 diff: 4075755
i: 13 stencil_location: 4042273 hindices1: 8000000 hindices2: 4042273 diff: 3957727
i: 14 stencil_location: 4150580 hindices1: 8000000 hindices2: 4150580 diff: 3849420
i: 15 stencil_location: 4233776 hindices1: 8000000 hindices2: 4233776 diff: 3766224
i: 16 stencil_location: 4425058 hindices1: 8000000 hindices2: 4425058 diff: 3574942
i: 17 stencil_location: 4446033 hindices1: 8000000 hindices2: 4446033 diff: 3553967
i: 18 stencil_location: 4484984 hindices1: 8000000 hindices2: 4484984 diff: 3515016
i: 19 stencil_location: 4565655 hindices1: 8000000 hindices2: 4565655 diff: 3434345
i: 20 stencil_location: 4629464 hindices1: 8000000 hindices2: 4629464 diff: 3370536
i: 21 stencil_location: 4703190 hindices1: 8000000 hindices2: 4703190 diff: 3296810
i: 22 stencil_location: 4836990 hindices1: 8000000 hindices2: 4836990 diff: 3163010
i: 23 stencil_location: 4903165 hindices1: 8000000 hindices2: 4903165 diff: 3096835
i: 24 stencil_location: 4910365 hindices1: 8000000 hindices2: 4910365 diff: 3089635
i: 25 stencil_location: 5328271 hindices1: 8000000 hindices2: 5328271 diff: 2671729
i: 26 stencil_location: 5483482 hindices1: 8000000 hindices2: 5483482 diff: 2516518
i: 27 stencil_location: 5755194 hindices1: 8000000 hindices2: 5755194 diff: 2244806
i: 28 stencil_location: 5781566 hindices1: 8000000 hindices2: 5781566 diff: 2218434
i: 29 stencil_location: 5966710 hindices1: 8000000 hindices2: 5966710 diff: 2033290
i: 30 stencil_location: 5987753 hindices1: 8000000 hindices2: 5987753 diff: 2012247
i: 31 stencil_location: 7870669 hindices1: 8000000 hindices2: 7870669 diff: 129331
done
I'm pretty sure the issue here is the use of the wrong intermediate type, and the lack of an inclusive_scan with init overload in Thrust (we have one in C++17 for just this reason). I guess all that work on the type requirements for the algorithms paid off in the long run :p (D0571r1 for reference). I was already revamping inclusive_scan to deal with intermediate types properly; I'm pretty sure that will fix this.
Tracked internally by nvbug 2004711.
Originally reported here: https://groups.google.com/d/msg/thrust-users/X7-FEDtKfBo/4wVMgfGgBgAJ
Here's a self-contained example showing a bug with the latest Thrust (I've tried both the one included with Cuda 7.5 RC and the latest from the master branch of the repo which included a recent fix for inclusive_scan): https://gist.github.com/eglaser77/756e5a9234cf0f08a3fb.
I build it with the command:
/usr/local/cuda/bin/nvcc -arch=sm_30 thrust_test.cu -o thrust_test -I/usr/local/cuda/include -g -L/usr/local/cuda/lib64/ -lcuda -lcudart
Basically I am trying to get the locations of 'true' values in a stencil. The first method uses thrust::inclusive_scan followed by thrust::upper_bound. It works with host vectors but fails when run with device vectors on the GPU. The second method does a thrust::copy_if and works fine. I get the same results on a Quadro K2100M and a GeForce GTX 750 Ti.
Here's the output I get (hindices1 are from the inclusive_scan/upper_bound method; hindices2 are from copy_if):
i: 0 stencil_location: 467508 hindices1: 467508 hindices2: 467508 i: 1 stencil_location: 1326441 hindices1: 1326441 hindices2: 1326441 i: 2 stencil_location: 1541662 hindices1: 1541662 hindices2: 1541662 i: 3 stencil_location: 1679866 hindices1: 1679866 hindices2: 1679866 i: 4 stencil_location: 2234773 hindices1: 2234773 hindices2: 2234773 i: 5 stencil_location: 2387355 hindices1: 2387355 hindices2: 2387355 i: 6 stencil_location: 2653762 hindices1: 2653762 hindices2: 2653762 i: 7 stencil_location: 3159732 hindices1: 3159732 hindices2: 3159732 i: 8 stencil_location: 3226888 hindices1: 3226888 hindices2: 3226888 i: 9 stencil_location: 3828014 hindices1: 3828014 hindices2: 3828014 i: 10 stencil_location: 3887644 hindices1: 3887644 hindices2: 3887644 i: 11 stencil_location: 3909417 hindices1: 3909417 hindices2: 3909417 i: 12 stencil_location: 3924245 hindices1: 3924245 hindices2: 3924245 i: 13 stencil_location: 4042273 hindices1: 4233776 hindices2: 4042273 i: 14 stencil_location: 4150580 hindices1: 4446033 hindices2: 4150580 i: 15 stencil_location: 4233776 hindices1: 4484984 hindices2: 4233776 i: 16 stencil_location: 4425058 hindices1: 4836990 hindices2: 4425058 i: 17 stencil_location: 4446033 hindices1: 5328271 hindices2: 4446033 i: 18 stencil_location: 4484984 hindices1: 5483482 hindices2: 4484984 i: 19 stencil_location: 4565655 hindices1: 5755194 hindices2: 4565655 i: 20 stencil_location: 4629464 hindices1: 5781566 hindices2: 4629464 i: 21 stencil_location: 4703190 hindices1: 5987753 hindices2: 4703190 i: 22 stencil_location: 4836990 hindices1: 8000000 hindices2: 4836990 i: 23 stencil_location: 4903165 hindices1: 8000000 hindices2: 4903165 i: 24 stencil_location: 4910365 hindices1: 8000000 hindices2: 4910365 i: 25 stencil_location: 5328271 hindices1: 8000000 hindices2: 5328271 i: 26 stencil_location: 5483482 hindices1: 8000000 hindices2: 5483482 i: 27 stencil_location: 5755194 hindices1: 8000000 hindices2: 5755194 i: 28 stencil_location: 5781566 hindices1: 8000000 hindices2: 5781566 i: 29 stencil_location: 5966710 hindices1: 8000000 hindices2: 5966710 i: 30 stencil_location: 5987753 hindices1: 8000000 hindices2: 5987753 i: 31 stencil_location: 7870669 hindices1: 8000000 hindices2: 7870669
The problem appears to be in the inclusive_scan call. When I examine the values I see that it is not strictly increasing as I would expect. Printing out where the scanned values change I get the following:
i: 467508 hscanned[i]: 1 i: 1326441 hscanned[i]: 2 i: 1541662 hscanned[i]: 3 i: 1679866 hscanned[i]: 4 i: 2234773 hscanned[i]: 5 i: 2387355 hscanned[i]: 6 i: 2653762 hscanned[i]: 7 i: 3159732 hscanned[i]: 8 i: 3226888 hscanned[i]: 9 i: 3828014 hscanned[i]: 10 i: 3887644 hscanned[i]: 11 i: 3909417 hscanned[i]: 12 i: 3924245 hscanned[i]: 13 i: 4008960 hscanned[i]: 11 i: 4042273 hscanned[i]: 12 i: 4150580 hscanned[i]: 13 i: 4233776 hscanned[i]: 14 i: 4276224 hscanned[i]: 13 i: 4425058 hscanned[i]: 14 i: 4446033 hscanned[i]: 15 i: 4484984 hscanned[i]: 16 i: 4543488 hscanned[i]: 14 i: 4565655 hscanned[i]: 15 i: 4629464 hscanned[i]: 16 i: 4677120 hscanned[i]: 15 i: 4703190 hscanned[i]: 16 i: 4836990 hscanned[i]: 17 i: 4903165 hscanned[i]: 18 i: 4910365 hscanned[i]: 19 i: 4944384 hscanned[i]: 17 i: 5328271 hscanned[i]: 18 i: 5483482 hscanned[i]: 19 i: 5755194 hscanned[i]: 20 i: 5781566 hscanned[i]: 21 i: 5879808 hscanned[i]: 20 i: 5966710 hscanned[i]: 21 i: 5987753 hscanned[i]: 22 i: 6013440 hscanned[i]: 21 i: 7870669 hscanned[i]: 22