Open ProkopHapala opened 1 year ago
I skethed the tricubic interpolation krenell here (it is using image read_imagef()
, maybe would be better version with buffer?)
float8 hspline_basis( float x ){
// hermite spline with derivatives https://en.wikipedia.org/wiki/Cubic_Hermite_spline
float x2 = x*x;
float K = x2*(x - 1);
float d0 = K - x2 + x; // x3 - x2
float d1 = K ; // x3 - 2*x2 + x
return (float8){
// ------ values
d0*-0.5,
2*K - x2 + 1 - d1*-0.5, // 2*x3 - 3*x2 + 1
-2*K + x2 + d0* 0.5, // -2*x3 + 3*x2
d0*0.5,
// ----- derivatives
d0*-0.5,
2*K - d1*-0.5, // 6*x2 - 6*x
-2*K + d0* 0.5, // -6*x2 + 6*x
d0*0.5
};
}
float4 interpolate_tricubic( __read_only image3d_t im, float4 p0 ){
float4 dx=(float4){1.f,0.f,0.f,0.f};
float4 dy=(float4){0.f,1.f,0.f,0.f};
float4 dz=(float4){0.f,0.f,1.f,0.f};
float4 iu; float4 u = fract(p0,&iu);
float8 cx = hspline_basis(u.x);
float8 cy = hspline_basis(u.y);
float4 p;
p=iu -dz -dy; float2 S00= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu -dz ; float2 S01= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu -dz +dy; float2 S02= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu -dz+dy+dy; float2 S03= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
float3 S0 = S00.xyx*cy.s04.xxy + S01.xyx*cy.s15.xxy + S02.xyx*cy.s26.xxy + S03.xyx*cy.s37.xxy;
p=iu -dy; float2 S10= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu ; float2 S11= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dy; float2 S12= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dy+dy; float2 S13= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
float3 S1 = S10.xyx*cy.s04.xxy + S11.xyx*cy.s15.xxy + S12.xyx*cy.s26.xxy + S13.xyx*cy.s37.xxy;
p=iu +dz -dy; float2 S20= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dz ; float2 S21= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dz -dy; float2 S22= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu +dz+dy+dy; float2 S23= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
float3 S2 = S20.xyx*cy.s04.xxy + S21.xyx*cy.s15.xxy + S22.xyx*cy.s26.xxy + S23.xyx*cy.s37.xxy;
p=iu+dz+dz -dy; float2 S30= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu+dz+dz ; float2 S31= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu+dz+dz +dy; float2 S32= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
p=iu+dz+dz+dy+dy; float2 S33= read_imagef(im,samp0,p-dx ).x*cx.s04 + read_imagef(im,samp0,p).x*cx.s15 + read_imagef(im,samp0,p+dx).x*cx.s26 + read_imagef(im,samp0,p+dx+dx).x*cx.s37;
float3 S3 = S30.xyx*cy.s04.xxy + S31.xyx*cy.s15.xxy + S32.xyx*cy.s26.xxy + S33.xyx*cy.s37.xxy;
float8 cz = hspline_basis(u.z);
return S0.xyzx*cz.s04.xxxy + S1.xyzx*cz.s15.xxxy + S2.xyzx*cz.s26.xxxy + S3.xyzx*cz.s37.xxxy;
}
I have written it here: https://github.com/ProkopHapala/FireCore/blob/3df8cd1755bc59b91ec29b73217f65aadba379e1/cpp/common_resources/cl/relax_multi.cl#L933
I'm not sure if it is working properly, and I don't have time to test it now. I don't know if @NikoOinonen have time and would be interested to test it soon? If not, that is fine, I will do it once I have more time.
I definitely think this would be interesting to try out, but probably I won't have the time for it any time soon.
I was wondering about the difference in access time between the buffer and image types, so I did some simple tests.
I wrote some simple kernels to square all the values in a 3D array both using the buffer and the image types:
__constant sampler_t sampler_1 = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void square_buffer_3d(
__global float *in,
__global float *out,
int4 nGrid
) {
int3 iL = (int3)(get_local_id(0), get_local_id(1), get_local_id(2));
int3 nG = (int3)(get_global_size(0), get_global_size(1), get_global_size(2));
int nyz = nGrid.y * nGrid.z;
int nz = nGrid.z;
int ind = iL.x * nyz + iL.y * nz + iL.z;
for (int i = iL.x; i < nGrid.x; i += nG.x) {
for (int j = iL.y; j < nGrid.y; j += nG.y) {
for (int k = iL.z; k < nGrid.z; k += nG.z) {
int ind = i * nyz + j * nz + k;
float x = in[ind];
out[ind] = x * x;
}
}
}
}
__kernel void square_image_3d(
__read_only image3d_t in,
__write_only image3d_t out,
int4 nGrid
) {
int3 iL = (int3)(get_local_id(0), get_local_id(1), get_local_id(2));
int3 nG = (int3)(get_global_size(0), get_global_size(1), get_global_size(2));
int nyz = nGrid.y * nGrid.z;
int nz = nGrid.z;
int ind = iL.x * nyz + iL.y * nz + iL.z;
for (int i = iL.x; i < nGrid.x; i += nG.x) {
for (int j = iL.y; j < nGrid.y; j += nG.y) {
for (int k = iL.z; k < nGrid.z; k += nG.z) {
int4 coord = (int4)(i, j, k, 0);
float4 x = read_imagef(in, sampler_1, coord);
write_imagef(out, coord, x * x);
}
}
}
}
Comparing the execution times on a 300x300x300 array, I found that the buffer version takes ~7ms and the image version takes ~1.8ms. So the image read/write functions seem to be much faster in this case.
However, for this simple case you could significantly simplify the buffer version by just using 1D indexing:
__kernel void square_buffer_1d(
__global float *in,
__global float *out,
int nGrid
) {
int iG = get_global_id(0);
int nG = get_global_size(0);
for (int i = iG; i < nGrid; i += nG) {
float x = in[i];
out[i] = x * x;
}
}
This one takes just ~1.3ms, which is faster still. If you try to do the same for the image type, you need to do a lot more arithmetic for the indices since the image type does not allow 1D indexing of multi-dimensional arrays:
__kernel void square_image_1d(
__read_only image3d_t in,
__write_only image3d_t out,
int4 nGrid
) {
int iG = get_global_id(0);
int nG = get_global_size(0);
int nxy = nGrid.y * nGrid.x;
int nxyz = nxy * nGrid.z;
for (int i = iG; i < nxyz; i += nG) {
int4 coord = (int4)(
i % nGrid.x,
(i / nGrid.x) % nGrid.y,
i / nxy,
0
);
float4 x = read_imagef(in, sampler_1, coord);
write_imagef(out, coord, x * x);
}
}
This one is much slower at ~5.3ms.
So I think the conclusion is that whether you should use the buffer or the image type depends on how you index the array. For the case of interpolation, we need to do quite a bit of arithmetic to get the interpolation coordinates either way, so I would imagine that the image type is faster simply for the access time.
However, for this simple case you could significantly simplify the buffer version by just using 1D indexing: This one takes just ~1.3ms, which is faster still. If you try to do the same for the image type, you need to do a lot more arithmetic for the indices since the image type does not allow 1D indexing of multi-dimensional arrays: So I think the conclusion is that whether you should use the buffer or the image type depends on how you index the array. For the case of interpolation, we need to do quite a bit of arithmetic to get the interpolation coordinates either way, so I would imagine that the image type is faster simply for the access time.
I think for sequential access there is not point using image. The question is random access (sampling at random location).
But when doing linear interpolation you can read 2 values sequentially and if doing cubic interpolation you can read 4 values sequentially
like this
float4 vals = (float4){ grid[ix+yz], grid[ix+1+yz],grid[ix+2+yz],grid[ix+3+yz] };
and than I'm not sure what is faster.
I was also thinking, is there actually a need to access the global memory so often? The probe doesn't move in very big steps, so most of the interpolated coordinates in a sequence fall between the same grid points, so in practice you are reading the same values from the global memory many times over.
If you stored the values from some particular 4x4x4 (or 2x2x2 for linear) grid points into shared memory, you should be able to reuse those many times before reading from global memory again. Moreover, adjacent scan positions overlap in the grid points that they read, so there should be some way of dividing the scans into blocks that execute in parallel, using the same shared memory values for faster access.
I was also thinking, is there actually a need to access the global memory so often? The probe doesn't move in very big steps, so most of the interpolated coordinates in a sequence fall between the same grid points, so in practice you are reading the same values from the global memory many times over.
If you stored the values from some particular 4x4x4 (or 2x2x2 for linear) grid points into shared memory, you should be able to reuse those many times before reading from global memory again. Moreover, adjacent scan positions overlap in the grid points that they read, so there should be some way of dividing the scans into blocks that execute in parallel, using the same shared memory values for faster access.
Yes, I think this is very good idea. I had the same idea some times ago, but consider it too complicated.
Currently in firecore I need to sample the grid at many positions (since I have big molecules interacting with the substrate), so this will not help to FireCore, but for ppafm it can give further speedup
I was also thinking, is there actually a need to access the global memory so often? The probe doesn't move in very big steps, so most of the interpolated coordinates in a sequence fall between the same grid points, so in practice you are reading the same values from the global memory many times over. If you stored the values from some particular 4x4x4 (or 2x2x2 for linear) grid points into shared memory, you should be able to reuse those many times before reading from global memory again. Moreover, adjacent scan positions overlap in the grid points that they read, so there should be some way of dividing the scans into blocks that execute in parallel, using the same shared memory values for faster access.
One more comment: if cubic interpolation allows larger grid-step this may be even more viable (you will cross the Voxel boundary less often)
but consider it too complicated.
For the linear interpolation case it might actually be very simple. You need just 2x2x2 = 8 numbers, which is few enough that it should just fit into the registers for the single thread, so you would not need to use shared memory at all.
For cubic 4x4x4 = 64 might not fit into the registers, because I was reading somewhere that for optimal performance you only get up to 32 registers per thread. And the compiler in the end decides what goes into registers what doesn't.
I was reading somewhere that for optimal performance you only get up to 32 registers per thread.
usefull information, I did not know that. This is maybe reason why my other kernels for classical molecular dynamics are less porformant than I expected.
Thre is something about it:
I cannot say I read and understant it completely, but is seems number of registers per thread depends on the number of threads per wrap (i.e. work_group_size
in OpenCL terminology)
the number of registers used by a kernel can have a significant impact on the number of resident warps. For example, for devices of compute capability 6.x, if a kernel uses 64 registers and each block has 512 threads and requires very little shared memory, then two blocks (i.e., 32 warps) can reside on the multiprocessor since they require 2x512x64 registers, which exactly matches the number of registers available on the multiprocessor. But as soon as the kernel uses one more register, only one block (i.e., 16 warps) can be resident since two blocks would require 2x512x65 registers, which are more registers than are available on the multiprocessor. Therefore, the compiler attempts to minimize register usage while keeping register spilling (see Device Memory Accesses) and the number of instructions to a minimum. Register usage can be controlled using the maxrregcount compiler option or launch bounds as described in Launch Bounds.
I cannot say I read and understant it completely, but is seems number of registers per thread depends on the number of threads per wrap (i.e. work_group_size in OpenCL terminology)
Warp is not the same as work group. A "work group" in OpenCL is the same as a "block" in Cuda. As far as I understand, a warp is something that is defined on hardware level, and on Nvidia it's always 32 threads in a warp, and the threads in the same warp always execute in parallel in exact lock step (which in some cases can be used to simplify the logic by not requiring thread syncs.) . If I recall "warp" is actually Nvidia specific term, and AMD has some other equivalent term on their hardware. I don't know if OpenCL has any equivalent of these since it's supposed to be hardware-agnostic.
I think I got that 32 registers per thread from here: https://forums.developer.nvidia.com/t/whats-the-max-register-number-that-causes-slowdown/234969. But that's speaking about a specific device, so it could depend on the device how many registers you can use while keeping the maximum number of threads active. There is some maximum number of registers per SM (= streaming multiprocessor), and each SM can execute some maximum number of threads at a time. So optimally (max number of threads per SM) x (registers per thread) = (max registers per SM).
The Cuda programming guide says that on Nvidia devices the hard maximum number of registers per thread is actually 255: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability. But it's up to the compiler to decide how many registers it considers optimal. In the Cuda compiler there seems to be this -maxrregcount
option for manually setting a maximum. Not sure if something similar exists in the OpenCL compilers.
Do you think that this would be significantly more efficient if we use cuda? Or is it just about setting workgroup size properly?
There are probably some things that you can optimize more for cuda, but I don't see why there would be a really significant jump in performance, and then we would restrict our code only to work on Nvidia GPUs. So personally, I would stick with OpenCL.
The Cuda programming guide says that on Nvidia devices the hard maximum number of registers per thread is actually 255: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability. But it's up to the compiler to decide how many registers it considers optimal. In the Cuda compiler there seems to be this
-maxrregcount
option for manually setting a maximum. Not sure if something similar exists in the OpenCL compilers.
There is also something:
2.6.2.2 Specifying the Default Work-Group Size at Compile-Time The number of registers used by a work-item is determined by the compiler on compile time. The user later specifies the size of the work-group. Ideally, the OpenCL compiler knows the size of the work-group at compile-time, so it can make optimal register allocation decisions. Without knowing the work-group size, the compiler must assume an upper-bound size to avoid allocating more registers in the work-item than the hardware actually contains. OpenCL provides a mechanism to specify a work-group size that the compiler can use to optimize the register allocation. In particular, specifying a smaller work-group size at compile time allows the compiler to allocate more registers for each kernel, which can avoid spill code and improve performance. The kernel attribute syntax is:
__attribute__((reqd_work_group_size(X, Y, Z)))
Section 6.7.2 of the OpenCL specification explains the attribute in more detail
https://www.amd.com/system/files/TechDocs/AMD_OpenCL_Programming_Optimization_Guide.pdf
From the documentation, A100 SM has 64K = 65536 registers, 2048 threads per SM, and the number of registers per thread is limited to 255. So, to have 2048 threads per SM, they must use only up to 32 registers. If you only have 256 threads per SM, they can use all 255 registers.
So I see the optimization issue:
you can use quite a lot of register, but at the cost of using less threads (some treads being idle).
Speaking about this, it would be quite nice to be able to write some non-uniform kernell, meaning that part of threads in the workgroup would do one sub-program, other part other sub-program, and at the end of the kernell they will synchoronize using local memory. Sure you can do it right now with if:
if( get_local_id(0) > 1 ){ tastk1(); }else{tastk2();}
but that will not save any registers I guess. It would be nice to be able to tell compiler that tastk1();
use many registers, and tastk2();
little registers.
Or maybe the compiler will figure this out?
I believe compiler will try to allocate registers evently (unifromly) between all threads ?
In particular, specifying a smaller work-group size at compile time allows the compiler to allocate more registers for each kernel, which can avoid spill code and improve performance. The kernel attribute syntax is:
__attribute__((reqd_work_group_size(X, Y, Z)))
This looks useful. We could pretty easily just try putting these things into all of our kernels and see if we get any performance improvements. I think we use group size 32 for almost all of the kernels by default. And you could still make the group size modifiable by the fact that we use pyopencl, so we could just substitute the group size into the source code in Python before compilation.
I believe compiler will try to allocate registers evently (unifromly) between all threads ?
As far as I understand (which is not that far...), every kernel has some fixed number of registers per thread, so that kind of non-uniform kernel is probably not optimal. If the two tasks are completely independent of one another, then I think you could divide it into two kernels and execute them in parallel? And then have a third kernel that gathers the result.
We could pretty easily just try putting these things into all of our kernels and see if we get any performance improvements. I think we use group size 32 for almost all of the kernels by default. And you could still make the group size modifiable by the fact that we use pyopencl, so we could just substitute the group size into the source code in Python before compilation.
yes, exactly that is what I was thinking about
If the two tasks are completely independent of one another, then I think you could divide it into two kernels and execute them in parallel? And then have a third kernel that gathers the result.
Yes, but what I was thinking that the threads doing different job would be able to synchronize over local memory rather than global memory. But probably that is not possible.
@NikoOinonen
If you want to implement tri-cubic interpolation maybe it would be useful to have a look on box-spline (as an alternativeto common tensor-product-base generalization of 1D splines to 3D)
This article actually show box-spline based interpolation on rectangular grid with less numerical artifacts than tensor-product based cubic interpoaltions (Fig. 7,8,9,10) https://sci-hub.se/https://ieeexplore.ieee.org/document/4015500
here is something general about box-splines (but considering mostly hexagonal grids) https://geom.ivd.kit.edu/downloads/pubs/pub-boehm-prautzsch_2002a.pdf
There is presentation related to the article http://vis.computer.org/vis2006/Vis2006/Papers/extensions_of_the_zwart_powell.pdf https://sci-hub.se/https://ieeexplore.ieee.org/document/4015500
The support of the tri-cubic B-spline is a 4×4×4 The support of this (Zwart-Powell) box spline is contained in 5×5×5 neighborhood. Originally we considered this box spline to be slower It turns out that only 53 points fall inside the support. (with respect to 64 for tensor-product 3D B-spline)
Here are some my notes to programing tricubic interpolation of GPU in FireCore
In short - There are 3 choices
I think it would be nice to try cubic interpolation.
On one hand it would require 64 (4x4x4) reads from memory rather than 8 (2x2x2) of tri-linear
but the advantage is that it would not need to calculate and save forces because C1 smooth forces can be calculated analytically from the cubic spline derivative. Therefore the arrays would be smaller (just
float
rather thanfloat4
).Further reduction of data-size will be probably achieved due to using less dense sampling step because cubic spline provides sufficient accuracy on larger range than linear spline.
Maybe the cost of global memory read can be reduced if we read together all the values which are in the same cashe-line. Using buffer it would look something like this:
but I'm not sure if it is really better to use buffer rather than image, because I have feeling texture reads are faster than buffer reads.
I was asking about this topic on stackoverflow, but did not get any uselful answer yet. https://stackoverflow.com/questions/75966359/efficient-tricubic-interpolation-of-3d-image-on-gpu-using-opencl
Originally posted by @ProkopHapala in https://github.com/Probe-Particle/ppafm/issues/142#issuecomment-1505436857