shader-slang / slang-python

Superseded by github.com/shader-slang/slang-torch
MIT License

Backward derivative too slow / incorrect #21

Closed by chinmay0301ucsd 3 months ago

chinmay0301ucsd commented 3 months ago

Hi, I wrote some code in Slang which seems to execute the forward pass correctly. But when I execute the backward pass, GPU utilization goes to 100% and the system hangs, especially when I try to print the derivative values. I am using slangpy as the Python interface to call the Slang kernel file (shown below).

static const int N = NUM_CTRL_PTS;
static const int c = DIM;
static const int N1 = c * (N - 1);

__generic<let C : int> 
struct MatrixG : IDifferentiable
{
    float vals[C];
}

// Binomial coefficient C(n, i), computed via the multiplicative formula.
int nCi(int n, int i) {
    if (i > n) return 0;
    if (i == 0 || i == n) return 1;
    if (i > n - i) i = n - i;

    int result = 1;
    for (int k = 1; k <= i; ++k) {
        result *= n - k + 1;
        result /= k;
    }

    return result;
}

// Factorial n!
int fact(int n) {
    int result = 1;
    for (int i = 1; i <= n; ++i) {
        result *= i;
    }
    return result;
}

[CudaDeviceExport]
[Differentiable]
matrix<float, N, c> compute_coeffs_device(DiffTensorView control_pts) {
    // Compute the coefficients a_i for t^i, for bezier polynomial \sum a_i . t^i
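    // With n = N - 1 (the curve degree), this is the standard Bernstein-to-
    // power-basis conversion:
    //   a_j = (n! / (n - j)!) * sum_{i=0..j} (-1)^(i+j) * P_i / (i! * (j - i)!)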
    matrix<float, N, c> coeffs;
    [MaxIters(c)]
    for (int k = 0; k < c; k++) {
        [MaxIters(N)]
        for (int j = 0; j < N; j++) {
            int nCj = fact(N - 1) / fact(N - 1 - j); // degree of the polynomial is N-1
            float sum = 0;
            [MaxIters(N)]
            for (int i = 0; i < N; i++) {
                if (i <= j) {
                    sum += pow(-1, i + j) * control_pts[i, k] / (fact(i) * fact(j - i));
                }
            }
            coeffs[j][k] = nCj * sum;
        }
    }
    return coeffs;
}

[CudaDeviceExport]
[Differentiable]
MatrixG<N1 * N1> assemble_matrix_sdf(matrix<float, N, c> coeffs) {
    // Function to create the matrix whose determinant is to be evaluated to get the sdf
    // coeffs: Tensor (N,c)
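    // Row k*(N-1)+i holds the dimension-k coefficient vector shifted right by
    // i columns; for c = 2 this is (if I read it right) the Sylvester-matrix
    // band structure, whose determinant is the resultant of the two
    // coordinate polynomials.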
    MatrixG<N1 * N1> mat;

    // Initializing
    [MaxIters(N1 * N1)]
    for (int i = 0; i < N1 * N1; i++)
        mat.vals[i] = 0.0;

    [MaxIters(N - 1)]
    for (int i = 0; i < N - 1; i++)
        [MaxIters(N)] // the j loop runs N times, so the bound must be N, not N - 1
        for (int j = 0; j < N; j++)
            [MaxIters(c)]
            for (int k = 0; k < c; k++)
            {
                mat.vals[(k * (N - 1) + i) * N1 + j + i] = coeffs[j][k];
            }
    return mat;
}

[AutoPyBindCUDA]
[CUDAKernel]
[Differentiable]
void bezier2D(DiffTensorView t, DiffTensorView control_pts, DiffTensorView output)
{
    // t (tensor Mx1) : indices between 0-1 to traverse across the Bezier curve
    // control_pts (Nx2): N - Degree of Bezier Curve 2D
    // Get the 'global' index of this thread.
    uint3 tIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();

    // If the thread index is beyond the input size, exit early.
    if (tIdx.x >= t.size(0))
        return;
    [MaxIters(N)] // loop runs N times (i = 0 .. N - 1 inclusive)
    for (int i = 0; i <= N - 1; i++)
    {
        output[tIdx.x, 0] = output[tIdx.x, 0] + nCi(N - 1, i) * pow((1 - t[tIdx.x]), (N - 1 - i)) * pow(t[tIdx.x], i) * control_pts[i, 0];
        output[tIdx.x, 1] = output[tIdx.x, 1] + nCi(N - 1, i) * pow((1 - t[tIdx.x]), (N - 1 - i)) * pow(t[tIdx.x], i) * control_pts[i, 1];
    }
}

[AutoPyBindCUDA]
[CUDAKernel]
[Differentiable]
void compute_coeffs(DiffTensorView control_pts, DiffTensorView output) {
    // Compute the coefficients a_i for t^i, for bezier polynomial \sum a_i . t^i
    matrix<float, N, c> coeffs = compute_coeffs_device(control_pts);
    for (int i = 0; i < N; i++)
        for (int j = 0; j < c; j++)
            output[i, j] = coeffs[i][j];
}

[AutoPyBindCUDA]
[CUDAKernel]
[Differentiable]
void bezier2DSDF(DiffTensorView xy, DiffTensorView control_pts, DiffTensorView output) {
    // xy - (M, c): points at which the SDF is evaluated
    // coeffs - (N, c)
    // output - (M, N1, N1): one matrix per point; its determinant gives the SDF
    // Each thread computes the SDF matrix for one xy coordinate using the
    // assembly function above. (Maybe change this to be just [Differentiable],
    // not [AutoPyBindCUDA].)
    uint3 tIdx = cudaThreadIdx() + cudaBlockIdx() * cudaBlockDim();
    matrix<float, N, c> coeffs = compute_coeffs_device(control_pts);

    int M = xy.size(0); // xy - shaped M,2
    if (tIdx.x >= M) {
        return;
    }

    float coord[c];
    for (int i = 0; i < c; i++)
        coord[i] = xy[tIdx.x, i];

    for (int i = 0; i < c; i++)
        coeffs[0][i] -= coord[i];

    MatrixG<N1 * N1> mat = assemble_matrix_sdf(coeffs);
    for (int i = 0; i < N1; i++)
        for (int j = 0; j < N1; j++)
            output[tIdx.x, i, j] = mat.vals[i*N1 + j];
}

In the Python code:

import torch
import slangpy

N = 6
c = 2 
m = slangpy.loadModule('bezier.slang', defines={"NUM_CTRL_PTS": N, "DIM":c})
num_pts = 1000
t = torch.linspace(0.0, 1, num_pts, dtype=torch.float).cuda()

control_pts = torch.rand((N, 2), dtype=torch.float).cuda()
control_pts[-1] = control_pts[0]

## Computing SDF for line function 
num_points = 1000  # for example, 1000 points along each axis

# Generate evenly spaced points between 0 and 1
px = torch.linspace(0.2, 1.0, num_points)
py = torch.linspace(0.5, 1.0, num_points)

# Create the meshgrid
x, y = torch.meshgrid(px, py.flip(dims=[0]), indexing='ij')
xy = torch.stack([x, y], dim=-1).view(-1, 2).cuda()
sdf_mats = torch.zeros(xy.shape[0], c*(N-1), c*(N-1)).cuda()

import time 

num_iters = 10
t0 = time.time()  # use t0 so the timer does not clobber the t tensor above
for i in range(num_iters):
    m.bezier2DSDF(xy=xy, control_pts=control_pts, output=sdf_mats).launchRaw(blockSize=(1024, 1, 1), gridSize=(1024, 1, 1))
    sdf_N = torch.linalg.det(sdf_mats)
    sdf_N = torch.sign(sdf_N) * torch.sqrt(torch.abs(sdf_N))
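# Note: CUDA kernel launches are asynchronous, so an accurate per-iteration
# timing would call torch.cuda.synchronize() before reading the clock.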
print((time.time() - t0)/num_iters)

## Computing Gradients
xy_grad = torch.zeros_like(xy).cuda()
control_pts_grad = torch.zeros_like(control_pts).cuda()
output_grad = torch.zeros_like(sdf_mats).cuda()
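# (If I understand slangpy's .bwd convention correctly, output_grad is the
# upstream adjoint that seeds the backward pass; an all-zeros seed would
# propagate zero gradients, so seeding with torch.ones_like is more typical
# when probing gradients.)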
m.bezier2DSDF.bwd(xy=(xy, xy_grad), control_pts=(control_pts, control_pts_grad), output=(sdf_mats, output_grad)).launchRaw(blockSize=(1,1,1), gridSize=(1,1,1))

print(control_pts_grad) # This is where things hang. 

When I call the .bwd function, it takes extremely long and returns all 0s and NaNs, even though the forward pass works correctly. My guess is that there is something wrong with how the derivatives are being chained through the MatrixG data structure.

saipraveenb25 commented 3 months ago

We discussed this off-channel, but I'll paste the solution here: for loops whose iteration count is known at compile time, use [ForceUnroll] instead of [MaxIters(N)] for a much more performant kernel.
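A minimal sketch of the change (hypothetical sumN helper just to show the attribute swap; N is a compile-time constant as in the snippet above):

[Differentiable]
float sumN(DiffTensorView vals)
{
    float s = 0.0;
    // [ForceUnroll] unrolls the loop at compile time, so the autodiff pass
    // does not have to record per-iteration state the way it does for a
    // [MaxIters(N)]-bounded dynamic loop.
    [ForceUnroll]
    for (int i = 0; i < N; i++)
        s += vals[i];
    return s;
}

The same swap applies to every [MaxIters(...)] loop in the kernels above, since N, c, and N1 are all static const.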

I just want to check that this resolves the extremely long run-time issue.

chinmay0301ucsd commented 3 months ago

Yes, that resolves the issue. Closing it.