zhanglin-1993 commented 7 years ago

Hello: I'm trying to realize p-norm version of the ASD-POCS, and complete it with CUDA. the matlab code works well while the CUDA code comes kinds of errors all the time. There maybe comes NaN voxels, which you said maybe the problem of eps, but it worked nothing when I set eps=0.0001, and maybe came the problem of the input parameter couldn't be the single one, and sometimes, it reminded me that the MinP wasn't a available function although I had compiled and got the mexw64 file in the proper path. could you please tell me where the problem is? the only change of minP.mexw64 to the minTV.mexw64 is an additional parameter p. Here is the related files.

OS-ASD-POCS: f=minimizeP(f0,dtvg,ng,p);

minimizeP.m:

if nargin==1
dtvg=1;ng=30;p=1;
else 
  if nargin == 4
      dtvg=varargin{1};
      ng=varargin{2};
      p=varargin{3};
else
    error('Wrogn amount of inputs');
end
end
img=(permute(img,[3 2 1]));
img=minP(img,dtvg,ng,p);
img=(permute(img,[3 2 1]));
end

minP.cpp


#include "tmwtypes.h"
#include "mex.h"
#include <math.h>
#include <cmath>
#include "matrix.h"
#include "POCS_P.hpp"
#include <string.h>
void mexFunction(int nlhs , mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
///////// First check if the amount of imputs is rigth.
int maxIter;
float alpha;
float p;
if (nrhs==1){
    maxIter=100;
    alpha=15.0f;
    p=1.0f;
}
if (nrhs==2){
    mexErrMsgIdAndTxt("err", "Only 1 POCS hyperparemter inputed");
}
if (nrhs==3){
    mexErrMsgIdAndTxt("err", "Only 2 POCS hyperparemter inputed");
}
if (nrhs>4){
    mexErrMsgIdAndTxt("err", "Too many imput argumets");
}
if (nrhs==4){
    size_t mrows = mxGetM(prhs[1]);
    size_t ncols = mxGetN(prhs[1]);
    if (mrows!=1 || ncols !=1)
            mexErrMsgIdAndTxt("err", "POCS parameters shoudl be 1x1");
    mrows = mxGetM(prhs[2]);
    ncols = mxGetN(prhs[2]);
    if (mrows!=1 || ncols !=1)
            mexErrMsgIdAndTxt("err", "POCS parameters shoudl be 1x1");
    mrows = mxGetM(prhs[3]);
    ncols = mxGetN(prhs[3]);
    if (mrows!=1 || ncols !=1)
            mexErrMsgIdAndTxt("err", "POCS parameters shoudl be 1x1");
    alpha= (float)(mxGetScalar(prhs[1]));
    maxIter=(int)floor(mxGetScalar(prhs[2])+0.5);
    p=(float)(mxGetScalar(prhs[3]));
}

// First input should be x from (Ax=b), or the image. mxArray const * const image = prhs[0]; mwSize const numDims = mxGetNumberOfDimensions(image);

// Image should be dim 3 if (numDims!=3){ mexErrMsgIdAndTxt("err", "Image is not 3D"); } // Now that input is ok, parse it to C data types. float const const imgaux = static_cast<float const >(mxGetData(image)); const mwSize *size_img= mxGetDimensions(image); //get size of image

float img = (float)malloc(size_img[0] size_img[1] size_img[2]* sizeof(float));

for(int i=0;i<size_img[0]size_img[1]size_img[2];i++) img[i]=(float)imgaux[i];

// Allocte output image float imgout = (float)malloc(size_img[0] size_img[1] size_img[2]* sizeof(float)); // call C function with the CUDA denoising

const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] }; pocs_p(img,imgout, alpha, imageSize, maxIter, p);

//prepareotputs plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL); float mxImgout =(float) mxGetPr(plhs[0]);

memcpy(mxImgout,imgout,size_img[0] size_img[1] size_img[2]*sizeof(float)); //free memory free(img); free(imgout); }

4. POCS_P.hpp:

ifndef POCS_P_HPP

define POCS_P_HPP

include "mex.h"

include "tmwtypes.h"

void pocs_p(const float img,float dst,float alpha,const long* image_size, int maxIter,float p);

endif


5. pocs_p.cu:

define MAXTHREADS 1024

include "POCS_P.hpp"

define cudaCheckErrors(msg) \

do { \ cudaError_t err = cudaGetLastError(); \ if (err != cudaSuccess) { \ mexPrintf("ERROR in: %s \n",msg);\ mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\ } \ } while (0)

// CUDA kernels //https://stackoverflow.com/questions/21332040/simple-cuda-kernel-optimization/21340927#21340927 global void divideArrayScalar(float vec,float scalar,const size_t n) { unsigned long long i = (blockIdx.x blockDim.x) + threadIdx.x; for(; i<n; i+=gridDim.xblockDim.x) { vec[i]/=scalar; } } global void multiplyArrayScalar(float vec,float scalar,const size_t n) { unsigned long long i = (blockIdx.x blockDim.x) + threadIdx.x; for(; i<n; i+=gridDim.xblockDim.x) { vec[i]=scalar; } } global void substractArrays(float vec,float vec2,const size_t n) { unsigned long long i = (blockIdx.x blockDim.x) + threadIdx.x; for(; i<n; i+=gridDim.x*blockDim.x) { vec[i]-=vec2[i]; } }

__device__ __inline__
        void gradient(const float* u, float* grad,
        long z, long y, long x,
        long depth, long rows, long cols)
{
    unsigned long size2d = rows*cols;
    unsigned long long idx = z * size2d + y * cols + x;

    float uidx = u[idx];

    if ( z - 1 >= 0 && z<depth) {
        grad[0] = (uidx-u[(z-1)*size2d + y*cols + x]) ;
    }

    if ( y - 1 >= 0 && y<rows){
        grad[1] = (uidx-u[z*size2d + (y-1)*cols + x]) ;
    }

    if ( x - 1 >= 0 && x<cols) {
        grad[2] = (uidx-u[z*size2d + y*cols + (x-1)]);
    }
}

__global__ void gradientTV(const float* f, float* dftv,
        long depth, long rows, long cols, float p){
    unsigned long x = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned long y = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned long z = threadIdx.z + blockIdx.z * blockDim.z;
    unsigned long long idx = z * rows * cols + y * cols + x;
    if ( x >= cols || y >= rows || z >= depth )
        return;

    float df[3] ={0,0,0};
    float dfi[3]={0,0,0}; // dfi== \partial f_{i+1,j,k}
    float dfj[3]={0,0,0};
    float dfk[3]={0,0,0};
    gradient(f,df  ,z  ,y  ,x  , depth,rows,cols);
    gradient(f,dfi ,z  ,y  ,x+1, depth,rows,cols);
    gradient(f,dfj ,z  ,y+1,x  , depth,rows,cols);
    gradient(f,dfk ,z+1,y  ,x  , depth,rows,cols);
    float eps=0.0001; //% avoid division by zero
    dftv[idx]=(df[0]+df[1]+df[2])*((pow(df[0] *df[0] +df[1] *df[1] +df[2] *df[2],(p-2)/2))+eps)
    -dfi[2]*((pow(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2],(p-2)/2))+eps)  
    -dfj[1]*((pow(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2],(p-2)/2))+eps)
    -dfk[0]*((pow(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2],(p-2)/2))+eps);

}

__device__ void warpReduce(volatile float *sdata, size_t tid) {
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}

__global__ void  reduceNorm2(float *g_idata, float *g_odata, size_t n){
    extern __shared__ volatile float sdata[];
    //http://stackoverflow.com/a/35133396/1485872
    size_t tid = threadIdx.x;
    size_t i = blockIdx.x*blockDim.x + tid;
    size_t gridSize = blockDim.x*gridDim.x;
    float mySum = 0;
    float value=0;
    while (i < n) {
        value=g_idata[i]; //avoid reading twice
        mySum += value*value;
        i += gridSize;
    }
    sdata[tid] = mySum;
    __syncthreads();

    if (tid < 512)
        sdata[tid] += sdata[tid + 512];
    __syncthreads();
    if (tid < 256)
        sdata[tid] += sdata[tid + 256];
    __syncthreads();

    if (tid < 128)
        sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid <  64)
        sdata[tid] += sdata[tid + 64];
    __syncthreads();

if (__CUDA_ARCH__ >= 300)

    if ( tid < 32 )
    {
        mySum = sdata[tid] + sdata[tid + 32];
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            mySum += __shfl_down(mySum, offset);
        }
    }

else

    if (tid < 32) {
        warpReduce(sdata, tid);
        mySum = sdata[0];
    }

endif

    if (tid == 0) g_odata[blockIdx.x] = mySum;
}
__global__ void  reduceSum(float *g_idata, float *g_odata, size_t n){
    extern __shared__ volatile float sdata[];
    //http://stackoverflow.com/a/35133396/1485872
    size_t tid = threadIdx.x;
    size_t i = blockIdx.x*blockDim.x + tid;
    size_t gridSize = blockDim.x*gridDim.x;
    float mySum = 0;
   // float value=0;
    while (i < n) {
        mySum += g_idata[i];
        i += gridSize;
    }
    sdata[tid] = mySum;
    __syncthreads();

    if (tid < 512)
        sdata[tid] += sdata[tid + 512];
    __syncthreads();
    if (tid < 256)
        sdata[tid] += sdata[tid + 256];
    __syncthreads();

    if (tid < 128)
        sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid <  64)
        sdata[tid] += sdata[tid + 64];
    __syncthreads();

if (__CUDA_ARCH__ >= 300)

    if ( tid < 32 )
    {
        mySum = sdata[tid] + sdata[tid + 32];
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            mySum += __shfl_down(mySum, offset);
        }
    }

else

    if (tid < 32) {
        warpReduce(sdata, tid);
        mySum = sdata[0];
    }

endif

    if (tid == 0) g_odata[blockIdx.x] = mySum;
}

// main function void pocs_p(const float img,float dst,float alpha,const long* image_size, int maxIter,float p){

    size_t total_pixels = image_size[0] * image_size[1]  * image_size[2] ;
    size_t mem_size = sizeof(float) * total_pixels;

    float *d_image, *d_dimgTV,*d_norm2aux,*d_norm2;
    // memory for image
    cudaMalloc(&d_image, mem_size);
    cudaCheckErrors("Malloc Image error");
    cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice);
    cudaCheckErrors("Memory Malloc and Memset: SRC");
    // memory for df
    cudaMalloc(&d_dimgTV, mem_size);
    cudaCheckErrors("Memory Malloc and Memset: TV");

    cudaMalloc(&d_norm2, mem_size);
    cudaCheckErrors("Memory Malloc and Memset: TV");

    // memory for L2norm auxiliar
    cudaMalloc(&d_norm2aux, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS);
    cudaCheckErrors("Memory Malloc and Memset: NORMAux");

    // For the gradient
    dim3 blockGrad(10, 10, 10);
    dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (image_size[2]+blockGrad.z-1)/blockGrad.z);

    // For the reduction
    float sumnorm2;

    for(unsigned int i=0;i<maxIter;i++){

        // Compute the gradient of the TV norm
        gradientTV<<<gridGrad, blockGrad>>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0],p);
        cudaCheckErrors("Gradient");

        cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice);
        cudaCheckErrors("Copy from gradient call error");
        // Compute the L2 norm of the gradint. For that, reduction is used.
        //REDUCE
        size_t dimblockRed = MAXTHREADS;
        size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS;
        reduceNorm2 << <dimgridRed, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_norm2, d_norm2aux, total_pixels);
        cudaCheckErrors("reduce1");
        if (dimgridRed > 1) {
            reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_norm2aux, d_norm2, dimgridRed);
            cudaCheckErrors("reduce2");
            cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemcpy");

        }
        else {
            cudaMemcpy(&sumnorm2, d_norm2aux, sizeof(float), cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemcpy");
        }
        //mexPrintf("%f ",sqrt(sumnorm2));
        //NOMRALIZE
        //in a Tesla, maximum blocks =15 SM * 4 blocks/SM
        divideArrayScalar  <<<60,MAXTHREADS>>>(d_dimgTV,sqrt(sumnorm2),total_pixels);
        cudaCheckErrors("Division error");
        //MULTIPLY HYPERPARAMETER
        multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha,   total_pixels);
        cudaCheckErrors("Multiplication error");
        //SUBSTRACT GRADIENT
        substractArrays    <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels);
        cudaCheckErrors("Substraction error");
        sumnorm2=0;
    }

    cudaCheckErrors("TV minimization");

    cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost);
    cudaCheckErrors("Copy result back");

    cudaFree(d_image);
    cudaFree(d_norm2aux);
    cudaFree(d_dimgTV);
    cudaFree(d_norm2);

    cudaCheckErrors("Memory free");

}

zhanglin-1993 commented 7 years ago

Here is the related code, include the matlab code: gradientpnorm.m and other 4 cuda codes. minp.zip

AnderBiguri commented 7 years ago

Hi @ilanzhang . I will try to have a look at this, but it is very likely I will not be able to until January.

If you want to debug it to try to find where NaNs appear, the best way would be printing some values of the image aftear each step in the cuda code, and triying to see which line exactly has the NaNs. That will help pinpoint the source of errors.

zhanglin-1993 commented 6 years ago

Thanks for your reply. I'll try it after finishing my job.

AnderBiguri commented 6 years ago

@ilanzhang any updates on this?

zhanglin-1993 commented 6 years ago

@AnderBiguri have been using the matlab version, and the cuda version haven't been fixed, i don't know where the wrong is.

AnderBiguri commented 6 years ago

@ilanzhang hum, it may be helpful if you show me the maths if you still need help. The TV norm minimization is in the end defined by a derivation of the discrete gradient of the TV norm. Do you have a generic solution for the discrete gradient of the P norm?

zhanglin-1993 commented 6 years ago

@AnderBiguri Here is the file. Thanks for your help! J20170702_p-norm calculation - to AnderBiguri.docx

AnderBiguri commented 6 years ago

Hi @ilanzhang,

Thanks for this! Interesting approach. You mention that you have submitted a paper based on this? If that is the case, then the best would be to hopefully get that accepted, then we can have a look at how to include it in TIGRE or properly make the CUDA work, if that is OK with you!

zhanglin-1993 commented 5 years ago

@AnderBiguri very glad that the paper been accepted. if there is time for you, could you add the above code to work in CUDA?

zhanglin-1993 commented 5 years ago

@AnderBiguri and another thing confusing me is that: I use matlab 2018b, VS2015 and CUDA 9.0 in Win7 ultimate now. after compile, it shows that VS2015 was comfigured for the compilation of C, while what show next is, error using mex, can not find the compiler supporting. Do you know what's wrong here?

AnderBiguri commented 5 years ago

@ilanzhang Congratulations! Could you send me the code by email and a small demo of reconstruction using it? That would help me greatly, as if you share it I plan to modify it to be able to run on multiple GPUs.

I'd say the most likely case of your error is that you did not install the C++ compiler when installing VS2015, its not set up by default. Here you have a detailed installation and debugging help: https://github.com/CERN/TIGRE/blob/frontispiece/Frontispiece/MATLAB_installation.md

AnderBiguri commented 4 years ago

@ilanzhang I know this is super late, but just letting you know I have not forgotten! Unfortunately implementing this for me in CUDA is something that is too much to take right now, so I am not sure when I will be able to do it.

zhanglin-1993 commented 4 years ago

@AnderBiguri That's so nice of you, I'm sure this part will not only useful for me, but also useful for others!

zhanglin-1993 commented 4 years ago

@AnderBiguri there's a small suggestion here: when you add those part some day, if the regularization part could be designed to be combined freely, for example, I could apply the L1 norm of the gradient, or L2 norm of the gradient, or L0.8 norm of the gradient, and even I can choose the (L1-L2) norm of the garient. there're some new theories for regularization those years, its expansibility will be better for TIGRE if it's designed so.

CERN / TIGRE

modifying minTV to minP when calculating p norm of the magnitude gradient images #67

ifndef POCS_P_HPP

define POCS_P_HPP

include "mex.h"

include "tmwtypes.h"

endif

define MAXTHREADS 1024

include "POCS_P.hpp"

define cudaCheckErrors(msg) \

if (__CUDA_ARCH__ >= 300)

else

endif

if (__CUDA_ARCH__ >= 300)

else

endif