src-d / kmcuda

Large scale K-means and K-nn implementation on NVIDIA GPU / CUDA
Other
797 stars 146 forks source link

cuda build issues on Windows #50

Open pavlexander opened 6 years ago

pavlexander commented 6 years ago

Hi,

I am having troubles building CUDA files.

Following prerequisites were done:

Errors are following:

  1. File: metric_abctraction.h Code:
    FPATTR static typename HALF<F>::type distance(
        F sqr1 __attribute__((unused)), F sqr2 __attribute__((unused)), F prod) {
        float fp = _float(_fin(prod));
        if (fp >= 1.f) return _half<F>(0.f);
        if (fp <= -1.f) return _half<F>(M_PI);
        return _half<F>(acos(fp));
    }

Error:

c:\users\lucky\source\repos\core\core3\metric_abstraction.h(172): error : expected a ")"

Comment: line with ((unused))

  1. File: metric_abctraction.h Code:
    
    FPATTR static void normalize(uint32_t count __attribute__((unused)), float *vec) {
        // Kahan summation with inverted c
        float norm = 0, corr = 0;
    #pragma unroll 4
        for (int f = 0; f < d_features_size; f++) {
            float v = vec[f];
            float y = _fma(corr, v, v);
            float t = norm + y;
            corr = y - (t - norm);
            norm = t;
        }
        norm = _reciprocal(_sqrt(norm));

pragma unroll 4

    for (int f = 0; f < d_features_size; f++) {
        vec[f] = vec[f] * norm;
    }
}
Error:
> c:\users\lucky\source\repos\core\core3\metric_abstraction.h(255): error : expected a ")"

Comment: 
line with `((unused))`

3. File: `wrappers.h`
Code:

template class unique_devptr : public unique_devptr_parent { public: explicit unique_devptr(T ptr, bool fake = false) : unique_devptr_parent( ptr, fake ? [](T) {} : [](T *p) { cudaFree(p); }) {} };


Error:

> 1>            c:\users\lucky\source\repos\core\core3\wrappers.h(20): error : more than one operator "?" matches these operands:
> 1>                        built-in operator "expression ? pointer : pointer"
> 1>                        built-in operator "expression ? pointer : pointer"
> 1>                        operand types are: lambda [](float *)->void : lambda [](float *)->void
> 1>                      detected during:
> 1>                        instantiation of "unique_devptr<T>::unique_devptr(T *, __nv_bool) [with T=float]"

Comment: 
line with `ptr, fake ? [](T*) {} : [](T *p) { cudaFree(p); }) {}`

4. File: metric_abstraction.h
Code:
FPATTR static float distance_t(const F *__restrict__ v1, const F *__restrict__ v2,
    uint64_t v1_size, uint64_t v1_index) {
    // Kahan summation with inverted c
    F prod = _const<F>(0), corr = _const<F>(0);

pragma unroll 4

    for (uint64_t f = 0; f < d_features_size; f++) {
        F yprod = _fma(corr, v1[v1_size * f + v1_index], v2[f]);
        F tprod = _add(prod, yprod);
        corr = _sub(yprod, _sub(tprod, prod));
        prod = tprod;
    }
    return _float(distance(_const<F>(1), _const<F>(1), prod));
}

Error:

> 1>            c:\users\lucky\source\repos\core\core3\metric_abstraction.h(203): error : no instance of overloaded function "METRIC<(KMCUDADistanceMetric)1, F>::distance [with F=float]" matches the argument list
> 1>                        argument types are: (float, float, float)
> 1>                      detected during:
> 1>                        instantiation of "float METRIC<(KMCUDADistanceMetric)1, F>::distance_t(const F *, const F *, uint64_t, uint64_t) [with F=float]"

Comment:
line with `return _float(distance(_const<F>(1), _const<F>(1), prod));`

Similar errors in same file:

> 1>            C:/Users/Lucky/source/repos/Core/Core3/kmeans.cu(269): error : no instance of overloaded function "METRIC<(KMCUDADistanceMetric)1, F>::distance [with F=float]" matches the argument list
> 1>                        argument types are: (float, float, float)
> 1>                      detected during instantiation of "void kmeans_assign_lloyd_smallc<M,F>(uint32_t, uint32_t, const F *, const F *, uint32_t *, uint32_t *) [with M=kmcudaDistanceMetricCosine, F=float]"
> 1>            (954): here
> 1>
> 1>            C:/Users/Lucky/source/repos/Core/Core3/kmeans.cu(342): error : no instance of overloaded function "METRIC<(KMCUDADistanceMetric)1, F>::distance [with F=float]" matches the argument list
> 1>                        argument types are: (float, float, float)
> 1>                      detected during instantiation of "void kmeans_assign_lloyd<M,F>(uint32_t, uint32_t, const F *, const F *, uint32_t *, uint32_t *) [with M=kmcudaDistanceMetricCosine, F=float]"
> 1>            (954): here

5. File: same
Code:

FPATTR static void normalize(uint32_t count attribute((unused)), float *vec) { // Kahan summation with inverted c float norm = 0, corr = 0;

pragma unroll 4

    for (int f = 0; f < d_features_size; f++) {
        float v = vec[f];
        float y = _fma(corr, v, v);
        float t = norm + y;
        corr = y - (t - norm);
        norm = t;
    }
    norm = _reciprocal(_sqrt(norm));

pragma unroll 4

    for (int f = 0; f < d_features_size; f++) {
        vec[f] = vec[f] * norm;
    }
}
Error:

> 1>            c:\users\lucky\source\repos\core\core3\metric_abstraction.h(260): error : identifier "vec" is undefined
> 1>                      detected during:
> 1>                        instantiation of "void METRIC<(KMCUDADistanceMetric)1, F>::normalize(uint32_t) [with F=float]"

6. File: kmeans.cu
Code:

template <KMCUDADistanceMetric M, typename F> global void kmeans_adjust( const uint32_t coffset, const uint32_t length, const F restrict samples, const uint32_t restrict assignments_prev, const uint32_t restrict assignments, F restrict centroids, uint32_t restrict ccounts) { uint32_t c = blockIdx.x blockDim.x + threadIdx.x; if (c >= length) { return; } c += coffset; uint32_t my_count = ccounts[c]; { F fmy_count = _const(my_count); centroids += c d_features_size; for (int f = 0; f < d_features_size; f++) { centroids[f] = _mul(centroids[f], fmy_count); } } extern shared uint32_t ass[]; int step = d_shmem_size / 2; F corr = _const(0); for (uint32_t sbase = 0; sbase < d_samples_size; sbase += step) { __syncthreads(); if (threadIdx.x == 0) { int pos = sbase; for (int i = 0; i < step && sbase + i < d_samples_size; i++) { ass[2 i] = assignments[pos + i]; ass[2 i + 1] = assignments_prev[pos + i]; } } __syncthreads(); for (int i = 0; i < step && sbase + i < d_samples_size; i++) { uint32_t this_ass = ass[2 i]; uint32_t prev_ass = ass[2 * i + 1]; int sign = 0; if (prev_ass == c && this_ass != c) { sign = -1; my_count--; } else if (prev_ass != c && this_ass == c) { sign = 1; my_count++; } if (sign != 0) { F fsign = _const(sign);

pragma unroll 4

    for (uint64_t f = 0; f < d_features_size; f++) {
      F centroid = centroids[f];
      F y = _fma(corr,
                 samples[static_cast<uint64_t>(d_samples_size) * f + sbase + i],
                 fsign);
      F t = _add(centroid, y);
      corr = _sub(y, _sub(t, centroid));
      centroids[f] = t;
    }
  }
}

} // my_count can be 0 => we get NaN with L2 and never use this cluster again // this is a feature, not a bug METRIC<M, F>::normalize(my_count, centroids); ccounts[c] = my_count; }


Error:

> 1>            C:/Users/Lucky/source/repos/Core/Core3/kmeans.cu(427): error : too many arguments in function call
> 1>                      detected during instantiation of "void kmeans_adjust<M,F>(uint32_t, uint32_t, const F *, const uint32_t *, const uint32_t *, F *, uint32_t *) [with M=kmcudaDistanceMetricCosine, F=float]"
> 1>            (1002): here

Comment: error on line `METRIC<M, F>::normalize(my_count, centroids);`

7. File: `tricks.cuh`
Code:

device forceinline uint32_t atomicAggInc(uint32_t *ctr) { int mask = ballot(1); int leader = __ffs(mask) - 1; uint32_t res; if ((threadIdx.x % warpSize) == leader) { res = atomicAdd(ctr, popc(mask)); } res = shfl(res, leader); return res + popc(mask & ((1 << (threadIdx.x % warpSize)) - 1)); }


Error:

> 1>                C:/Users/Lucky/source/repos/Core/Core3/tricks.cuh(31): error : identifier "shfl" is undefined

Fix for error 7: After I have added `#include "private.h"` into file - errors went away.. I wonder why wasn't this change committed? It seems like a very obvious fix and yet master branch does not include it.

Is CUDA project supposed to work at this moment? I would appreciate to hear out comments about errors that I have encountered.

Thanks in advance
vmarkovtsev commented 6 years ago

compiling c++ on windows

All the errors are obviously specific to MSVC++. The thing is, building this project has never been tested on Windows actually. I can add an Appveyor integration provided by somebody fixes these incompatibilities. Some of them are easy to fix, e.g. __attribute__, some of them are puzzling, e.g. the template errors. Anyway, I cannot do it myself in the upcoming months :(

pavlexander commented 6 years ago

I am not really c++ guy so lot's of code (95% of it) does not make any sense :)

Anyway, after hearing your answer - I started to dig into issues one by one, and resolved all of them.

Here are fixes that I have applied:

  1. Fix. kmcuda.cc Before:
    #if CUDA_ARCH < 60
    if (fp16x2) {
    INFO("CUDA device arch %d does not support fp16\n", CUDA_ARCH);
    return kmcudaInvalidArguments;
    }
    #endif
    if (props.major != (CUDA_ARCH / 10) || props.minor != (CUDA_ARCH % 10)) {
        INFO("compute capability mismatch for device %d: wanted %d.%d, have "
             "%d.%d\n>>>> you may want to build kmcuda with -DCUDA_ARCH=%d "
             "(refer to \"Building\" in README.md)\n",
             dev, CUDA_ARCH / 10, CUDA_ARCH % 10, props.major, props.minor,
             props.major * 10 + props.minor);
        devs.pop_back();

After:

/*#if CUDA_ARCH < 60
  if (fp16x2) {
    INFO("CUDA device arch %d does not support fp16\n", CUDA_ARCH);
    return kmcudaInvalidArguments;
  }
#endif*/
      /*if (props.major != (CUDA_ARCH / 10) || props.minor != (CUDA_ARCH % 10)) {
        INFO("compute capability mismatch for device %d: wanted %d.%d, have "
             "%d.%d\n>>>> you may want to build kmcuda with -DCUDA_ARCH=%d "
             "(refer to \"Building\" in README.md)\n",
             dev, CUDA_ARCH / 10, CUDA_ARCH % 10, props.major, props.minor,
             props.major * 10 + props.minor);
        devs.pop_back();
      }*/

Description: CUDA_ARCH variable is not available in host code. only CUDA can access it, so compile error was thrown.

  1. Fix. metric_abstraction.h Before:
    //
    // distance and normalization functions.
    //

After:

//
// distance and normalization functions.
//
#define _USE_MATH_DEFINES
#include <math.h>

Description: M_PI global variable was not available in scope of file, apparently library contains it. The fix shall be added at the beginning of file. This is very important, apparently.

  1. Fix. metric_abstraction.h Before:
    FPATTR static typename HALF<F>::type distance(
      F sqr1 __attribute__((unused)), F sqr2 __attribute__((unused)), F prod) {
    float fp = _float(_fin(prod));
    if (fp >= 1.f) return _half<F>(0.f);
    if (fp <= -1.f) return _half<F>(M_PI);
    return _half<F>(acos(fp));
    }

After

  FPATTR static typename HALF<F>::type distance(
      F sqr1, F sqr2, F prod) {
    float fp = _float(_fin(prod));
    if (fp >= 1.f) return _half<F>(0.f);
    if (fp <= -1.f) return _half<F>(M_PI);
    return _half<F>(acos(fp));
  }

Description: that's right, I just removed __attribute__((unused)). I have no idea what this attribute does, but, if isn't not used in any case - then how can an attribute make it worse? :) I don't know the consequences but many errors were fixed after this change. p.s. worth mentioning that I have removed all __attribute__((unused)) occurrences in the same file. Not just for 1 function.

  1. Fix. tricks.cuh Before:
    #include <cstdint>

    After:

    #include <cstdint>
    #include "private.h"

Description: some functions from private.h are used, so.. must include it.

  1. Fix. wrappers.h Before:
    
    template <typename T>
    class unique_devptr : public unique_devptr_parent<T> {
    public:
    explicit unique_devptr(T *ptr, bool fake = false) : unique_devptr_parent<T>(
      ptr, fake? [](T*){} : [](T *p){ cudaFree(p); }) {}
    };
After:

template class unique_devptr : public unique_devptr_parent { public: explicit unique_devptr(T ptr, bool fake = false) : unique_devptr_parent( ptr, (fake == true) ? (std::function<void(T)>)( {}) : [](T *p) { cudaFree(p); } ) {} };


Description:
Apparently - an explicit type conversion is needed. A discussion was brought up on [stack-overflow](https://stackoverflow.com/questions/38268299/no-user-defined-conversion-operator-available-that-can-perform-this-conversion) with similar issue. I have no idea what the problem is, but the fix is working as expected.

So, this seems to be it. I am yet to test if the code is working on GPU, but, at least the project gets compiled now.. If you have got any comments or concerns please share them :) 
pavlexander commented 6 years ago

p.s. code is workingon GPU just fine. Pull request will follow and possibly a separate wrapper project.