FitzShen666 commented 2 years ago

在使用 pip install ncnnqat 的时候会产生如下报错： File "setup.py", line 3, in <module> from torch.utils.cpp_extension import BuildExtension, CUDAExtension ModuleNotFoundError: No module named 'torch'

但 log 中显示的 python 路径和版本对应的环境里确实已经安装了 torch

若直接对该版本进行编译，则会出现下述问题：

make[1]: Entering directory '/root/ncnnqat' NVCC src/fake_quantize.cu nvcc -std=c++14 -ccbin=g++ -Xcompiler -fPIC -use_fast_math -DNDEBUG -O3 -I./ -I/usr/local/cuda/include -I/opt/conda/include/python3.7m -I/opt/conda/lib/python3.7/site-packages/torch/include -I/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/opt/conda/lib/python3.7/site-packages/torch/include/TH -I/opt/conda/lib/python3.7/site-packages/torch/include/THC -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=0 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -M src/fake_quantize.cu -o obj/cuda/fake_quantize.d \ -odir obj/cuda nvcc -std=c++14 -ccbin=g++ -Xcompiler -fPIC -use_fast_math -DNDEBUG -O3 -I./ -I/usr/local/cuda/include -I/opt/conda/include/python3.7m -I/opt/conda/lib/python3.7/site-packages/torch/include -I/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/opt/conda/lib/python3.7/site-packages/torch/include/TH -I/opt/conda/lib/python3.7/site-packages/torch/include/THC -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=0 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -c src/fake_quantize.cu -o obj/cuda/fake_quantize.o /opt/conda/lib/python3.7/site-packages/torch/include/ATen/record_function.h(18): warning: attribute "visibility" does not apply here

/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/autograd/profiler.h(97): warning: attribute "visibility" does not apply here

/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/autograd/profiler.h(126): warning: attribute "visibility" does not apply here

src/fake_quantize.cu(15): error: a value of type "const float *" cannot be assigned to an entity of type "float *"

src/fake_quantize.cu(21): error: identifier "Row" is undefined

src/fake_quantize.cu(88): warning: variable "momenta" was declared but never referenced

/opt/conda/lib/python3.7/site-packages/torch/include/c10/util/TypeCast.h(27): warning: calling a constexpr host function("real") from a host device function("apply") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this. detected during: instantiation of "decltype(auto) c10::maybe_real<true, src_t>::apply(src_t) [with src_t=c10::complex]" (57): here instantiation of "uint8_t c10::static_cast_with_inter_type<uint8_t, src_t>::apply(src_t) [with src_t=c10::complex]" (166): here instantiation of "To c10::convert<To,From>(From) [with To=uint8_t, From=c10::complex]" (178): here instantiation of "To c10::checked_convert<To,From>(From, const char *) [with To=uint8_t, From=c10::complex]" /opt/conda/lib/python3.7/site-packages/torch/include/c10/core/Scalar.h(66): here

2 errors detected in the compilation of "/tmp/tmpxft_00000066_00000000-11_fake_quantize.compute_75.cpp1.ii". Makefile:70: recipe for target 'obj/cuda/fake_quantize.o' failed make[1]: [obj/cuda/fake_quantize.o] Error 1 make[1]: Leaving directory '/root/ncnnqat' running install running bdist_egg running egg_info writing ncnnqat.egg-info/PKG-INFO writing dependency_links to ncnnqat.egg-info/dependency_links.txt writing requirements to ncnnqat.egg-info/requires.txt writing top-level names to ncnnqat.egg-info/top_level.txt reading manifest file 'ncnnqat.egg-info/SOURCES.txt' reading manifest template 'MANIFEST.in' writing manifest file 'ncnnqat.egg-info/SOURCES.txt' installing library code to build/bdist.linux-x86_64/egg running install_lib running build_py running build_ext building 'quant_cuda' extension Emitting ninja build file /root/ncnnqat/build/temp.linux-x86_64-3.7/build.ninja... Compiling objects... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) ninja: no work to do. g++ -pthread -shared -B /opt/conda/compiler_compat -L/opt/conda/lib -Wl,-rpath=/opt/conda/lib -Wl,--no-as-needed -Wl,--sysroot=/ /root/ncnnqat/build/temp.linux-x86_64-3.7/./src/fake_quantize.o -Lobj -L/opt/conda/lib/python3.7/site-packages/torch/lib -L/usr/local/cuda/lib64 -lquant_cuda -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-3.7/quant_cuda.cpython-37m-x86_64-linux-gnu.so /opt/conda/compiler_compat/ld: cannot find -lquant_cuda collect2: error: ld returned 1 exit status error: command 'g++' failed with exit status 1 Makefile:106: recipe for target 'install' failed make: [install] Error 1

系统中安装的torch版本为1.6.0，cuda版本是10.1，也尝试了torch1.9.0，cuda10.2，均出现了同样的问题，请帮忙看看安装环境是否有存在问题的地方，感谢！

yolunghiu commented 1 year ago

您好：您的邮件我已收到，我会尽快回复。刘洪宇

dentionY commented 1 year ago

修改fake_quantize.cu，如下：

#include "fake_quantize.h"

/**
 * Per-block reduction that finds the element with the largest magnitude in one
 * row of `data` and stores its absolute value.
 *
 * Block `blockIdx.x` scans the `width` floats starting at
 * `data[blockIdx.x * width]` and writes the result to `out_ptr[blockIdx.x]`.
 * The kernel strides by `blockSize` and uses a `blockSize`-entry shared buffer,
 * so it has to be launched with `blockSize` threads per block.
 *
 * @param data    input values, laid out as consecutive rows of `width` floats
 * @param out_ptr one output slot per block
 * @param width   number of elements in each row
 * @param lg_n    number of shared-memory slots folded by the tree reduction;
 *                the host caller passes a power of two capped at `blockSize`
 */
__global__ void max_reduce(float* __restrict__ data, float* out_ptr, int width, int lg_n)
{
    __shared__ float middleware[blockSize];
    const float min_positive_float = 1e-6;
    int row = blockIdx.x * width + threadIdx.x;
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    int tid_tmp = threadIdx.x;

    // Seed each thread's slot with its first element; threads past the end of
    // the row get a tiny positive filler so they never win the comparison.
    if (tid < width)
        middleware[tid] = data[row];
    else
        middleware[tid] = min_positive_float;

    // Walk the rest of the row in steps of blockSize, keeping the element with
    // the largest magnitude seen by this thread.
    row += blockSize;
    tid_tmp += blockSize;
    while (tid_tmp < width)
    {
        if (fabs(data[row]) > fabs(middleware[tid]))
            middleware[tid] = data[row];
        row += blockSize;
        tid_tmp += blockSize;
    }
    __syncthreads();

    // Tree reduction over the first lg_n slots of the shared buffer.
    for (int i = lg_n / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            if (fabs(middleware[tid + i]) > fabs(middleware[tid]))
                middleware[tid] = middleware[tid + i];
        }
        __syncthreads();
    }

    if (tid == 0)
        out_ptr[bid] = fabs(middleware[0]);
}

/**
 * Per-layer fake quantization using a moving-average maximum.
 *
 * Every thread quantizes one element: the input is scaled, rounded, clamped to
 * [-data_scale, data_scale] and divided by the scale again, where
 * data_scale = 2^(bit_width-1) - 1.
 *
 * @param a         input values (`size` floats)
 * @param o         output: fake-quantized values (`size` floats)
 * @param o1        output: o1[0] receives the scale that was used
 * @param mov_max   in/out: mov_max[0] holds the moving maximum; values below
 *                  1e-6 are treated as "not initialised yet"
 * @param size      number of elements in `a` and `o`
 * @param bit_width quantization bit width
 * @param max_entry pointer to a single float whose absolute value is used as
 *                  the current maximum
 */
__global__ void fake_quantize_layer_google(float* __restrict__ a,
                                           float* o,
                                           float* o1,
                                           float* mov_max,
                                           int size,
                                           int bit_width,
                                           float* max_entry)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        const float momenta = 0.95;
        float mov_max_tmp = mov_max[0];
        // movMax defaults to 0, so on the first step it is set to the current max.
        if (mov_max_tmp < 1e-6)
            mov_max_tmp = fabs(*max_entry);
        else
            // Google QAT: movMax = movMax*momenta + max(abs(tensor))*(1-momenta), momenta = 0.95
            mov_max_tmp = mov_max_tmp * momenta + fabs(*max_entry) * (1. - momenta);
        float data_scale = __powf(2., bit_width - 1.) - 1;

        // Guard against dividing by a (near) zero maximum.
        float scale;
        if (mov_max_tmp < 1e-6)
            scale = __fdividef(data_scale, 1e-6);
        else
            scale = __fdividef(data_scale, mov_max_tmp);

        int o_int = round(a[index] * scale);
        //o[index] = __fdividef(round(a[index]*scale),scale);
        if (o_int > data_scale)
            o_int = (int)data_scale;
        else if (o_int < -data_scale)
            o_int = (int)(-data_scale);
        o[index] = __fdividef(o_int * 1., scale);

        // A single thread publishes the scale and the updated moving maximum.
        if (index == 0)
        {
            o1[0] = scale;
            mov_max[0] = mov_max_tmp;
        }
    }
}

/**
 * Per-layer fake quantization with a Gaussian clipping threshold.
 *
 * The clipping threshold is alpha_gaussian[bit_width - 1] * sigma, where sigma
 * is estimated from the tensor maximum and `feature_pixl_num`. Each thread
 * scales, rounds, clamps to [-data_scale, data_scale] and rescales one element,
 * with data_scale = 2^(bit_width-1) - 1.
 *
 * @param a                input values (`size` floats)
 * @param o                output: fake-quantized values (`size` floats)
 * @param o1               output: o1[0] receives the scale that was used
 * @param mov_max          output: mov_max[0] receives |*max_entry|
 * @param feature_pixl_num element count used in the sigma estimate
 * @param size             number of elements in `a` and `o`
 * @param bit_width        quantization bit width; indexes an 8-entry table, so
 *                         it must be in 1..8
 * @param max_entry        pointer to a single float whose absolute value is
 *                         used as the tensor maximum
 */
__global__ void fake_quantize_layer_aciq(float* __restrict__ a,
                                         float* o,
                                         float* o1,
                                         float* mov_max,
                                         int feature_pixl_num,
                                         int size,
                                         int bit_width,
                                         float* max_entry)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        // This variant has the moving average disabled: the original code
        // assigned fabs(*max_entry) in both branches of its mov_max check, so
        // the current maximum is used directly.
        const float mov_max_tmp = fabs(*max_entry);
        float data_scale = __powf(2., bit_width - 1.) - 1;

        const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};
        const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * __logf(4.)));
        double sigma = (mov_max_tmp * 2 * gaussian_const) / sqrt(2 * __logf(feature_pixl_num));
        float threshold = (float)(alpha_gaussian[bit_width - 1] * sigma);

        // Guard against dividing by a (near) zero threshold.
        float scale;
        if (threshold < 1e-6)
            scale = __fdividef(data_scale, 1e-6);
        else
            scale = __fdividef(data_scale, threshold);

        int o_int = round(a[index] * scale);
        //o[index] = __fdividef(round(a[index]*scale),scale);
        if (o_int > data_scale)
            o_int = (int)data_scale;
        else if (o_int < -data_scale)
            o_int = (int)(-data_scale);
        o[index] = __fdividef(o_int * 1., scale);

        // A single thread publishes the scale and the maximum that was used.
        if (index == 0)
        {
            o1[0] = scale;
            mov_max[0] = mov_max_tmp;
        }
    }
}

/**
 * Per-channel fake quantization with a Gaussian clipping threshold.
 *
 * Element `index` belongs to channel `index / channel_num`. The clipping
 * threshold is alpha_gaussian[bit_width - 1] * sigma, with sigma estimated from
 * the channel maximum and `channel_num`.
 *
 * @param a             input values (`size` floats)
 * @param o             output: fake-quantized values (`size` floats)
 * @param o1            output: one scale per channel
 * @param size          number of elements in `a` and `o`
 * @param bit_width     quantization bit width; indexes an 8-entry table, so it
 *                      must be in 1..8
 * @param max_entry_arr per-channel maxima; entries below 1e-6 are overwritten
 *                      with 1e-6 by this kernel
 * @param channel_num   number of elements per channel
 */
__global__ void fake_quantize_channel_aciq(float* __restrict__ a,
                                           float* o,
                                           float* o1,
                                           int size,
                                           int bit_width,
                                           float* max_entry_arr, //max_entry_arr already>0
                                           int channel_num)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        int channel = index / channel_num;
        float* max_entry = max_entry_arr + channel;
        float data_scale = __powf(2., bit_width - 1.) - 1;
        if (*(max_entry) < 1e-6)
        {
            //if(index%channel_num==0) o1[channel] = scale;
            *max_entry = 1e-6;
            //return;
        }
        const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};
        const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * __logf(4.)));
        double sigma = (*(max_entry) * 2 * gaussian_const) / sqrt(2 * __logf(channel_num));
        float threshold = (float)(alpha_gaussian[bit_width - 1] * sigma);

        // NOTE(review): unlike fake_quantize_layer_aciq there is no lower
        // bound on `threshold` before the division — confirm this is intended.
        float scale = __fdividef(data_scale, threshold);
        int o_int = round(a[index] * scale);
        if (o_int > data_scale)
            o_int = (int)data_scale;
        else if (o_int < -data_scale)
            o_int = (int)(-data_scale);
        o[index] = __fdividef(o_int * 1., scale);
        // The first element of each channel publishes that channel's scale.
        if (index % channel_num == 0)
            o1[channel] = scale;
    }
}

/**
 * Per-channel fake quantization that scales by the channel maximum.
 *
 * Element `index` belongs to channel `index / channel_num`; its scale is
 * (2^(bit_width-1) - 1) / max_entry_arr[channel]. No clamping is applied here.
 *
 * @param a             input values (`size` floats)
 * @param o             output: fake-quantized values (`size` floats)
 * @param o1            output: one scale per channel
 * @param size          number of elements in `a` and `o`
 * @param bit_width     quantization bit width
 * @param max_entry_arr per-channel maxima; entries below 1e-6 are overwritten
 *                      with 1e-6 by this kernel
 * @param channel_num   number of elements per channel
 */
__global__ void fake_quantize_channel_cuda(float* __restrict__ a,
                                           float* o,
                                           float* o1,
                                           int size,
                                           int bit_width,
                                           float* max_entry_arr, //max_entry_arr already>0
                                           int channel_num)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        int channel = index / channel_num;
        float* max_entry = max_entry_arr + channel;
        float data_scale = __powf(2., bit_width - 1.) - 1;
        if (*(max_entry) < 1e-6)
        {
            //if(index%channel_num==0) o1[channel] = scale;
            *max_entry = 1e-6;
            //return;
        }
        float scale = __fdividef(data_scale, *max_entry);
        o[index] = __fdividef(round(a[index] * scale), scale);
        // The first element of each channel publishes that channel's scale.
        if (index % channel_num == 0)
            o1[channel] = scale;
    }
}

/**
 * Host entry point for per-layer (activation) fake quantization.
 *
 * Computes max(abs(a)) on the device and launches one of the two per-layer
 * kernels with one thread per element.
 *
 * @param a         input tensor; its storage is read through data_ptr<float>()
 * @param bit_width quantization bit width
 * @param aciq      0 selects fake_quantize_layer_google, anything else selects
 *                  fake_quantize_layer_aciq
 * @return {quantized output, 1-element scale tensor, 1-element max tensor}
 */
std::vector<Tensor> fake_quantize_activate_cuda(Tensor a, int bit_width, int aciq)
{
    auto o = at::zeros_like(a);                 //q out
    auto o1 = at::zeros({1}, a.options());      //scale
    // max of tensor. Google QAT: movMax = movMax*momenta + max(abs(tensor))*(1-momenta), momenta = 0.95
    auto mov_max = at::zeros({1}, a.options());
    int64_t size = a.numel();

    int batch_size = a.size(0); //batchsize
    int feature_pixl_num = size / batch_size;

    Tensor max_entry = at::max(at::abs(a));
    int blockNums = (size + blockSize - 1) / blockSize;

    if (aciq == 0) //movmax
    {
        //printf("layer_max....");
        fake_quantize_layer_google<<<blockNums, blockSize>>>(a.data_ptr<float>(),
                                                             o.data_ptr<float>(),
                                                             o1.data_ptr<float>(),
                                                             mov_max.data_ptr<float>(),
                                                             size,
                                                             bit_width,
                                                             max_entry.data_ptr<float>());
    }
    else // aciq
    {
        //printf("layer_aciq....");
        fake_quantize_layer_aciq<<<blockNums, blockSize>>>(a.data_ptr<float>(),
                                                           o.data_ptr<float>(),
                                                           o1.data_ptr<float>(),
                                                           mov_max.data_ptr<float>(),
                                                           feature_pixl_num,
                                                           size,
                                                           bit_width,
                                                           max_entry.data_ptr<float>());
    }
    return {o, o1, mov_max};
}

/**
 * Host entry point for per-channel (weight) fake quantization.
 *
 * First launches max_reduce with one block per channel to obtain each
 * channel's maximum magnitude, then launches one of the two per-channel
 * kernels with one thread per element.
 *
 * @param a         input tensor; its storage is read through data_ptr<float>()
 * @param bit_width quantization bit width
 * @param c         number of channels; each channel covers numel / c elements
 * @param aciq      0 selects fake_quantize_channel_cuda, anything else selects
 *                  fake_quantize_channel_aciq
 * @return {quantized output, c-element scale tensor}
 */
std::vector<Tensor> fake_quantize_weight_cuda(Tensor a, int bit_width, int c, int aciq)
{
    auto o = at::zeros_like(a);             //q out
    auto o1 = at::zeros({c}, a.options());  //scale
    int64_t size = a.numel();

    int blockNums = (size + blockSize - 1) / blockSize;
    int channel_num = size / c;
    auto max_entry_arr = at::zeros({c}, a.options());

    // Round channel_num up to the next power of two, capped at blockSize,
    // to get the reduction width expected by max_reduce.
    int lg_n = ceil(log2(channel_num * 1.)); //2^x - channel_num >0
    lg_n = pow(2, lg_n);                     //2^x
    if (lg_n > blockSize) lg_n = blockSize;

    max_reduce<<<c, blockSize>>>(a.data_ptr<float>(),
                                 max_entry_arr.data_ptr<float>(),
                                 channel_num,
                                 lg_n); //c blocks, each block produces one max value

    if (aciq == 0)
    {
        //printf("weight_max....");
        fake_quantize_channel_cuda<<<blockNums, blockSize>>>(a.data_ptr<float>(),
                                                             o.data_ptr<float>(),
                                                             o1.data_ptr<float>(),
                                                             size,
                                                             bit_width,
                                                             max_entry_arr.data_ptr<float>(), //max_entry_arr already>0
                                                             channel_num);
    }
    else
    {
        //printf("weight_aciq....");
        fake_quantize_channel_aciq<<<blockNums, blockSize>>>(a.data_ptr<float>(),
                                                             o.data_ptr<float>(),
                                                             o1.data_ptr<float>(),
                                                             size,
                                                             bit_width,
                                                             max_entry_arr.data_ptr<float>(), //max_entry_arr already>0
                                                             channel_num);
    }
    return {o, o1};
}

/**
 * Dispatches to the per-layer or per-channel fake-quantization routine.
 *
 * @param a         input tensor
 * @param bit_width quantization bit width
 * @param type      0 = per layer (activations), otherwise per channel (weights)
 * @param c         number of channels; only used on the per-channel path
 * @param aciq      forwarded unchanged to the selected routine
 * @return the tensor list produced by the selected routine
 */
std::vector<Tensor> fake_quantize_cuda(Tensor a, int bit_width, int type, int c, int aciq)
{
    /*
    https://arxiv.org/pdf/1806.08342.pdf 2.5
    For weights, we use the actual minimum and maximum values to determine the quantizer parameters.
    For activations, we use the moving average of the minimum and maximum values across batches
    to determine the quantizer parameters.
    float 6 7, double 15 16
    */
    if (type == 0)
        return fake_quantize_activate_cuda(a, bit_width, aciq); //type==0 per layer
    else
        return fake_quantize_weight_cuda(a, bit_width, c, aciq); //type==1 per channel
}

dentionY commented 1 year ago

全贴上就行了，不用管中间那个可以拷贝的区域还是别的

dentionY commented 1 year ago

requirements.txt我写了： torch >= 1.6 numpy >= 1.18.1 onnx >= 1.7.0 onnx-simplifier >= 0.3.6

配套的python版本是3.7

dentionY commented 1 year ago

如上是安装成功的标志

ChenShisen / ncnnqat

ncnnqat的环境问题 #1

include "fake_quantize.h"