Open FitzShen666 opened 2 years ago
您好: 您的邮件我已收到,我会尽快回复。 刘洪宇
修改fake_quantize.cu,如下:
// Per-block reduction: block b scans data[b*width .. b*width + width - 1] and
// writes the absolute value of the largest-magnitude entry to out_ptr[b].
//
// Launch layout: one block per row. The strided loop advances by blockSize
// (defined outside this block) rather than blockDim.x, and the shared buffer is
// sized by blockSize, so the kernel must be launched with blockSize threads.
//
// Parameters:
//   data    - input values, `width` consecutive floats per block
//   out_ptr - one output float per block
//   width   - number of elements reduced by each block
//   lg_n    - number of shared slots folded by the tree reduction; the host code
//             in this file passes a power of two clamped to blockSize
//
// Threads whose index is past the end of the row seed their slot with 1e-6, so
// a padded slot can win the comparison when every real value is smaller in
// magnitude.
__global__ void max_reduce(float* __restrict__ data, float* out_ptr, int width, int lg_n)
{
    __shared__ float middleware[blockSize];
    const float min_positive_float = 1e-6;

    int row = blockIdx.x * width + threadIdx.x;
    int bid = blockIdx.x;
    int tid = threadIdx.x;
    int tid_tmp = threadIdx.x;

    // Seed each thread's slot with its first element, or with padding.
    if (tid < width)
        middleware[tid] = data[row];
    else
        middleware[tid] = min_positive_float;

    // Fold the remaining elements of the row into the same slot, blockSize apart.
    row += blockSize;
    tid_tmp += blockSize;
    while (tid_tmp < width)
    {
        if (fabs(data[row]) > fabs(middleware[tid]))
            middleware[tid] = data[row];
        row += blockSize;
        tid_tmp += blockSize;
    }
    __syncthreads();

    // Tree reduction over the first lg_n slots; the barrier sits outside the
    // divergent branch so every thread in the block reaches it.
    for (int i = lg_n / 2; i > 0; i /= 2)
    {
        if (tid < i)
        {
            if (fabs(middleware[tid + i]) > fabs(middleware[tid]))
                middleware[tid] = middleware[tid + i];
        }
        __syncthreads();
    }

    // Slots keep signed values during the reduction; only the result is made absolute.
    if (tid == 0)
        out_ptr[bid] = fabs(middleware[0]);
}
// Per-tensor fake quantization with a moving-average maximum.
//
// Each thread handles one element: it derives a scale from the moving maximum,
// rounds a[index]*scale to an integer, clamps it to [-data_scale, data_scale]
// and stores the de-quantized value in o[index].
//
// Parameters:
//   a         - input values (read only here)
//   o         - output, same indexing as `a`
//   o1        - o1[0] receives the scale that was used
//   mov_max   - mov_max[0] holds the running maximum; updated by the thread with index 0
//   size      - number of elements in `a` / `o`
//   bit_width - quantization bit width; data_scale = 2^(bit_width-1) - 1
//   max_entry - pointer to a single float; its absolute value is used as the
//               current maximum
//
// NOTE(review): every thread reads mov_max[0] while the thread with index 0
// writes it back, and there is no grid-wide synchronisation between the two.
// A thread that runs after that write would apply the moving-average update a
// second time and use a different scale. Computing the updated maximum before
// launching this kernel would remove the hazard.
__global__ void fake_quantize_layer_google(float* __restrict__ a,
                                           float* o,
                                           float* o1,
                                           float* mov_max,
                                           int size,
                                           int bit_width,
                                           float* max_entry)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        const float momenta = 0.95;
        float mov_max_tmp = mov_max[0];
        // mov_max defaults to 0, so the first step seeds it with a non-zero value.
        if (mov_max_tmp < 1e-6) mov_max_tmp = fabs(*max_entry);
        // Google QAT: movMax = movMax*momenta + max(abs(tensor))*(1-momenta), momenta = 0.95
        else mov_max_tmp = mov_max_tmp * momenta + fabs(*max_entry) * (1. - momenta);

        float data_scale = __powf(2., bit_width - 1.) - 1;

        // Guard the division against a (near-)zero maximum.
        float scale;
        if (mov_max_tmp < 1e-6) scale = __fdividef(data_scale, 1e-6);
        else scale = __fdividef(data_scale, mov_max_tmp);

        // Quantize, then clamp to the symmetric integer range.
        int o_int = round(a[index] * scale);
        if (o_int > data_scale) o_int = (int)data_scale;
        else if (o_int < -data_scale) o_int = (int)(-data_scale);

        o[index] = __fdividef(o_int * 1., scale);

        if (index == 0)
        {
            o1[0] = scale;
            mov_max[0] = mov_max_tmp;
        }
    }
}
// Per-tensor fake quantization with an ACIQ-style Gaussian clipping threshold.
//
// Each thread handles one element: it derives a clipping threshold from
// |*max_entry| and feature_pixl_num, turns it into a scale, rounds
// a[index]*scale to an integer, clamps it to [-data_scale, data_scale] and
// stores the de-quantized value in o[index].
//
// Parameters:
//   a                - input values (read only here)
//   o                - output, same indexing as `a`
//   o1               - o1[0] receives the scale that was used
//   mov_max          - mov_max[0] receives |*max_entry| (written by the thread with index 0)
//   feature_pixl_num - element count used in the sqrt(2*log(n)) term
//   size             - number of elements in `a` / `o`
//   bit_width        - quantization bit width; indexes alpha_gaussian[bit_width-1],
//                      so it must lie in 1..8
//   max_entry        - pointer to a single float; its absolute value is used as the maximum
__global__ void fake_quantize_layer_aciq(float* __restrict__ a,
                                         float* o,
                                         float* o1,
                                         float* mov_max,
                                         int feature_pixl_num,
                                         int size,
                                         int bit_width,
                                         float* max_entry)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        // The moving-average update is disabled in this kernel: the earlier
        // "first step / later step" test assigned fabs(*max_entry) in both
        // branches, so the previous mov_max[0] never influenced the result.
        float mov_max_tmp = fabs(*max_entry);

        float data_scale = __powf(2., bit_width - 1.) - 1;

        // Per-bit-width clipping factors for a Gaussian distribution.
        const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};
        const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * __logf(4.)));
        double std = (mov_max_tmp * 2 * gaussian_const) / sqrt(2 * __logf(feature_pixl_num));
        float threshold = (float)(alpha_gaussian[bit_width - 1] * std);

        // Guard the division against a (near-)zero threshold.
        float scale;
        if (threshold < 1e-6) scale = __fdividef(data_scale, 1e-6);
        else scale = __fdividef(data_scale, threshold);

        // Quantize, then clamp to the symmetric integer range.
        int o_int = round(a[index] * scale);
        if (o_int > data_scale) o_int = (int)data_scale;
        else if (o_int < -data_scale) o_int = (int)(-data_scale);

        o[index] = __fdividef(o_int * 1., scale);

        if (index == 0)
        {
            o1[0] = scale;
            mov_max[0] = mov_max_tmp;
        }
    }
}
// Per-channel fake quantization with an ACIQ-style Gaussian clipping threshold.
//
// Each thread handles one element. The channel is index / channel_num, so
// `channel_num` acts as the number of elements per channel (the host code in
// this file passes size / c).
//
// Parameters:
//   a             - input values (read only here)
//   o             - output, same indexing as `a`
//   o1            - o1[channel] receives the scale used for that channel
//   size          - number of elements in `a` / `o`
//   bit_width     - quantization bit width; indexes alpha_gaussian[bit_width-1],
//                   so it must lie in 1..8
//   max_entry_arr - one maximum per channel; entries below 1e-6 are overwritten
//                   with 1e-6 in place
//   channel_num   - elements per channel (see above)
//
// NOTE(review): unlike fake_quantize_layer_aciq, the threshold is not guarded
// against zero before the division — confirm whether that is intended.
__global__ void fake_quantize_channel_aciq(float* __restrict__ a,
                                           float* o,
                                           float* o1,
                                           int size,
                                           int bit_width,
                                           float* max_entry_arr, // max_entry_arr already > 0
                                           int channel_num)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        int channel = index / channel_num;
        float* max_entry = max_entry_arr + channel;
        float data_scale = __powf(2., bit_width - 1.) - 1;

        // Floor the channel maximum so the threshold below cannot collapse to zero.
        if ((*max_entry) < 1e-6)
        {
            *max_entry = 1e-6;
        }

        // Per-bit-width clipping factors for a Gaussian distribution.
        const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};
        const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * __logf(4.)));
        double std = ((*max_entry) * 2 * gaussian_const) / sqrt(2 * __logf(channel_num));
        float threshold = (float)(alpha_gaussian[bit_width - 1] * std);
        float scale = __fdividef(data_scale, threshold);

        // Quantize, then clamp to the symmetric integer range.
        int o_int = round(a[index] * scale);
        if (o_int > data_scale) o_int = (int)data_scale;
        else if (o_int < -data_scale) o_int = (int)(-data_scale);

        o[index] = __fdividef(o_int * 1., scale);

        // The first element of each channel publishes that channel's scale.
        if (index % channel_num == 0) o1[channel] = scale;
    }
}
// Per-channel max-based fake quantization.
//
// Each thread handles one element. The channel is index / channel_num, so
// `channel_num` acts as the number of elements per channel (the host code in
// this file passes size / c). The scale is data_scale / max of the channel and
// the output is round(a*scale)/scale; no integer clamping is applied here.
//
// Parameters:
//   a             - input values (read only here)
//   o             - output, same indexing as `a`
//   o1            - o1[channel] receives the scale used for that channel
//   size          - number of elements in `a` / `o`
//   bit_width     - quantization bit width; data_scale = 2^(bit_width-1) - 1
//   max_entry_arr - one maximum per channel; entries below 1e-6 are overwritten
//                   with 1e-6 in place
//   channel_num   - elements per channel (see above)
__global__ void fake_quantize_channel_cuda(float* __restrict__ a,
                                           float* o,
                                           float* o1,
                                           int size,
                                           int bit_width,
                                           float* max_entry_arr, // max_entry_arr already > 0
                                           int channel_num)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
    {
        int channel = index / channel_num;
        float* max_entry = max_entry_arr + channel;
        float data_scale = __powf(2., bit_width - 1.) - 1;

        // Floor the channel maximum so the division below stays finite.
        if ((*max_entry) < 1e-6)
        {
            *max_entry = 1e-6;
        }

        float scale = __fdividef(data_scale, *max_entry);
        o[index] = __fdividef(round(a[index] * scale), scale);

        // The first element of each channel publishes that channel's scale.
        if (index % channel_num == 0) o1[channel] = scale;
    }
}
std::vector
int batch_size = a.size(0);//batchsize
int feature_pixl_num = size/batch_size;
Tensor max_entry = at::max(at::abs(a));
int blockNums = (size + blockSize - 1) / blockSize;
if(aciq==0) //movmax
{
//printf("layer_max....");
fake_quantize_layer_google<<<blockNums, blockSize>>>(a.data_ptr<float>(),
o.data_ptr<float>(),
o1.data_ptr<float>(),
mov_max.data_ptr<float>(),
size,
bit_width,
max_entry.data_ptr<float>());
}
else // aciq
{
//printf("layer_aciq....");
fake_quantize_layer_aciq<<<blockNums, blockSize>>>(a.data_ptr<float>(),
o.data_ptr<float>(),
o1.data_ptr<float>(),
mov_max.data_ptr<float>(),
feature_pixl_num,
size,
bit_width,
max_entry.data_ptr<float>());
}
return {o,o1,mov_max};
}
std::vector
int blockNums = (size + blockSize - 1) / blockSize;
int channel_num = size/c;
auto max_entry_arr = at::zeros({c}, a.options());
int lg_n = ceil(log2(channel_num*1.)); //2^x - channel_num >0
lg_n = pow(2,lg_n); //2^x
if(lg_n>blockSize) lg_n=blockSize; //
max_reduce <<<c, blockSize >>> (a.data_ptr<float>(),
max_entry_arr.data_ptr<float>(),
channel_num,
lg_n); //c block , each block get a max value
if(aciq==0)
{
//printf("weight_max....");
fake_quantize_channel_cuda<<<blockNums, blockSize>>>(a.data_ptr<float>(),
o.data_ptr<float>(),
o1.data_ptr<float>(),
size,
bit_width,
max_entry_arr.data_ptr<float>(), //max_entry_arr already>0
channel_num);
}
else
{
//printf("weight_aciq....");
fake_quantize_channel_aciq<<<blockNums, blockSize>>>(a.data_ptr<float>(),
o.data_ptr<float>(),
o1.data_ptr<float>(),
size,
bit_width,
max_entry_arr.data_ptr<float>(), //max_entry_arr already>0
channel_num);
}
return {o,o1};
}
std::vector
else return fake_quantize_weight_cuda(a,bit_width,c,aciq); //type==1 perchannel
}
全贴上就行了,不用管中间那个可以拷贝的区域还是别的
requirements.txt我写了: torch >= 1.6 numpy >= 1.18.1 onnx >= 1.7.0 onnx-simplifier >= 0.3.6
配套的python版本是3.7
如上是安装成功的标志
在使用pip install ncnnqat的时候会产生如下报错: File "setup.py", line 3, in <module>
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
ModuleNotFoundError: No module named 'torch'
但log上的显示的python地址和版本里面确有安装torch
若直接对该版本进行编译,则会出现下述问题:
make[1]: Entering directory '/root/ncnnqat' NVCC src/fake_quantize.cu nvcc -std=c++14 -ccbin=g++ -Xcompiler -fPIC -use_fast_math -DNDEBUG -O3 -I./ -I/usr/local/cuda/include -I/opt/conda/include/python3.7m -I/opt/conda/lib/python3.7/site-packages/torch/include -I/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/opt/conda/lib/python3.7/site-packages/torch/include/TH -I/opt/conda/lib/python3.7/site-packages/torch/include/THC -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=0 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -M src/fake_quantize.cu -o obj/cuda/fake_quantize.d \ -odir obj/cuda nvcc -std=c++14 -ccbin=g++ -Xcompiler -fPIC -use_fast_math -DNDEBUG -O3 -I./ -I/usr/local/cuda/include -I/opt/conda/include/python3.7m -I/opt/conda/lib/python3.7/site-packages/torch/include -I/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/opt/conda/lib/python3.7/site-packages/torch/include/TH -I/opt/conda/lib/python3.7/site-packages/torch/include/THC -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=0 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -c src/fake_quantize.cu -o obj/cuda/fake_quantize.o /opt/conda/lib/python3.7/site-packages/torch/include/ATen/record_function.h(18): warning: attribute "visibility" does not apply here
/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/autograd/profiler.h(97): warning: attribute "visibility" does not apply here
/opt/conda/lib/python3.7/site-packages/torch/include/torch/csrc/autograd/profiler.h(126): warning: attribute "visibility" does not apply here
src/fake_quantize.cu(15): error: a value of type "const float *" cannot be assigned to an entity of type "float *"
src/fake_quantize.cu(21): error: identifier "Row" is undefined
src/fake_quantize.cu(88): warning: variable "momenta" was declared but never referenced
/opt/conda/lib/python3.7/site-packages/torch/include/c10/util/TypeCast.h(27): warning: calling a constexpr host function("real") from a host device function("apply") is not allowed. The experimental flag '--expt-relaxed-constexpr' can be used to allow this. detected during: instantiation of "decltype(auto) c10::maybe_real<true, src_t>::apply(src_t) [with src_t=c10::complex]"
(57): here
instantiation of "uint8_t c10::static_cast_with_inter_type<uint8_t, src_t>::apply(src_t) [with src_t=c10::complex]"
(166): here
instantiation of "To c10::convert<To,From>(From) [with To=uint8_t, From=c10::complex]"
(178): here
instantiation of "To c10::checked_convert<To,From>(From, const char *) [with To=uint8_t, From=c10::complex]"
/opt/conda/lib/python3.7/site-packages/torch/include/c10/core/Scalar.h(66): here
2 errors detected in the compilation of "/tmp/tmpxft_00000066_00000000-11_fake_quantize.compute_75.cpp1.ii". Makefile:70: recipe for target 'obj/cuda/fake_quantize.o' failed make[1]: [obj/cuda/fake_quantize.o] Error 1 make[1]: Leaving directory '/root/ncnnqat' running install running bdist_egg running egg_info writing ncnnqat.egg-info/PKG-INFO writing dependency_links to ncnnqat.egg-info/dependency_links.txt writing requirements to ncnnqat.egg-info/requires.txt writing top-level names to ncnnqat.egg-info/top_level.txt reading manifest file 'ncnnqat.egg-info/SOURCES.txt' reading manifest template 'MANIFEST.in' writing manifest file 'ncnnqat.egg-info/SOURCES.txt' installing library code to build/bdist.linux-x86_64/egg running install_lib running build_py running build_ext building 'quant_cuda' extension Emitting ninja build file /root/ncnnqat/build/temp.linux-x86_64-3.7/build.ninja... Compiling objects... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) ninja: no work to do. g++ -pthread -shared -B /opt/conda/compiler_compat -L/opt/conda/lib -Wl,-rpath=/opt/conda/lib -Wl,--no-as-needed -Wl,--sysroot=/ /root/ncnnqat/build/temp.linux-x86_64-3.7/./src/fake_quantize.o -Lobj -L/opt/conda/lib/python3.7/site-packages/torch/lib -L/usr/local/cuda/lib64 -lquant_cuda -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-3.7/quant_cuda.cpython-37m-x86_64-linux-gnu.so /opt/conda/compiler_compat/ld: cannot find -lquant_cuda collect2: error: ld returned 1 exit status error: command 'g++' failed with exit status 1 Makefile:106: recipe for target 'install' failed make: [install] Error 1
系统中安装的torch版本为1.6.0,cuda版本是10.1,也尝试了torch1.9.0,cuda10.2,均出现了同样的问题,请帮忙看看安装环境是否有存在问题的地方,感谢!