sowson / darknet

Darknet on OpenCL Convolutional Neural Networks on OpenCL on Intel & NVidia & AMD & Mali GPUs for macOS & GNU/Linux & Windows & FreeBSD
http://pjreddie.com/darknet/

Error while running darknet binary #34

Closed zakamik closed 4 years ago

zakamik commented 4 years ago

Hello, I just ran into a problem when trying to use your code:

./darknet detect cfg/yolov2.cfg yolov2.weights ./snapshot.jpg -thresh 0.2

Device IDs: 1
Device ID: 0
Device name: Oland
Device vendor: Advanced Micro Devices, Inc.
Device opencl availability: OpenCL 1.2 AMD-APP (3004.6)
Device opencl used: 3004.6
Device double precision: YES
Device max group size: 256
Device address bits: 64
opencl_load: could not compile. error: CL_COMPILE_PROGRAM_FAILURE
CL_PROGRAM_BUILD_LOG:
"/tmp/OCL6829T3.cl", line 1: error: function "atom_cmpxchg" declared implicitly

__kernel void test_kernel(int N, __global float *input, __global float *output, __global float *expected)
{
    int index = (get_group_id(0) + get_group_id(1)*get_num_groups(0)) * get_local_size(0) + get_local_id(0);
    if (index >= N) return;
    output[index] = sqrt(input[index]);
    index += 1; input[index] = output[index-1]; output[index] = log(input[index]);
    index += 1; input[index] = output[index-1]; output[index] = pow(input[index], output[index-2]);
    index += 1; input[index] = output[index-1]; output[index] = -exp(input[index]);
    index += 1; input[index] = output[index-1]; output[index] = fabs(input[index]);
    index += 1; input[index] = output[index-1]; output[index] = sin(input[index]);
    index += 1; input[index] = output[index-1]; output[index] = cos(input[index]);
}

static void atomicAdd(volatile __global float *a, float v)
{
    union { float f; unsigned int i; } o; o.i = 0;
    union { float f; unsigned int i; } n; n.i = 1;
    do { o.f = *a; n.f = o.f + v; }
    while (atom_cmpxchg((__global unsigned int *)a, o.i, n.i) != o.i);
}

[The build log then echoes the rest of the blas_kernels.cl source, with the pointer and multiplication asterisks stripped by the issue formatting: scale_bias_kernel, backward_scale_kernel, add_bias_kernel, backward_bias_kernel, mean_kernel, variance_kernel, mean_delta_kernel, variance_delta_kernel, accumulate_kernel, fast_mean_kernel, fast_variance_kernel, fast_mean_delta_kernel, fast_variance_delta_kernel, adam_kernel, normalize_kernel, normalize_delta_kernel, l2norm_kernel, reorg_kernel, axpy_kernel, pow_kernel, const_kernel, constrain_kernel, supp_kernel, add_kernel, scal_kernel, fill_kernel, mask_kernel, copy_kernel, mul_kernel, flatten_kernel, shortcut_kernel, smooth_l1_kernel, softmax_x_ent_kernel, logistic_x_ent_kernel, l2_kernel, l1_kernel, wgan_kernel, weighted_sum_kernel, weighted_delta_kernel, mult_add_into_kernel, deinter_kernel, inter_kernel, softmax_device, softmax_kernel, softmax_tree_kernel, scale_mask_kernel, dot_kernel, upsample_kernel, and gemm_kernel.]
^

1 error detected in the compilation of "/tmp/OCL6829T3.cl". Frontend phase failed compilation.
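For context, a sketch of what this error usually means (an assumption about the driver, not something taken from the log above): atom_cmpxchg is the OpenCL 1.0 extension spelling from cl_khr_global_int32_base_atomics, and some compilers reject it unless that extension is enabled, whereas the OpenCL 1.1+ core spelling atomic_cmpxchg needs no pragma. A tiny kernel like the one below illustrates the core spelling:

// Hedged sketch, not part of darknet: compiles on OpenCL 1.1+ devices using
// the core function atomic_cmpxchg instead of the rejected atom_cmpxchg.
__kernel void cmpxchg_demo(__global unsigned int *counter)
{
    // Atomically replace the value 0 with 1; only one work-item "wins".
    atomic_cmpxchg(counter, 0u, 1u);
}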

sowson commented 4 years ago

hi @zakamik, go to src/blas_kernels.cl and empty this function, or comment out its body:

static void atomicAdd(volatile __global float *a, float v) { union { float f; unsigned int i; } o; o.i = 0; union { float f; unsigned int i; } n; n.i = 1; do { o.f = *a; n.f = o.f + v; } while (atom_cmpxchg((__global unsigned int *)a, o.i, n.i) != o.i); }

thx!
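A minimal sketch of that workaround (assuming the body can simply be stubbed out on this driver; the upsample backward pass, which relies on atomicAdd, will then do nothing):

// Workaround sketch for src/blas_kernels.cl: keep the signature but comment
// out the body so the file compiles on drivers that reject atom_cmpxchg.
static void atomicAdd(volatile __global float *a, float v)
{
    /*
    union { float f; unsigned int i; } o; o.i = 0;
    union { float f; unsigned int i; } n; n.i = 1;
    do { o.f = *a; n.f = o.f + v; }
    while (atom_cmpxchg((__global unsigned int *)a, o.i, n.i) != o.i);
    */
    (void)a; (void)v; // no-op until a portable atomic add is in place
}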

zakamik commented 4 years ago

Thank you, will try and let you know if it works

zakamik commented 4 years ago

This solution works for me. However, the latest commit won't compile, so I had to revert to the previous one.

sowson commented 4 years ago

@zakamik can you do me a favor and check whether it compiles now? I changed the last commit. Thanks!

zakamik commented 4 years ago

I tried a number of compile options; on the GPU I always get the same result:

Device IDs: 1
Device ID: 0
Device name: Oland
Device vendor: Advanced Micro Devices, Inc.
Device opencl availability: OpenCL 1.2 AMD-APP (3004.6)
Device opencl used: 3004.6
Device double precision: YES
Device max group size: 256
Device address bits: 64
FATAL ERROR: CL_COMPILE_PROGRAM_FAILURE

(src/blas_kernels.cl is the same as in the repo)

sowson commented 4 years ago

@zakamik it should be fine now... can you recheck? I had to squash some code :P. Please let me know if we can close this issue.

zakamik commented 4 years ago

I will be able to check in a few days; I'll post an update.

sowson commented 4 years ago

I am 100% sure it all works now! :D

zakamik commented 4 years ago

Unfortunately, I had no luck running the new version:

./darknet detect cfg/yolov3.cfg yolov3.weights sample.jpg -thresh 0.1
opencl_init: Could not get platform IDs.

./darknet -i 0 detect cfg/yolov3.cfg yolov3.weights sample.jpg -thresh 0.1
Device IDs: 1
Device ID: 0
Device name: Oland
Device vendor: Advanced Micro Devices, Inc.
Device opencl availability: OpenCL 1.2 AMD-APP (3004.6)
Device opencl used: 3004.6
Device double precision: YES
Device max group size: 256
Device address bits: 64
FATAL ERROR: CL_INVALID_VALUE
opencl_load: could not create header. error: CL_INVALID_VALUE
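As a side note on the "Could not get platform IDs" part: a standalone check like the sketch below (not darknet code; it assumes the standard OpenCL headers and ICD loader are installed) can confirm whether the AMD platform is visible to the process at all, independently of any darknet build problem.

/* check_cl.c - hedged diagnostic sketch, not part of darknet.
 * Build with: gcc check_cl.c -lOpenCL -o check_cl
 */
#include <stdio.h>
#include <CL/cl.h>

int main(void) {
    cl_uint nplat = 0;
    if (clGetPlatformIDs(0, NULL, &nplat) != CL_SUCCESS || nplat == 0) {
        printf("No OpenCL platforms visible to this process.\n");
        return 1;
    }
    cl_platform_id plats[8];
    clGetPlatformIDs(nplat > 8 ? 8 : nplat, plats, NULL);
    for (cl_uint p = 0; p < nplat && p < 8; ++p) {
        char name[256] = {0};
        clGetPlatformInfo(plats[p], CL_PLATFORM_NAME, sizeof(name), name, NULL);
        cl_uint ndev = 0;
        clGetDeviceIDs(plats[p], CL_DEVICE_TYPE_ALL, 0, NULL, &ndev);
        printf("Platform %u: %s, %u device(s)\n", p, name, ndev);
    }
    return 0;
}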

sowson commented 4 years ago

Can you retry now... at some point, and let me know? thx!

zakamik commented 4 years ago

The latest version unfortunately has the same first issue; I fixed it by commenting out the code in blas_kernels.cl, but apart from that it works.

sowson commented 4 years ago

Tuning has to be dynamic, not 16... thanks for pointing that out :). Can you re-check?
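A rough sketch of what making "tuning" dynamic could look like on the host side (the helper name pick_tuning is illustrative, not darknet's API; it just clamps the formerly hard-coded value to what the device and kernel report):

/* Hedged sketch, not darknet code: choose the reduction width ("tuning")
 * from device/kernel limits instead of a fixed constant such as 16. */
#include <CL/cl.h>

static size_t pick_tuning(cl_device_id dev, cl_kernel kern, size_t spatial) {
    size_t dev_max = 1, kern_max = 1;
    clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                    sizeof(dev_max), &dev_max, NULL);
    clGetKernelWorkGroupInfo(kern, dev, CL_KERNEL_WORK_GROUP_SIZE,
                             sizeof(kern_max), &kern_max, NULL);
    size_t t = dev_max < kern_max ? dev_max : kern_max;
    if (t > spatial) t = spatial;   /* no point exceeding the reduction length */
    return t ? t : 1;
}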

zakamik commented 4 years ago

Nothing has changed on my side: the same 'function "atom_cmpxchg" declared implicitly' error. Commit 305c8df9f17d2729b6194a16cec83d71d9a92c53 (HEAD -> master, origin/master, origin/HEAD).

sowson commented 4 years ago

OK, I commented it out. I am not using upsample layers for now; once I check them, it may turn out that everything still works fine without the atomic add. thx!