Closed tnorth closed 11 years ago
This issue seems related to the last few commits (it didn't happen before), but reading the diffs I can hardly see where it comes from.
The compilation error is pasted below. That's a lot of text, sorry, but it looks like I can't attach a file here.
Also, there are some additional definitions that I added for exp() and so on. They don't use native_{cos,sin}, so I guess the problem is not related to them.
ERROR:root:Failed to compile: 1: 2: 3:#define CTX_FAST_MATH 4: 5: // taken from pyopencl._cluda 6: #define LOCAL_BARRIER barrier(CLK_LOCAL_MEM_FENCE) 7: 8: // 'static' helps to avoid the "no previous prototype for function" warning 9: #if PYOPENCL_CL_VERSION >= 0x1020 10: #define WITHIN_KERNEL static 11: #else 12: #define WITHIN_KERNEL 13: #endif 14: 15: #define KERNEL __kernel 16: #define GLOBAL_MEM __global 17: #define LOCAL_MEM __local 18: #define LOCAL_MEM_DYNAMIC __local 19: #define LOCAL_MEM_ARG __local 20: #define INLINE inline 21: 22: #if defined(cl_khr_fp64) 23: #pragma OPENCL EXTENSION cl_khr_fp64: enable 24: #elif defined(cl_amd_fp64) 25: #pragma OPENCL EXTENSION cl_amd_fp64: enable 26: #endif 27: 28: 29: #define COMPLEX_CTR(T) (T) 30: 31: 32: 33: WITHIN_KERNEL double2 cdouble_exp(double2 (a)) 34: { 35: double expr = exp(a.x); 36: double cosi; 37: double sini = sincos(a.y, &cosi); 38: return COMPLEX_CTR(double2) (expr * cosi, expr * sini); 39: } 40: WITHIN_KERNEL float2 cfloat_exp(float2 (a)) 41: { 42: float expr = exp(a.x); 43: float cosi; 44: float sini = sincos(a.y, &cosi); 45: return COMPLEX_CTR(float2) (expr * cosi, expr * sini); 46: } 47: 48: 49: 50:#define virtual_local_id get_local_id 51:#define virtual_local_size get_local_size 52:#define virtual_group_id get_group_id 53:#define virtual_num_groups get_num_groups 54:#define virtual_global_id get_global_id 55:#define virtual_global_size get_global_size 56: 57:WITHIN_KERNEL int virtual_global_flat_size() 58:{ 59: return get_global_size(0) * get_global_size(1) * get_global_size(2); 60:} 61: 62:WITHIN_KERNEL int virtual_global_flat_id() 63:{ 64: return virtual_global_id(0) + 65: virtual_global_id(1) * virtual_global_size(0) + 66: virtual_global_id(2) * virtual_global_size(1) * virtual_global_size(0); 67:} 68: 69:#define VIRTUAL_SKIP_THREADS 70: 71: 72: 73:#define _LOAD__temp0(idx) (_leaf__temp0[idx]) 74: 75:#define _STORE__temp0(idx, val) _leaf__temp0[idx] = (val) 76: 77:// leaf node input 78:#define 
_LOAD_input(idx) (_leaf_input[idx]) 79: 80:#define SIGNATURE GLOBAL_MEM double2 *_leaf__temp0, GLOBAL_MEM double2 *_leaf_input, int _leaf_direction 81:WITHIN_KERNEL double2 __div__double2__double2_double( 82: double2 a, double b) 83:{ 84: 85: return COMPLEX_CTR(double2)(a.x / b, a.y / b); 86:} 87: 88: 89:WITHIN_KERNEL double2 __mul__double2__double2_double2( 90: double2 a, double2 b) 91:{ 92: 93: return COMPLEX_CTR(double2)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x); 94:} 95: 96: 97: 98: 99: 100: 101:#ifdef CUDA 102:#define mad24(x, y, z) ((x) * (y) + (z)) 103:#define mad(x, y, z) ((x) * (y) + (z)) 104:#define mul24(x, y) __mul24(x, y) 105:#endif 106: 107: 108:/* 109:#ifdef sincosf 110:#endif 111:#ifndef sincosf 112:#define complex_exp(res, ang) (res).x = native_cos(ang); (res).y = native_sin(ang) 113:#endif 114:*/ 115: 116:#define complex_ctr COMPLEX_CTR(double2) 117:#define complex_mul __mul__double2__double2_double2 118:#define complex_div_scalar __div__double2__double2_double 119:#define conj(a) complex_ctr((a).x, -(a).y) 120:#define conj_transp(a) complex_ctr(-(a).y, (a).x) 121:#define conj_transp_and_mul(a, b) complex_ctr(-(a).y * (b), (a).x * (b)) 122: 123:typedef double2 complex_t; 124:typedef double real_t; 125: 126: 127:WITHIN_KERNEL complex_t complex_exp(real_t ang) 128:{ 129: complex_t res; 130: 131:#ifdef CUDA 132: sincos(ang, &((res).y), &((res).x)); 133:#else 134:#ifdef CTX_FAST_MATH 135: res.x = native_cos(ang); 136: res.y = native_sin(ang); 137:#else 138: real_t tmp; 139: res.y = sincos(ang, &tmp); 140: res.x = tmp; 141:#endif 142:#endif 143: return res; 144:} 145: 146:WITHIN_KERNEL void swap(complex_t *a, complex_t *b) 147:{ 148: complex_t c = *a; 149: *a = *b; 150: *b = c; 151:} 152: 153:// shifts the sequence (a1, a2, a3, a4, a5) transforming it to 154:// (a5, a1, a2, a3, a4) 155:WITHIN_KERNEL void shift32( 156: complex_t *a1, complex_t *a2, complex_t *a3, complex_t *a4, complex_t *a5) 157:{ 158: complex_t c1, c2; 159: c1 = *a2; 160: *a2 = 
*a1; 161: c2 = *a3; 162: *a3 = c1; 163: c1 = *a4; 164: *a4 = c2; 165: c2 = *a5; 166: *a5 = c1; 167: *a1 = c2; 168:} 169: 170:WITHIN_KERNEL void _fftKernel2(complex_t *a) 171:{ 172: complex_t c = a[0]; 173: a[0] = c + a[1]; 174: a[1] = c - a[1]; 175:} 176:#define fftKernel2(a, direction) _fftKernel2(a) 177: 178:WITHIN_KERNEL void _fftKernel2S(complex_t *d1, complex_t *d2) 179:{ 180: complex_t c = *d1; 181: *d1 = c + *d2; 182: *d2 = c - *d2; 183:} 184:#define fftKernel2S(d1, d2, direction) _fftKernel2S(d1, d2) 185: 186:WITHIN_KERNEL void fftKernel4(complex_t *a, const int direction) 187:{ 188: fftKernel2S(a + 0, a + 2, direction); 189: fftKernel2S(a + 1, a + 3, direction); 190: fftKernel2S(a + 0, a + 1, direction); 191: a[3] = conj_transp_and_mul(a[3], direction); 192: fftKernel2S(a + 2, a + 3, direction); 193: swap(a + 1, a + 2); 194:} 195: 196:WITHIN_KERNEL void fftKernel4s(complex_t *a0, complex_t *a1, 197: complex_t *a2, complex_t *a3, const int direction) 198:{ 199: fftKernel2S(a0, a2, direction); 200: fftKernel2S(a1, a3, direction); 201: fftKernel2S(a0, a1, direction); 202: *a3 = conj_transp_and_mul(*a3, direction); 203: fftKernel2S(a2, a3, direction); 204: swap(a1, a2); 205:} 206: 207:WITHIN_KERNEL void bitreverse8(complex_t *a) 208:{ 209: swap(a + 1, a + 4); 210: swap(a + 3, a + 6); 211:} 212: 213:WITHIN_KERNEL void fftKernel8(complex_t *a, const int direction) 214:{ 215: const complex_t w1 = complex_ctr( 216: 0.70710678118654746, 217: 0.70710678118654746 * direction); 218: const complex_t w3 = complex_ctr( 219: -0.70710678118654746, 220: 0.70710678118654746 * direction); 221: fftKernel2S(a + 0, a + 4, direction); 222: fftKernel2S(a + 1, a + 5, direction); 223: fftKernel2S(a + 2, a + 6, direction); 224: fftKernel2S(a + 3, a + 7, direction); 225: a[5] = complex_mul(w1, a[5]); 226: a[6] = conj_transp_and_mul(a[6], direction); 227: a[7] = complex_mul(w3, a[7]); 228: fftKernel2S(a + 0, a + 2, direction); 229: fftKernel2S(a + 1, a + 3, direction); 230: 
fftKernel2S(a + 4, a + 6, direction); 231: fftKernel2S(a + 5, a + 7, direction); 232: a[3] = conj_transp_and_mul(a[3], direction); 233: a[7] = conj_transp_and_mul(a[7], direction); 234: fftKernel2S(a + 0, a + 1, direction); 235: fftKernel2S(a + 2, a + 3, direction); 236: fftKernel2S(a + 4, a + 5, direction); 237: fftKernel2S(a + 6, a + 7, direction); 238: bitreverse8(a); 239:} 240: 241:WITHIN_KERNEL void bitreverse4x4(complex_t *a) 242:{ 243: swap(a + 1, a + 4); 244: swap(a + 2, a + 8); 245: swap(a + 3, a + 12); 246: swap(a + 6, a + 9); 247: swap(a + 7, a + 13); 248: swap(a + 11, a + 14); 249:} 250: 251:WITHIN_KERNEL void fftKernel16(complex_t *a, const int direction) 252:{ 253: complex_t temp; 254: const real_t w0 = 0.92387953251128674; 255: const real_t w1 = 0.38268343236508978; 256: const real_t w2 = 0.70710678118654746; 257: fftKernel4s(a + 0, a + 4, a + 8, a + 12, direction); 258: fftKernel4s(a + 1, a + 5, a + 9, a + 13, direction); 259: fftKernel4s(a + 2, a + 6, a + 10, a + 14, direction); 260: fftKernel4s(a + 3, a + 7, a + 11, a + 15, direction); 261: 262: temp = complex_ctr(w0, direction * w1); 263: a[5] = complex_mul(a[5], temp); 264: temp = complex_ctr(w1, direction * w0); 265: a[7] = complex_mul(a[7], temp); 266: temp = complex_ctr(w2, direction * w2); 267: a[6] = complex_mul(a[6], temp); 268: a[9] = complex_mul(a[9], temp); 269: 270: a[10] = conj_transp_and_mul(a[10], direction); 271: 272: temp = complex_ctr(-w2, direction * w2); 273: a[11] = complex_mul(a[11], temp); 274: a[14] = complex_mul(a[14], temp); 275: temp = complex_ctr(w1, direction * w0); 276: a[13] = complex_mul(a[13], temp); 277: temp = complex_ctr(-w0, -direction * w1); 278: a[15] = complex_mul(a[15], temp); 279: 280: fftKernel4(a, direction); 281: fftKernel4(a + 4, direction); 282: fftKernel4(a + 8, direction); 283: fftKernel4(a + 12, direction); 284: bitreverse4x4(a); 285:} 286: 287:WITHIN_KERNEL void bitreverse32(complex_t *a) 288:{ 289: shift32(a + 1, a + 2, a + 4, a + 8, a + 16); 
290: shift32(a + 3, a + 6, a + 12, a + 24, a + 17); 291: shift32(a + 5, a + 10, a + 20, a + 9, a + 18); 292: shift32(a + 7, a + 14, a + 28, a + 25, a + 19); 293: shift32(a + 11, a + 22, a + 13, a + 26, a + 21); 294: shift32(a + 15, a + 30, a + 29, a + 27, a + 23); 295:} 296: 297:WITHIN_KERNEL void fftKernel32(complex_t *a, const int direction) 298:{ 299: complex_t temp; 300: fftKernel2S(a + 0, a + 16, direction); 301: fftKernel2S(a + 1, a + 17, direction); 302: fftKernel2S(a + 2, a + 18, direction); 303: fftKernel2S(a + 3, a + 19, direction); 304: fftKernel2S(a + 4, a + 20, direction); 305: fftKernel2S(a + 5, a + 21, direction); 306: fftKernel2S(a + 6, a + 22, direction); 307: fftKernel2S(a + 7, a + 23, direction); 308: fftKernel2S(a + 8, a + 24, direction); 309: fftKernel2S(a + 9, a + 25, direction); 310: fftKernel2S(a + 10, a + 26, direction); 311: fftKernel2S(a + 11, a + 27, direction); 312: fftKernel2S(a + 12, a + 28, direction); 313: fftKernel2S(a + 13, a + 29, direction); 314: fftKernel2S(a + 14, a + 30, direction); 315: fftKernel2S(a + 15, a + 31, direction); 316: 317: temp = complex_ctr( 318: 0.98078528040323043, 319: 0.19509032201612825 320: ); 321: a[17] = complex_mul(a[17], temp); 322: temp = complex_ctr( 323: 0.92387953251128674, 324: 0.38268343236508978 325: ); 326: a[18] = complex_mul(a[18], temp); 327: temp = complex_ctr( 328: 0.83146961230254524, 329: 0.55557023301960218 330: ); 331: a[19] = complex_mul(a[19], temp); 332: temp = complex_ctr( 333: 0.70710678118654757, 334: 0.70710678118654746 335: ); 336: a[20] = complex_mul(a[20], temp); 337: temp = complex_ctr( 338: 0.55557023301960229, 339: 0.83146961230254524 340: ); 341: a[21] = complex_mul(a[21], temp); 342: temp = complex_ctr( 343: 0.38268343236508984, 344: 0.92387953251128674 345: ); 346: a[22] = complex_mul(a[22], temp); 347: temp = complex_ctr( 348: 0.19509032201612833, 349: 0.98078528040323043 350: ); 351: a[23] = complex_mul(a[23], temp); 352: temp = complex_ctr( 353: 
6.123233995736766e-17, 354: 1.0 355: ); 356: a[24] = complex_mul(a[24], temp); 357: temp = complex_ctr( 358: -0.19509032201612819, 359: 0.98078528040323043 360: ); 361: a[25] = complex_mul(a[25], temp); 362: temp = complex_ctr( 363: -0.38268343236508973, 364: 0.92387953251128674 365: ); 366: a[26] = complex_mul(a[26], temp); 367: temp = complex_ctr( 368: -0.55557023301960196, 369: 0.83146961230254546 370: ); 371: a[27] = complex_mul(a[27], temp); 372: temp = complex_ctr( 373: -0.70710678118654746, 374: 0.70710678118654757 375: ); 376: a[28] = complex_mul(a[28], temp); 377: temp = complex_ctr( 378: -0.83146961230254535, 379: 0.55557023301960218 380: ); 381: a[29] = complex_mul(a[29], temp); 382: temp = complex_ctr( 383: -0.92387953251128674, 384: 0.38268343236508989 385: ); 386: a[30] = complex_mul(a[30], temp); 387: temp = complex_ctr( 388: -0.98078528040323043, 389: 0.19509032201612861 390: ); 391: a[31] = complex_mul(a[31], temp); 392: 393: fftKernel16(a, direction); 394: fftKernel16(a + 16, direction); 395: bitreverse32(a); 396:} 397: 398:// Calculates input and output weights for the Bluestein's algorithm 399:WITHIN_KERNEL complex_t xweight(int dir_coeff, int pos) 400:{ 401: // The modulo of 2 * fft_size_real does not change the result, 402: // but greatly improves the precision by keeping the argument of sin()/cos() small. 403: return complex_exp(dir_coeff * 4.7936899621426287e-05 * 404: ((pos * pos) % (2 * 65536)) ); 405:} 406: 407: 408: 409: 410: 411:KERNEL void fft_global(SIGNATURE) 412:{ 413: VIRTUAL_SKIP_THREADS; 414: 415: 416: 417: LOCAL_MEM real_t lmem[1032]; 418: size_t lmem_store_index, lmem_load_index; 419: 420: complex_t a[16]; 421: 422: int thread_id = virtual_local_id(0); 423: int group_id = virtual_group_id(0); 424: 425: int direction = _leaf_direction; 426: 427: int norm_coeff = direction == 1 ? 
1 : 1; 428: 429: 430: int xform_global = group_id / 64; 431: int group_in_xform = group_id % 64; 432: int xform_local = thread_id / 8; 433: int thread_in_xform = thread_id % 8; 434: 435: int position_in_stride_in = thread_in_xform + group_in_xform * 8; 436: int xform_number = xform_global * 1; 437: 438: 439: { 440: int stride_in_number = xform_local + 0; 441: int position = position_in_stride_in + 512 * stride_in_number; 442: 443: 444: a[0] = _LOAD_input(position + 65536 * xform_number); 445: 446: } 447: { 448: int stride_in_number = xform_local + 8; 449: int position = position_in_stride_in + 512 * stride_in_number; 450: 451: 452: a[1] = _LOAD_input(position + 65536 * xform_number); 453: 454: } 455: { 456: int stride_in_number = xform_local + 16; 457: int position = position_in_stride_in + 512 * stride_in_number; 458: 459: 460: a[2] = _LOAD_input(position + 65536 * xform_number); 461: 462: } 463: { 464: int stride_in_number = xform_local + 24; 465: int position = position_in_stride_in + 512 * stride_in_number; 466: 467: 468: a[3] = _LOAD_input(position + 65536 * xform_number); 469: 470: } 471: { 472: int stride_in_number = xform_local + 32; 473: int position = position_in_stride_in + 512 * stride_in_number; 474: 475: 476: a[4] = _LOAD_input(position + 65536 * xform_number); 477: 478: } 479: { 480: int stride_in_number = xform_local + 40; 481: int position = position_in_stride_in + 512 * stride_in_number; 482: 483: 484: a[5] = _LOAD_input(position + 65536 * xform_number); 485: 486: } 487: { 488: int stride_in_number = xform_local + 48; 489: int position = position_in_stride_in + 512 * stride_in_number; 490: 491: 492: a[6] = _LOAD_input(position + 65536 * xform_number); 493: 494: } 495: { 496: int stride_in_number = xform_local + 56; 497: int position = position_in_stride_in + 512 * stride_in_number; 498: 499: 500: a[7] = _LOAD_input(position + 65536 * xform_number); 501: 502: } 503: { 504: int stride_in_number = xform_local + 64; 505: int position = 
position_in_stride_in + 512 * stride_in_number; 506: 507: 508: a[8] = _LOAD_input(position + 65536 * xform_number); 509: 510: } 511: { 512: int stride_in_number = xform_local + 72; 513: int position = position_in_stride_in + 512 * stride_in_number; 514: 515: 516: a[9] = _LOAD_input(position + 65536 * xform_number); 517: 518: } 519: { 520: int stride_in_number = xform_local + 80; 521: int position = position_in_stride_in + 512 * stride_in_number; 522: 523: 524: a[10] = _LOAD_input(position + 65536 * xform_number); 525: 526: } 527: { 528: int stride_in_number = xform_local + 88; 529: int position = position_in_stride_in + 512 * stride_in_number; 530: 531: 532: a[11] = _LOAD_input(position + 65536 * xform_number); 533: 534: } 535: { 536: int stride_in_number = xform_local + 96; 537: int position = position_in_stride_in + 512 * stride_in_number; 538: 539: 540: a[12] = _LOAD_input(position + 65536 * xform_number); 541: 542: } 543: { 544: int stride_in_number = xform_local + 104; 545: int position = position_in_stride_in + 512 * stride_in_number; 546: 547: 548: a[13] = _LOAD_input(position + 65536 * xform_number); 549: 550: } 551: { 552: int stride_in_number = xform_local + 112; 553: int position = position_in_stride_in + 512 * stride_in_number; 554: 555: 556: a[14] = _LOAD_input(position + 65536 * xform_number); 557: 558: } 559: { 560: int stride_in_number = xform_local + 120; 561: int position = position_in_stride_in + 512 * stride_in_number; 562: 563: 564: a[15] = _LOAD_input(position + 65536 * xform_number); 565: 566: } 567: 568: fftKernel16(a, direction); 569: 570: { 571: real_t ang; 572: complex_t w; 573: 574: ang = 0.04908738521234052 * xform_local * direction; 575: w = complex_exp(ang); 576: a[1] = complex_mul(a[1], w); 577: ang = 0.09817477042468103 * xform_local * direction; 578: w = complex_exp(ang); 579: a[2] = complex_mul(a[2], w); 580: ang = 0.14726215563702155 * xform_local * direction; 581: w = complex_exp(ang); 582: a[3] = complex_mul(a[3], w); 583: ang 
= 0.19634954084936207 * xform_local * direction; 584: w = complex_exp(ang); 585: a[4] = complex_mul(a[4], w); 586: ang = 0.2454369260617026 * xform_local * direction; 587: w = complex_exp(ang); 588: a[5] = complex_mul(a[5], w); 589: ang = 0.2945243112740431 * xform_local * direction; 590: w = complex_exp(ang); 591: a[6] = complex_mul(a[6], w); 592: ang = 0.3436116964863836 * xform_local * direction; 593: w = complex_exp(ang); 594: a[7] = complex_mul(a[7], w); 595: ang = 0.39269908169872414 * xform_local * direction; 596: w = complex_exp(ang); 597: a[8] = complex_mul(a[8], w); 598: ang = 0.44178646691106466 * xform_local * direction; 599: w = complex_exp(ang); 600: a[9] = complex_mul(a[9], w); 601: ang = 0.4908738521234052 * xform_local * direction; 602: w = complex_exp(ang); 603: a[10] = complex_mul(a[10], w); 604: ang = 0.5399612373357456 * xform_local * direction; 605: w = complex_exp(ang); 606: a[11] = complex_mul(a[11], w); 607: ang = 0.5890486225480862 * xform_local * direction; 608: w = complex_exp(ang); 609: a[12] = complex_mul(a[12], w); 610: ang = 0.6381360077604268 * xform_local * direction; 611: w = complex_exp(ang); 612: a[13] = complex_mul(a[13], w); 613: ang = 0.6872233929727672 * xform_local * direction; 614: w = complex_exp(ang); 615: a[14] = complex_mul(a[14], w); 616: ang = 0.7363107781851077 * xform_local * direction; 617: w = complex_exp(ang); 618: a[15] = complex_mul(a[15], w); 619: } 620: 621: lmem_store_index = thread_id; 622: lmem_load_index = mad24(xform_local, 128, thread_in_xform); 623: 624: lmem[lmem_store_index + 0] = a[0].x; 625: lmem[lmem_store_index + 64] = a[1].x; 626: lmem[lmem_store_index + 128] = a[2].x; 627: lmem[lmem_store_index + 192] = a[3].x; 628: lmem[lmem_store_index + 256] = a[4].x; 629: lmem[lmem_store_index + 320] = a[5].x; 630: lmem[lmem_store_index + 384] = a[6].x; 631: lmem[lmem_store_index + 448] = a[7].x; 632: lmem[lmem_store_index + 512] = a[8].x; 633: lmem[lmem_store_index + 576] = a[9].x; 634: 
lmem[lmem_store_index + 640] = a[10].x; 635: lmem[lmem_store_index + 704] = a[11].x; 636: lmem[lmem_store_index + 768] = a[12].x; 637: lmem[lmem_store_index + 832] = a[13].x; 638: lmem[lmem_store_index + 896] = a[14].x; 639: lmem[lmem_store_index + 960] = a[15].x; 640: LOCAL_BARRIER; 641: 642: a[0].x = 643: lmem[lmem_load_index + 0]; 644: a[1].x = 645: lmem[lmem_load_index + 8]; 646: a[2].x = 647: lmem[lmem_load_index + 16]; 648: a[3].x = 649: lmem[lmem_load_index + 24]; 650: a[4].x = 651: lmem[lmem_load_index + 32]; 652: a[5].x = 653: lmem[lmem_load_index + 40]; 654: a[6].x = 655: lmem[lmem_load_index + 48]; 656: a[7].x = 657: lmem[lmem_load_index + 56]; 658: a[8].x = 659: lmem[lmem_load_index + 64]; 660: a[9].x = 661: lmem[lmem_load_index + 72]; 662: a[10].x = 663: lmem[lmem_load_index + 80]; 664: a[11].x = 665: lmem[lmem_load_index + 88]; 666: a[12].x = 667: lmem[lmem_load_index + 96]; 668: a[13].x = 669: lmem[lmem_load_index + 104]; 670: a[14].x = 671: lmem[lmem_load_index + 112]; 672: a[15].x = 673: lmem[lmem_load_index + 120]; 674: LOCAL_BARRIER; 675: lmem[lmem_store_index + 0] = a[0].y; 676: lmem[lmem_store_index + 64] = a[1].y; 677: lmem[lmem_store_index + 128] = a[2].y; 678: lmem[lmem_store_index + 192] = a[3].y; 679: lmem[lmem_store_index + 256] = a[4].y; 680: lmem[lmem_store_index + 320] = a[5].y; 681: lmem[lmem_store_index + 384] = a[6].y; 682: lmem[lmem_store_index + 448] = a[7].y; 683: lmem[lmem_store_index + 512] = a[8].y; 684: lmem[lmem_store_index + 576] = a[9].y; 685: lmem[lmem_store_index + 640] = a[10].y; 686: lmem[lmem_store_index + 704] = a[11].y; 687: lmem[lmem_store_index + 768] = a[12].y; 688: lmem[lmem_store_index + 832] = a[13].y; 689: lmem[lmem_store_index + 896] = a[14].y; 690: lmem[lmem_store_index + 960] = a[15].y; 691: LOCAL_BARRIER; 692: 693: a[0].y = 694: lmem[lmem_load_index + 0]; 695: a[1].y = 696: lmem[lmem_load_index + 8]; 697: a[2].y = 698: lmem[lmem_load_index + 16]; 699: a[3].y = 700: lmem[lmem_load_index + 24]; 701: a[4].y 
= 702: lmem[lmem_load_index + 32]; 703: a[5].y = 704: lmem[lmem_load_index + 40]; 705: a[6].y = 706: lmem[lmem_load_index + 48]; 707: a[7].y = 708: lmem[lmem_load_index + 56]; 709: a[8].y = 710: lmem[lmem_load_index + 64]; 711: a[9].y = 712: lmem[lmem_load_index + 72]; 713: a[10].y = 714: lmem[lmem_load_index + 80]; 715: a[11].y = 716: lmem[lmem_load_index + 88]; 717: a[12].y = 718: lmem[lmem_load_index + 96]; 719: a[13].y = 720: lmem[lmem_load_index + 104]; 721: a[14].y = 722: lmem[lmem_load_index + 112]; 723: a[15].y = 724: lmem[lmem_load_index + 120]; 725: LOCAL_BARRIER; 726: 727: fftKernel8(a + 0, direction); 728: fftKernel8(a + 8, direction); 729: 730: { 731: real_t ang1, ang; 732: complex_t w; 733: 734: int l = (group_in_xform * 8 + thread_in_xform) / 1; 735: int k = xform_local * 2; 736: ang1 = 9.587379924285257e-05 * l * direction; 737: ang = ang1 * (k + 0); 738: w = complex_exp(ang); 739: a[0] = complex_mul(a[0], w); 740: ang = ang1 * (k + 16); 741: w = complex_exp(ang); 742: a[1] = complex_mul(a[1], w); 743: ang = ang1 * (k + 32); 744: w = complex_exp(ang); 745: a[2] = complex_mul(a[2], w); 746: ang = ang1 * (k + 48); 747: w = complex_exp(ang); 748: a[3] = complex_mul(a[3], w); 749: ang = ang1 * (k + 64); 750: w = complex_exp(ang); 751: a[4] = complex_mul(a[4], w); 752: ang = ang1 * (k + 80); 753: w = complex_exp(ang); 754: a[5] = complex_mul(a[5], w); 755: ang = ang1 * (k + 96); 756: w = complex_exp(ang); 757: a[6] = complex_mul(a[6], w); 758: ang = ang1 * (k + 112); 759: w = complex_exp(ang); 760: a[7] = complex_mul(a[7], w); 761: ang = ang1 * (k + 1); 762: w = complex_exp(ang); 763: a[8] = complex_mul(a[8], w); 764: ang = ang1 * (k + 17); 765: w = complex_exp(ang); 766: a[9] = complex_mul(a[9], w); 767: ang = ang1 * (k + 33); 768: w = complex_exp(ang); 769: a[10] = complex_mul(a[10], w); 770: ang = ang1 * (k + 49); 771: w = complex_exp(ang); 772: a[11] = complex_mul(a[11], w); 773: ang = ang1 * (k + 65); 774: w = complex_exp(ang); 775: a[12] = 
complex_mul(a[12], w); 776: ang = ang1 * (k + 81); 777: w = complex_exp(ang); 778: a[13] = complex_mul(a[13], w); 779: ang = ang1 * (k + 97); 780: w = complex_exp(ang); 781: a[14] = complex_mul(a[14], w); 782: ang = ang1 * (k + 113); 783: w = complex_exp(ang); 784: a[15] = complex_mul(a[15], w); 785: } 786: 787: lmem_store_index = mad24(thread_in_xform, 129, xform_local * 2); 788: lmem_load_index = mad24(thread_id / 128, 129, thread_id % 128); 789: 790: lmem[lmem_store_index + 0] = a[0].x; 791: lmem[lmem_store_index + 16] = a[1].x; 792: lmem[lmem_store_index + 32] = a[2].x; 793: lmem[lmem_store_index + 48] = a[3].x; 794: lmem[lmem_store_index + 64] = a[4].x; 795: lmem[lmem_store_index + 80] = a[5].x; 796: lmem[lmem_store_index + 96] = a[6].x; 797: lmem[lmem_store_index + 112] = a[7].x; 798: lmem[lmem_store_index + 1] = a[8].x; 799: lmem[lmem_store_index + 17] = a[9].x; 800: lmem[lmem_store_index + 33] = a[10].x; 801: lmem[lmem_store_index + 49] = a[11].x; 802: lmem[lmem_store_index + 65] = a[12].x; 803: lmem[lmem_store_index + 81] = a[13].x; 804: lmem[lmem_store_index + 97] = a[14].x; 805: lmem[lmem_store_index + 113] = a[15].x; 806: LOCAL_BARRIER; 807: 808: 809: a[0].x = lmem[lmem_load_index + 0]; 810: a[1].x = lmem[lmem_load_index + 64]; 811: a[2].x = lmem[lmem_load_index + 129]; 812: a[3].x = lmem[lmem_load_index + 193]; 813: a[4].x = lmem[lmem_load_index + 258]; 814: a[5].x = lmem[lmem_load_index + 322]; 815: a[6].x = lmem[lmem_load_index + 387]; 816: a[7].x = lmem[lmem_load_index + 451]; 817: a[8].x = lmem[lmem_load_index + 516]; 818: a[9].x = lmem[lmem_load_index + 580]; 819: a[10].x = lmem[lmem_load_index + 645]; 820: a[11].x = lmem[lmem_load_index + 709]; 821: a[12].x = lmem[lmem_load_index + 774]; 822: a[13].x = lmem[lmem_load_index + 838]; 823: a[14].x = lmem[lmem_load_index + 903]; 824: a[15].x = lmem[lmem_load_index + 967]; 825: LOCAL_BARRIER; 826: lmem[lmem_store_index + 0] = a[0].y; 827: lmem[lmem_store_index + 16] = a[1].y; 828: lmem[lmem_store_index 
+ 32] = a[2].y; 829: lmem[lmem_store_index + 48] = a[3].y; 830: lmem[lmem_store_index + 64] = a[4].y; 831: lmem[lmem_store_index + 80] = a[5].y; 832: lmem[lmem_store_index + 96] = a[6].y; 833: lmem[lmem_store_index + 112] = a[7].y; 834: lmem[lmem_store_index + 1] = a[8].y; 835: lmem[lmem_store_index + 17] = a[9].y; 836: lmem[lmem_store_index + 33] = a[10].y; 837: lmem[lmem_store_index + 49] = a[11].y; 838: lmem[lmem_store_index + 65] = a[12].y; 839: lmem[lmem_store_index + 81] = a[13].y; 840: lmem[lmem_store_index + 97] = a[14].y; 841: lmem[lmem_store_index + 113] = a[15].y; 842: LOCAL_BARRIER; 843: 844: 845: a[0].y = lmem[lmem_load_index + 0]; 846: a[1].y = lmem[lmem_load_index + 64]; 847: a[2].y = lmem[lmem_load_index + 129]; 848: a[3].y = lmem[lmem_load_index + 193]; 849: a[4].y = lmem[lmem_load_index + 258]; 850: a[5].y = lmem[lmem_load_index + 322]; 851: a[6].y = lmem[lmem_load_index + 387]; 852: a[7].y = lmem[lmem_load_index + 451]; 853: a[8].y = lmem[lmem_load_index + 516]; 854: a[9].y = lmem[lmem_load_index + 580]; 855: a[10].y = lmem[lmem_load_index + 645]; 856: a[11].y = lmem[lmem_load_index + 709]; 857: a[12].y = lmem[lmem_load_index + 774]; 858: a[13].y = lmem[lmem_load_index + 838]; 859: a[14].y = lmem[lmem_load_index + 903]; 860: a[15].y = lmem[lmem_load_index + 967]; 861: LOCAL_BARRIER; 862: 863: int position_in_stride_out = (group_in_xform * 8) % 1; 864: int stride_out_number = (group_in_xform * 8) / 1; 865: int idx = stride_out_number * 128 + position_in_stride_out + thread_id + 866: 65536 * xform_number; 867: 868: { 869: int position = stride_out_number * 128 + 0 + 870: position_in_stride_out + thread_id; 871: _STORE__temp0(position + 65536 * xform_number, 872: complex_div_scalar(a[0], norm_coeff)); 873: } 874: { 875: int position = stride_out_number * 128 + 64 + 876: position_in_stride_out + thread_id; 877: _STORE__temp0(position + 65536 * xform_number, 878: complex_div_scalar(a[1], norm_coeff)); 879: } 880: { 881: int position = 
stride_out_number * 128 + 128 + 882: position_in_stride_out + thread_id; 883: _STORE__temp0(position + 65536 * xform_number, 884: complex_div_scalar(a[2], norm_coeff)); 885: } 886: { 887: int position = stride_out_number * 128 + 192 + 888: position_in_stride_out + thread_id; 889: _STORE__temp0(position + 65536 * xform_number, 890: complex_div_scalar(a[3], norm_coeff)); 891: } 892: { 893: int position = stride_out_number * 128 + 256 + 894: position_in_stride_out + thread_id; 895: _STORE__temp0(position + 65536 * xform_number, 896: complex_div_scalar(a[4], norm_coeff)); 897: } 898: { 899: int position = stride_out_number * 128 + 320 + 900: position_in_stride_out + thread_id; 901: _STORE__temp0(position + 65536 * xform_number, 902: complex_div_scalar(a[5], norm_coeff)); 903: } 904: { 905: int position = stride_out_number * 128 + 384 + 906: position_in_stride_out + thread_id; 907: _STORE__temp0(position + 65536 * xform_number, 908: complex_div_scalar(a[6], norm_coeff)); 909: } 910: { 911: int position = stride_out_number * 128 + 448 + 912: position_in_stride_out + thread_id; 913: _STORE__temp0(position + 65536 * xform_number, 914: complex_div_scalar(a[7], norm_coeff)); 915: } 916: { 917: int position = stride_out_number * 128 + 512 + 918: position_in_stride_out + thread_id; 919: _STORE__temp0(position + 65536 * xform_number, 920: complex_div_scalar(a[8], norm_coeff)); 921: } 922: { 923: int position = stride_out_number * 128 + 576 + 924: position_in_stride_out + thread_id; 925: _STORE__temp0(position + 65536 * xform_number, 926: complex_div_scalar(a[9], norm_coeff)); 927: } 928: { 929: int position = stride_out_number * 128 + 640 + 930: position_in_stride_out + thread_id; 931: _STORE__temp0(position + 65536 * xform_number, 932: complex_div_scalar(a[10], norm_coeff)); 933: } 934: { 935: int position = stride_out_number * 128 + 704 + 936: position_in_stride_out + thread_id; 937: _STORE__temp0(position + 65536 * xform_number, 938: complex_div_scalar(a[11], norm_coeff)); 
939: } 940: { 941: int position = stride_out_number * 128 + 768 + 942: position_in_stride_out + thread_id; 943: _STORE__temp0(position + 65536 * xform_number, 944: complex_div_scalar(a[12], norm_coeff)); 945: } 946: { 947: int position = stride_out_number * 128 + 832 + 948: position_in_stride_out + thread_id; 949: _STORE__temp0(position + 65536 * xform_number, 950: complex_div_scalar(a[13], norm_coeff)); 951: } 952: { 953: int position = stride_out_number * 128 + 896 + 954: position_in_stride_out + thread_id; 955: _STORE__temp0(position + 65536 * xform_number, 956: complex_div_scalar(a[14], norm_coeff)); 957: } 958: { 959: int position = stride_out_number * 128 + 960 + 960: position_in_stride_out + thread_id; 961: _STORE__temp0(position + 65536 * xform_number, 962: complex_div_scalar(a[15], norm_coeff)); 963: } 964:} 965: 966: Traceback (most recent call last): File "/home/tnorth/exp_sim/confparser.py", line 157, in <module> exp.run(iter) File "/home/tnorth/exp_sim/experiment.py", line 128, in run previous_device.run() File "/home/tnorth/exp_sim/sparam.py", line 73, in add result = method(self, *args, **kw) File "/home/tnorth/exp_sim/sparam.py", line 82, in timed result = method(self, *args, **kw) File "/home/tnorth/exp_sim/fiber.py", line 95, in run self.propag_functions[with_gpu][coupled][self.propag_func](self.EtIn['fiber_in']) File "/home/tnorth/exp_sim/fiber.py", line 898, in RK4IP_method_GPU NLOpsAf.prepare_for(Af__gpu, A_gpu, 1) File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/core/computation.py", line 207, in prepare_for self._operations = self._construct_operations(self._basis, self._ctx.device_params) File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/fft.py", line 554, in _construct_operations inplace=([(mem_out, mem_in)] if kernel.inplace_possible else None)) File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/core/operation.py", line 99, in add_kernel 
op.prepare(self._ctx, self._tr_tree) File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/core/operation.py", line 195, in prepare self.global_size, local_size=self.local_size) File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/cluda/ocl.py", line 153, in compile_static local_size=local_size, render_kwds=render_kwds) File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/cluda/ocl.py", line 264, in __init__ stub_module = ctx._compile(str(prelude + stub_vsize_funcs + src)) File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/cluda/ocl.py", line 140, in _compile module = cl.Program(self._context, src).build(options=options) File "/usr/lib64/python2.7/site-packages/pyopencl-2011.2-py2.7-linux-x86_64.egg/pyopencl/__init__.py", line 124, in build cache_dir=cache_dir) File "/usr/lib64/python2.7/site-packages/pyopencl-2011.2-py2.7-linux-x86_64.egg/pyopencl/cache.py", line 460, in create_built_program_from_source_cached ctx, src, options, devices, cache_dir) File "/usr/lib64/python2.7/site-packages/pyopencl-2011.2-py2.7-linux-x86_64.egg/pyopencl/cache.py", line 384, in _create_built_program_from_source_cached prg.build(options, [devices[i] for i in to_be_built_indices]) File "/usr/lib64/python2.7/site-packages/pyopencl-2011.2-py2.7-linux-x86_64.egg/pyopencl/__init__.py", line 377, in program_build raise err pyopencl.RuntimeError: clBuildProgram failed: build program failure - Build on <pyopencl.Device 'GeForce GTX 550 Ti' on 'NVIDIA CUDA' at 0x2cf9020>: :135:13: error: call to 'native_cos' is ambiguous res.x = native_cos(ang); ^~~~~~~~~~ <built-in>:870:24: note: candidate function float __OVERLOADABLE__ native_cos(float); ^ <built-in>:871:25: note: candidate function float2 __OVERLOADABLE__ native_cos(float2); ^ <built-in>:873:25: note: candidate function float3 __OVERLOADABLE__ native_cos(float3); ^ <built-in>:875:25: note: candidate function float4 __OVERLOADABLE__ 
native_cos(float4); ^ <built-in>:876:25: note: candidate function float8 __OVERLOADABLE__ native_cos(float8); ^ <built-in>:877:26: note: candidate function float16 __OVERLOADABLE__ native_cos(float16); ^ :136:13: error: call to 'native_sin' is ambiguous res.y = native_sin(ang); ^~~~~~~~~~ <built-in>:950:24: note: candidate function float __OVERLOADABLE__ native_sin(float); ^ <built-in>:951:25: note: candidate function float2 __OVERLOADABLE__ native_sin(float2); ^ <built-in>:953:25: note: candidate function float3 __OVERLOADABLE__ native_sin(float3); ^ <built-in>:955:25: note: candidate function float4 __OVERLOADABLE__ native_sin(float4); ^ <built-in>:956:25: note: candidate function float8 __OVERLOADABLE__ native_sin(float8); ^ <built-in>:957:26: note: candidate function float16 __OVERLOADABLE__ native_sin(float16);
That's why I should run tests after bugfixes. It seems that native_sin() and native_cos() do not work with double, only float.
native_sin()
native_cos()
double
float
Thank you! (I didn't know that.)
This issue seems related to the last few commits (it didn't happen before), but reading the diffs I can hardly see where it comes from.
The compilation error is pasted below. That's a lot of text, sorry, but it looks like I can't attach a file here.
Also, there are some additional definitions that I added for exp() and so on. They don't use native_{cos,sin}, so I guess the problem is not related to them.