fjarri / reikna

Pure Python GPGPU library
http://reikna.publicfields.net/
MIT License
164 stars 16 forks source link

Fast math issue: call to 'native_cos' is ambiguous #3

Closed tnorth closed 11 years ago

tnorth commented 11 years ago

This issue seems to be related to the last few commits (it didn't happen before), but from reading the diffs I can't tell where it comes from.

The compilation error is pasted below. Sorry, it's a lot of text, but it looks like I can't attach a file here.

Also, there are some additional definitions that I added for exp() and so on. They don't use native_{cos,sin}, so I guess the problem is not related to them.

ERROR:root:Failed to compile:
1:
2:// CTX_FAST_MATH selects the native_cos()/native_sin() branch of complex_exp() further down in this listing.
3:#define CTX_FAST_MATH
4:
5:    // taken from pyopencl._cluda
6:    #define LOCAL_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
7:
8:    // 'static' helps to avoid the "no previous prototype for function" warning
9:    #if PYOPENCL_CL_VERSION >= 0x1020
10:    #define WITHIN_KERNEL static
11:    #else
12:    #define WITHIN_KERNEL
13:    #endif
14:    // Backend-neutral spellings of the OpenCL kernel and address-space qualifiers.
15:    #define KERNEL __kernel
16:    #define GLOBAL_MEM __global
17:    #define LOCAL_MEM __local
18:    #define LOCAL_MEM_DYNAMIC __local
19:    #define LOCAL_MEM_ARG __local
20:    #define INLINE inline
21:    // Enable double precision through whichever fp64 extension macro is defined.
22:    #if defined(cl_khr_fp64)
23:    #pragma OPENCL EXTENSION cl_khr_fp64: enable
24:    #elif defined(cl_amd_fp64)
25:    #pragma OPENCL EXTENSION cl_amd_fp64: enable
26:    #endif
27:
28:    // COMPLEX_CTR(T)(re, im) expands to the vector literal (T)(re, im).
29:    #define COMPLEX_CTR(T) (T)
30:
31:
32:    // Complex exponential of a double2 (x = real part, y = imaginary part): exp(a.x) * (cos(a.y), sin(a.y)).
33:    WITHIN_KERNEL double2 cdouble_exp(double2 (a))
34:    {
35:         double expr = exp(a.x); // magnitude of the result
36:         double cosi; // receives cos(a.y) from sincos()
37:         double sini = sincos(a.y, &cosi); // OpenCL sincos() returns the sine and stores the cosine through the pointer
38:         return COMPLEX_CTR(double2) (expr * cosi, expr * sini);
39:    }
40:    WITHIN_KERNEL float2 cfloat_exp(float2 (a))
41:    { // Returns exp(a.x) * (cos(a.y), sin(a.y)) as a float2 (x = real part, y = imaginary part).
42:         float expr = exp(a.x); // magnitude of the result
43:         float cosi; // receives cos(a.y) from sincos()
44:         float sini = sincos(a.y, &cosi); // OpenCL sincos() returns the sine and stores the cosine through the pointer
45:         return COMPLEX_CTR(float2) (expr * cosi, expr * sini);
46:    }
47:
48:
49:// In this listing the virtual_* work-item queries are plain aliases for the OpenCL built-ins.
50:#define virtual_local_id get_local_id
51:#define virtual_local_size get_local_size
52:#define virtual_group_id get_group_id
53:#define virtual_num_groups get_num_groups
54:#define virtual_global_id get_global_id
55:#define virtual_global_size get_global_size
56:// Returns the product of the global sizes of dimensions 0, 1 and 2, converted to int.
57:WITHIN_KERNEL int virtual_global_flat_size()
58:{
59:    return get_global_size(0) * get_global_size(1) * get_global_size(2); // get_global_size() yields size_t; the product is converted to int on return
60:}
61:// Returns the linearized global work-item index id0 + id1 * size0 + id2 * size1 * size0 (dimension 0 varies fastest).
62:WITHIN_KERNEL int virtual_global_flat_id()
63:{
64:    return virtual_global_id(0) +
65:        virtual_global_id(1) * virtual_global_size(0) +
66:        virtual_global_id(2) * virtual_global_size(1) * virtual_global_size(0);
67:}
68:// VIRTUAL_SKIP_THREADS expands to nothing here, so the statement that uses it in fft_global is empty.
69:#define VIRTUAL_SKIP_THREADS
70:
71:
72:// Element read/write accessors for the _leaf__temp0 buffer named in SIGNATURE.
73:#define _LOAD__temp0(idx) (_leaf__temp0[idx])
74:
75:#define _STORE__temp0(idx, val) _leaf__temp0[idx] = (val)
76:
77:// leaf node input: element read accessor for the _leaf_input buffer
78:#define _LOAD_input(idx) (_leaf_input[idx])
79:// Parameter list of the fft_global kernel: two double2 buffers in global memory and the integer _leaf_direction.
80:#define SIGNATURE GLOBAL_MEM double2 *_leaf__temp0, GLOBAL_MEM double2 *_leaf_input,  int _leaf_direction
81:WITHIN_KERNEL double2 __div__double2__double2_double(
82:    double2 a, double b)
83:{
84:    // Divides both components of a by the scalar b; b == 0 is not checked.
85:    return COMPLEX_CTR(double2)(a.x / b, a.y / b);
86:}
87:
88:// Complex product of a and b, with .x as the real part and .y as the imaginary part.
89:WITHIN_KERNEL double2 __mul__double2__double2_double2(
90:    double2 a, double2 b)
91:{
92:    // (a.x + i a.y) * (b.x + i b.y) = (a.x b.x - a.y b.y) + i (a.x b.y + a.y b.x)
93:    return COMPLEX_CTR(double2)(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x);
94:}
95:
96:
97:
98:
99:
100:// When CUDA is defined, provide mad24, mad and mul24 as macros (plain multiply-add, and __mul24 for mul24).
101:#ifdef CUDA
102:#define mad24(x, y, z) ((x) * (y) + (z))
103:#define mad(x, y, z) ((x) * (y) + (z))
104:#define mul24(x, y) __mul24(x, y)
105:#endif
106:
107:
108:/*
109:#ifdef sincosf
110:#endif
111:#ifndef sincosf
112:#define complex_exp(res, ang) (res).x = native_cos(ang); (res).y = native_sin(ang)
113:#endif
114:*/
115:// Double-precision aliases used by the FFT code below. With .x real and .y imaginary, conj_transp(a) equals i * a and conj_transp_and_mul(a, b) equals i * a * b for a real b.
116:#define complex_ctr COMPLEX_CTR(double2)
117:#define complex_mul __mul__double2__double2_double2
118:#define complex_div_scalar __div__double2__double2_double
119:#define conj(a) complex_ctr((a).x, -(a).y)
120:#define conj_transp(a) complex_ctr(-(a).y, (a).x)
121:#define conj_transp_and_mul(a, b) complex_ctr(-(a).y * (b), (a).x * (b))
122:// Working types of this kernel: complex values are double2, real values are double.
123:typedef double2 complex_t;
124:typedef double real_t;
125:
126:// Returns (cos(ang), sin(ang)) as a complex_t, i.e. exp(i * ang); the branch used depends on CUDA and CTX_FAST_MATH.
127:WITHIN_KERNEL complex_t complex_exp(real_t ang)
128:{
129:    complex_t res;
130:    // NOTE(review): in this listing CTX_FAST_MATH is defined (line 3) and real_t is double (line 124), so unless CUDA is defined the native_* branch is the one compiled.
131:#ifdef CUDA
132:    sincos(ang, &((res).y), &((res).x)); // three-argument form: sine goes to res.y, cosine to res.x
133:#else
134:#ifdef CTX_FAST_MATH
135:    res.x = native_cos(ang); // OpenCL C declares native_cos/native_sin for float types only; a double argument matches no overload exactly, hence "call to 'native_cos' is ambiguous"
136:    res.y = native_sin(ang); // same problem as the line above
137:#else
138:    real_t tmp; // receives the cosine from sincos()
139:    res.y = sincos(ang, &tmp); // OpenCL sincos() returns the sine and stores the cosine through the pointer
140:    res.x = tmp;
141:#endif
142:#endif
143:    return res;
144:}
145:// Exchanges the two complex values that a and b point to.
146:WITHIN_KERNEL void swap(complex_t *a, complex_t *b)
147:{
148:    complex_t c = *a; // temporary copy of *a
149:    *a = *b;
150:    *b = c;
151:}
152:
153:// Rotates five values one position to the right, in place: on return
154:// (*a1, *a2, *a3, *a4, *a5) holds the former (*a5, *a1, *a2, *a3, *a4).
155:WITHIN_KERNEL void shift32(
156:    complex_t *a1, complex_t *a2, complex_t *a3, complex_t *a4, complex_t *a5)
157:{
158:    complex_t c1, c2; // two temporaries used alternately to carry the value being displaced
159:    c1 = *a2;
160:    *a2 = *a1;
161:    c2 = *a3;
162:    *a3 = c1;
163:    c1 = *a4;
164:    *a4 = c2;
165:    c2 = *a5;
166:    *a5 = c1;
167:    *a1 = c2; // the old *a5 wraps around to the front
168:}
169:// In-place 2-point butterfly: (a[0], a[1]) becomes (a[0] + a[1], a[0] - a[1]). The fftKernel2 macro below accepts a direction argument and discards it.
170:WITHIN_KERNEL void _fftKernel2(complex_t *a)
171:{
172:    complex_t c = a[0]; // keep the old a[0] for the difference
173:    a[0] = c + a[1];
174:    a[1] = c - a[1];
175:}
176:#define fftKernel2(a, direction) _fftKernel2(a)
177:// In-place 2-point butterfly on two separately addressed values: (*d1, *d2) becomes (*d1 + *d2, *d1 - *d2). The fftKernel2S macro below accepts a direction argument and discards it.
178:WITHIN_KERNEL void _fftKernel2S(complex_t *d1, complex_t *d2)
179:{
180:    complex_t c = *d1; // keep the old *d1 for the difference
181:    *d1 = c + *d2;
182:    *d2 = c - *d2;
183:}
184:#define fftKernel2S(d1, d2, direction) _fftKernel2S(d1, d2)
185:// In-place 4-element transform of a[0..3]: fftKernel2S butterflies plus a multiplication of a[3] by i * direction, finished by exchanging a[1] and a[2].
186:WITHIN_KERNEL void fftKernel4(complex_t *a, const int direction)
187:{
188:    fftKernel2S(a + 0, a + 2, direction);
189:    fftKernel2S(a + 1, a + 3, direction);
190:    fftKernel2S(a + 0, a + 1, direction);
191:    a[3] = conj_transp_and_mul(a[3], direction); // a[3] *= i * direction
192:    fftKernel2S(a + 2, a + 3, direction);
193:    swap(a + 1, a + 2); // reorder the two middle outputs
194:}
195:// Same sequence of operations as fftKernel4, but on four separately addressed values *a0..*a3 instead of a contiguous array.
196:WITHIN_KERNEL void fftKernel4s(complex_t *a0, complex_t *a1,
197:    complex_t *a2, complex_t *a3, const int direction)
198:{
199:    fftKernel2S(a0, a2, direction);
200:    fftKernel2S(a1, a3, direction);
201:    fftKernel2S(a0, a1, direction);
202:    *a3 = conj_transp_and_mul(*a3, direction); // *a3 *= i * direction
203:    fftKernel2S(a2, a3, direction);
204:    swap(a1, a2); // exchanges the values *a1 and *a2
205:}
206:// Reorders a[0..7] by 3-bit index reversal: only 1 <-> 4 and 3 <-> 6 move, the other indices are their own reversal.
207:WITHIN_KERNEL void bitreverse8(complex_t *a)
208:{
209:    swap(a + 1, a + 4); // 001 <-> 100
210:    swap(a + 3, a + 6); // 011 <-> 110
211:}
212:// In-place 8-element transform of a[0..7]: three rounds of four fftKernel2S butterflies with twiddle multiplications between the rounds, then bitreverse8().
213:WITHIN_KERNEL void fftKernel8(complex_t *a, const int direction)
214:{
215:    const complex_t w1  = complex_ctr( // (cos(pi/4), direction * sin(pi/4))
216:        0.70710678118654746,
217:        0.70710678118654746 * direction);
218:    const complex_t w3  = complex_ctr( // (cos(3*pi/4), direction * sin(3*pi/4))
219:        -0.70710678118654746,
220:        0.70710678118654746 * direction);
221:    fftKernel2S(a + 0, a + 4, direction); // round 1: pairs four apart
222:    fftKernel2S(a + 1, a + 5, direction);
223:    fftKernel2S(a + 2, a + 6, direction);
224:    fftKernel2S(a + 3, a + 7, direction);
225:    a[5] = complex_mul(w1, a[5]);
226:    a[6] = conj_transp_and_mul(a[6], direction); // a[6] *= i * direction
227:    a[7] = complex_mul(w3, a[7]);
228:    fftKernel2S(a + 0, a + 2, direction); // round 2: pairs two apart
229:    fftKernel2S(a + 1, a + 3, direction);
230:    fftKernel2S(a + 4, a + 6, direction);
231:    fftKernel2S(a + 5, a + 7, direction);
232:    a[3] = conj_transp_and_mul(a[3], direction); // a[3] *= i * direction
233:    a[7] = conj_transp_and_mul(a[7], direction); // a[7] *= i * direction
234:    fftKernel2S(a + 0, a + 1, direction); // round 3: adjacent pairs
235:    fftKernel2S(a + 2, a + 3, direction);
236:    fftKernel2S(a + 4, a + 5, direction);
237:    fftKernel2S(a + 6, a + 7, direction);
238:    bitreverse8(a);
239:}
240:// Swaps a[4*r + c] with a[4*c + r] for every r != c, i.e. transposes a[0..15] viewed as a row-major 4x4 matrix.
241:WITHIN_KERNEL void bitreverse4x4(complex_t *a)
242:{
243:    swap(a + 1, a + 4);
244:    swap(a + 2, a + 8);
245:    swap(a + 3, a + 12);
246:    swap(a + 6, a + 9);
247:    swap(a + 7, a + 13);
248:    swap(a + 11, a + 14);
249:}
250:// In-place 16-element transform of a[0..15]: four fftKernel4s over the stride-4 groups, twiddle multiplications, four fftKernel4 over the contiguous groups of four, then bitreverse4x4().
251:WITHIN_KERNEL void fftKernel16(complex_t *a, const int direction)
252:{
253:    complex_t temp; // holds the current twiddle factor
254:    const real_t w0 = 0.92387953251128674; // cos(pi/8)
255:    const real_t w1 = 0.38268343236508978; // sin(pi/8)
256:    const real_t w2 = 0.70710678118654746; // cos(pi/4) == sin(pi/4)
257:    fftKernel4s(a + 0, a + 4, a + 8,  a + 12, direction);
258:    fftKernel4s(a + 1, a + 5, a + 9,  a + 13, direction);
259:    fftKernel4s(a + 2, a + 6, a + 10, a + 14, direction);
260:    fftKernel4s(a + 3, a + 7, a + 11, a + 15, direction);
261:    // Twiddle factors built from w0, w1, w2; every imaginary part is scaled by direction.
262:    temp  = complex_ctr(w0, direction * w1);
263:    a[5]  = complex_mul(a[5], temp);
264:    temp  = complex_ctr(w1, direction * w0);
265:    a[7]  = complex_mul(a[7], temp);
266:    temp  = complex_ctr(w2, direction * w2);
267:    a[6]  = complex_mul(a[6], temp);
268:    a[9]  = complex_mul(a[9], temp);
269:
270:    a[10] = conj_transp_and_mul(a[10], direction); // a[10] *= i * direction
271:
272:    temp  = complex_ctr(-w2, direction * w2);
273:    a[11] = complex_mul(a[11], temp);
274:    a[14] = complex_mul(a[14], temp);
275:    temp  = complex_ctr(w1, direction * w0);
276:    a[13] = complex_mul(a[13], temp);
277:    temp  = complex_ctr(-w0, -direction * w1);
278:    a[15] = complex_mul(a[15], temp);
279:    // Second stage: one 4-element transform per contiguous group, then the final reordering.
280:    fftKernel4(a, direction);
281:    fftKernel4(a + 4, direction);
282:    fftKernel4(a + 8, direction);
283:    fftKernel4(a + 12, direction);
284:    bitreverse4x4(a);
285:}
286:// Permutes a[1..30] with six five-element shift32() rotations; a[0] and a[31] are not touched.
287:WITHIN_KERNEL void bitreverse32(complex_t *a)
288:{
289:    shift32(a + 1, a + 2, a + 4, a + 8, a + 16);
290:    shift32(a + 3, a + 6, a + 12, a + 24, a + 17);
291:    shift32(a + 5, a + 10, a + 20, a + 9, a + 18);
292:    shift32(a + 7, a + 14, a + 28, a + 25, a + 19);
293:    shift32(a + 11, a + 22, a + 13, a + 26, a + 21);
294:    shift32(a + 15, a + 30, a + 29, a + 27, a + 23);
295:}
296:// In-place 32-element transform of a[0..31]: sixteen fftKernel2S butterflies pairing a[k] with a[k + 16], twiddle multiplications on a[17..31], fftKernel16 on each half, then bitreverse32().
297:WITHIN_KERNEL void fftKernel32(complex_t *a, const int direction)
298:{
299:    complex_t temp; // holds the current twiddle factor
300:        fftKernel2S(a + 0, a + 16, direction);
301:        fftKernel2S(a + 1, a + 17, direction);
302:        fftKernel2S(a + 2, a + 18, direction);
303:        fftKernel2S(a + 3, a + 19, direction);
304:        fftKernel2S(a + 4, a + 20, direction);
305:        fftKernel2S(a + 5, a + 21, direction);
306:        fftKernel2S(a + 6, a + 22, direction);
307:        fftKernel2S(a + 7, a + 23, direction);
308:        fftKernel2S(a + 8, a + 24, direction);
309:        fftKernel2S(a + 9, a + 25, direction);
310:        fftKernel2S(a + 10, a + 26, direction);
311:        fftKernel2S(a + 11, a + 27, direction);
312:        fftKernel2S(a + 12, a + 28, direction);
313:        fftKernel2S(a + 13, a + 29, direction);
314:        fftKernel2S(a + 14, a + 30, direction);
315:        fftKernel2S(a + 15, a + 31, direction);
316:        // a[16 + k] is multiplied by (cos(k*pi/16), sin(k*pi/16)) for k = 1..15. NOTE(review): unlike fftKernel8/fftKernel16, the imaginary parts here are not scaled by direction -- confirm this is intended.
317:        temp = complex_ctr(
318:            0.98078528040323043,
319:            0.19509032201612825
320:        );
321:        a[17] = complex_mul(a[17], temp);
322:        temp = complex_ctr(
323:            0.92387953251128674,
324:            0.38268343236508978
325:        );
326:        a[18] = complex_mul(a[18], temp);
327:        temp = complex_ctr(
328:            0.83146961230254524,
329:            0.55557023301960218
330:        );
331:        a[19] = complex_mul(a[19], temp);
332:        temp = complex_ctr(
333:            0.70710678118654757,
334:            0.70710678118654746
335:        );
336:        a[20] = complex_mul(a[20], temp);
337:        temp = complex_ctr(
338:            0.55557023301960229,
339:            0.83146961230254524
340:        );
341:        a[21] = complex_mul(a[21], temp);
342:        temp = complex_ctr(
343:            0.38268343236508984,
344:            0.92387953251128674
345:        );
346:        a[22] = complex_mul(a[22], temp);
347:        temp = complex_ctr(
348:            0.19509032201612833,
349:            0.98078528040323043
350:        );
351:        a[23] = complex_mul(a[23], temp);
352:        temp = complex_ctr(
353:            6.123233995736766e-17, // cos(pi/2) as it comes out in double precision
354:            1.0
355:        );
356:        a[24] = complex_mul(a[24], temp);
357:        temp = complex_ctr(
358:            -0.19509032201612819,
359:            0.98078528040323043
360:        );
361:        a[25] = complex_mul(a[25], temp);
362:        temp = complex_ctr(
363:            -0.38268343236508973,
364:            0.92387953251128674
365:        );
366:        a[26] = complex_mul(a[26], temp);
367:        temp = complex_ctr(
368:            -0.55557023301960196,
369:            0.83146961230254546
370:        );
371:        a[27] = complex_mul(a[27], temp);
372:        temp = complex_ctr(
373:            -0.70710678118654746,
374:            0.70710678118654757
375:        );
376:        a[28] = complex_mul(a[28], temp);
377:        temp = complex_ctr(
378:            -0.83146961230254535,
379:            0.55557023301960218
380:        );
381:        a[29] = complex_mul(a[29], temp);
382:        temp = complex_ctr(
383:            -0.92387953251128674,
384:            0.38268343236508989
385:        );
386:        a[30] = complex_mul(a[30], temp);
387:        temp = complex_ctr(
388:            -0.98078528040323043,
389:            0.19509032201612861
390:        );
391:        a[31] = complex_mul(a[31], temp);
392:    // Second stage: a 16-element transform on each half, then the final reordering.
393:    fftKernel16(a, direction);
394:    fftKernel16(a + 16, direction);
395:    bitreverse32(a);
396:}
397:
398:// Calculates the input/output weight exp(i * ang) for Bluestein's algorithm, where ang = dir_coeff * 4.7936899621426287e-05 * ((pos * pos) % 131072).
399:WITHIN_KERNEL complex_t xweight(int dir_coeff, int pos)
400:{
401:    // The modulo of 2 * fft_size_real does not change the result,
402:    // but greatly improves the precision by keeping the argument of sin()/cos() small.
403:    return complex_exp(dir_coeff * 4.7936899621426287e-05 * // the constant is approximately pi / 65536
404:        ((pos * pos) % (2 * 65536)) ); // NOTE(review): pos * pos is an int product taken before the modulo and overflows for |pos| > 46340 -- confirm callers stay below that
405:}
406:
407:
408:
409:
410:
411:KERNEL void fft_global(SIGNATURE)
412:{
413:    VIRTUAL_SKIP_THREADS;
414:
415:    
416:
417:        LOCAL_MEM real_t lmem[1032];
418:        size_t lmem_store_index, lmem_load_index;
419:
420:    complex_t a[16];
421:
422:    int thread_id = virtual_local_id(0);
423:    int group_id = virtual_group_id(0);
424:
425:    int direction = _leaf_direction;
426:
427:    int norm_coeff = direction == 1 ? 1 : 1;
428:
429:
430:    int xform_global = group_id / 64;
431:    int group_in_xform = group_id % 64;
432:    int xform_local = thread_id / 8;
433:    int thread_in_xform = thread_id % 8;
434:
435:    int position_in_stride_in = thread_in_xform + group_in_xform * 8;
436:    int xform_number = xform_global * 1;
437:
438:
439:    {
440:        int stride_in_number = xform_local + 0;
441:        int position = position_in_stride_in + 512 * stride_in_number;
442:
443:
444:            a[0] = _LOAD_input(position + 65536 * xform_number);
445:
446:    }
447:    {
448:        int stride_in_number = xform_local + 8;
449:        int position = position_in_stride_in + 512 * stride_in_number;
450:
451:
452:            a[1] = _LOAD_input(position + 65536 * xform_number);
453:
454:    }
455:    {
456:        int stride_in_number = xform_local + 16;
457:        int position = position_in_stride_in + 512 * stride_in_number;
458:
459:
460:            a[2] = _LOAD_input(position + 65536 * xform_number);
461:
462:    }
463:    {
464:        int stride_in_number = xform_local + 24;
465:        int position = position_in_stride_in + 512 * stride_in_number;
466:
467:
468:            a[3] = _LOAD_input(position + 65536 * xform_number);
469:
470:    }
471:    {
472:        int stride_in_number = xform_local + 32;
473:        int position = position_in_stride_in + 512 * stride_in_number;
474:
475:
476:            a[4] = _LOAD_input(position + 65536 * xform_number);
477:
478:    }
479:    {
480:        int stride_in_number = xform_local + 40;
481:        int position = position_in_stride_in + 512 * stride_in_number;
482:
483:
484:            a[5] = _LOAD_input(position + 65536 * xform_number);
485:
486:    }
487:    {
488:        int stride_in_number = xform_local + 48;
489:        int position = position_in_stride_in + 512 * stride_in_number;
490:
491:
492:            a[6] = _LOAD_input(position + 65536 * xform_number);
493:
494:    }
495:    {
496:        int stride_in_number = xform_local + 56;
497:        int position = position_in_stride_in + 512 * stride_in_number;
498:
499:
500:            a[7] = _LOAD_input(position + 65536 * xform_number);
501:
502:    }
503:    {
504:        int stride_in_number = xform_local + 64;
505:        int position = position_in_stride_in + 512 * stride_in_number;
506:
507:
508:            a[8] = _LOAD_input(position + 65536 * xform_number);
509:
510:    }
511:    {
512:        int stride_in_number = xform_local + 72;
513:        int position = position_in_stride_in + 512 * stride_in_number;
514:
515:
516:            a[9] = _LOAD_input(position + 65536 * xform_number);
517:
518:    }
519:    {
520:        int stride_in_number = xform_local + 80;
521:        int position = position_in_stride_in + 512 * stride_in_number;
522:
523:
524:            a[10] = _LOAD_input(position + 65536 * xform_number);
525:
526:    }
527:    {
528:        int stride_in_number = xform_local + 88;
529:        int position = position_in_stride_in + 512 * stride_in_number;
530:
531:
532:            a[11] = _LOAD_input(position + 65536 * xform_number);
533:
534:    }
535:    {
536:        int stride_in_number = xform_local + 96;
537:        int position = position_in_stride_in + 512 * stride_in_number;
538:
539:
540:            a[12] = _LOAD_input(position + 65536 * xform_number);
541:
542:    }
543:    {
544:        int stride_in_number = xform_local + 104;
545:        int position = position_in_stride_in + 512 * stride_in_number;
546:
547:
548:            a[13] = _LOAD_input(position + 65536 * xform_number);
549:
550:    }
551:    {
552:        int stride_in_number = xform_local + 112;
553:        int position = position_in_stride_in + 512 * stride_in_number;
554:
555:
556:            a[14] = _LOAD_input(position + 65536 * xform_number);
557:
558:    }
559:    {
560:        int stride_in_number = xform_local + 120;
561:        int position = position_in_stride_in + 512 * stride_in_number;
562:
563:
564:            a[15] = _LOAD_input(position + 65536 * xform_number);
565:
566:    }
567:
568:    fftKernel16(a, direction);
569:
570:        {
571:            real_t ang;
572:            complex_t w;
573:
574:            ang = 0.04908738521234052 * xform_local * direction;
575:            w = complex_exp(ang);
576:            a[1] = complex_mul(a[1], w);
577:            ang = 0.09817477042468103 * xform_local * direction;
578:            w = complex_exp(ang);
579:            a[2] = complex_mul(a[2], w);
580:            ang = 0.14726215563702155 * xform_local * direction;
581:            w = complex_exp(ang);
582:            a[3] = complex_mul(a[3], w);
583:            ang = 0.19634954084936207 * xform_local * direction;
584:            w = complex_exp(ang);
585:            a[4] = complex_mul(a[4], w);
586:            ang = 0.2454369260617026 * xform_local * direction;
587:            w = complex_exp(ang);
588:            a[5] = complex_mul(a[5], w);
589:            ang = 0.2945243112740431 * xform_local * direction;
590:            w = complex_exp(ang);
591:            a[6] = complex_mul(a[6], w);
592:            ang = 0.3436116964863836 * xform_local * direction;
593:            w = complex_exp(ang);
594:            a[7] = complex_mul(a[7], w);
595:            ang = 0.39269908169872414 * xform_local * direction;
596:            w = complex_exp(ang);
597:            a[8] = complex_mul(a[8], w);
598:            ang = 0.44178646691106466 * xform_local * direction;
599:            w = complex_exp(ang);
600:            a[9] = complex_mul(a[9], w);
601:            ang = 0.4908738521234052 * xform_local * direction;
602:            w = complex_exp(ang);
603:            a[10] = complex_mul(a[10], w);
604:            ang = 0.5399612373357456 * xform_local * direction;
605:            w = complex_exp(ang);
606:            a[11] = complex_mul(a[11], w);
607:            ang = 0.5890486225480862 * xform_local * direction;
608:            w = complex_exp(ang);
609:            a[12] = complex_mul(a[12], w);
610:            ang = 0.6381360077604268 * xform_local * direction;
611:            w = complex_exp(ang);
612:            a[13] = complex_mul(a[13], w);
613:            ang = 0.6872233929727672 * xform_local * direction;
614:            w = complex_exp(ang);
615:            a[14] = complex_mul(a[14], w);
616:            ang = 0.7363107781851077 * xform_local * direction;
617:            w = complex_exp(ang);
618:            a[15] = complex_mul(a[15], w);
619:        }
620:
621:        lmem_store_index = thread_id;
622:        lmem_load_index = mad24(xform_local, 128, thread_in_xform);
623:
624:                lmem[lmem_store_index + 0] = a[0].x;
625:                lmem[lmem_store_index + 64] = a[1].x;
626:                lmem[lmem_store_index + 128] = a[2].x;
627:                lmem[lmem_store_index + 192] = a[3].x;
628:                lmem[lmem_store_index + 256] = a[4].x;
629:                lmem[lmem_store_index + 320] = a[5].x;
630:                lmem[lmem_store_index + 384] = a[6].x;
631:                lmem[lmem_store_index + 448] = a[7].x;
632:                lmem[lmem_store_index + 512] = a[8].x;
633:                lmem[lmem_store_index + 576] = a[9].x;
634:                lmem[lmem_store_index + 640] = a[10].x;
635:                lmem[lmem_store_index + 704] = a[11].x;
636:                lmem[lmem_store_index + 768] = a[12].x;
637:                lmem[lmem_store_index + 832] = a[13].x;
638:                lmem[lmem_store_index + 896] = a[14].x;
639:                lmem[lmem_store_index + 960] = a[15].x;
640:            LOCAL_BARRIER;
641:
642:                    a[0].x =
643:                        lmem[lmem_load_index + 0];
644:                    a[1].x =
645:                        lmem[lmem_load_index + 8];
646:                    a[2].x =
647:                        lmem[lmem_load_index + 16];
648:                    a[3].x =
649:                        lmem[lmem_load_index + 24];
650:                    a[4].x =
651:                        lmem[lmem_load_index + 32];
652:                    a[5].x =
653:                        lmem[lmem_load_index + 40];
654:                    a[6].x =
655:                        lmem[lmem_load_index + 48];
656:                    a[7].x =
657:                        lmem[lmem_load_index + 56];
658:                    a[8].x =
659:                        lmem[lmem_load_index + 64];
660:                    a[9].x =
661:                        lmem[lmem_load_index + 72];
662:                    a[10].x =
663:                        lmem[lmem_load_index + 80];
664:                    a[11].x =
665:                        lmem[lmem_load_index + 88];
666:                    a[12].x =
667:                        lmem[lmem_load_index + 96];
668:                    a[13].x =
669:                        lmem[lmem_load_index + 104];
670:                    a[14].x =
671:                        lmem[lmem_load_index + 112];
672:                    a[15].x =
673:                        lmem[lmem_load_index + 120];
674:            LOCAL_BARRIER;
675:                lmem[lmem_store_index + 0] = a[0].y;
676:                lmem[lmem_store_index + 64] = a[1].y;
677:                lmem[lmem_store_index + 128] = a[2].y;
678:                lmem[lmem_store_index + 192] = a[3].y;
679:                lmem[lmem_store_index + 256] = a[4].y;
680:                lmem[lmem_store_index + 320] = a[5].y;
681:                lmem[lmem_store_index + 384] = a[6].y;
682:                lmem[lmem_store_index + 448] = a[7].y;
683:                lmem[lmem_store_index + 512] = a[8].y;
684:                lmem[lmem_store_index + 576] = a[9].y;
685:                lmem[lmem_store_index + 640] = a[10].y;
686:                lmem[lmem_store_index + 704] = a[11].y;
687:                lmem[lmem_store_index + 768] = a[12].y;
688:                lmem[lmem_store_index + 832] = a[13].y;
689:                lmem[lmem_store_index + 896] = a[14].y;
690:                lmem[lmem_store_index + 960] = a[15].y;
691:            LOCAL_BARRIER;
692:
693:                    a[0].y =
694:                        lmem[lmem_load_index + 0];
695:                    a[1].y =
696:                        lmem[lmem_load_index + 8];
697:                    a[2].y =
698:                        lmem[lmem_load_index + 16];
699:                    a[3].y =
700:                        lmem[lmem_load_index + 24];
701:                    a[4].y =
702:                        lmem[lmem_load_index + 32];
703:                    a[5].y =
704:                        lmem[lmem_load_index + 40];
705:                    a[6].y =
706:                        lmem[lmem_load_index + 48];
707:                    a[7].y =
708:                        lmem[lmem_load_index + 56];
709:                    a[8].y =
710:                        lmem[lmem_load_index + 64];
711:                    a[9].y =
712:                        lmem[lmem_load_index + 72];
713:                    a[10].y =
714:                        lmem[lmem_load_index + 80];
715:                    a[11].y =
716:                        lmem[lmem_load_index + 88];
717:                    a[12].y =
718:                        lmem[lmem_load_index + 96];
719:                    a[13].y =
720:                        lmem[lmem_load_index + 104];
721:                    a[14].y =
722:                        lmem[lmem_load_index + 112];
723:                    a[15].y =
724:                        lmem[lmem_load_index + 120];
725:            LOCAL_BARRIER;
726:
727:            fftKernel8(a + 0, direction);
728:            fftKernel8(a + 8, direction);
729:
730:    {
731:        real_t ang1, ang;
732:        complex_t w;
733:
734:        int l = (group_in_xform * 8 + thread_in_xform) / 1;
735:        int k = xform_local * 2;
736:        ang1 = 9.587379924285257e-05 * l * direction;
737:            ang = ang1 * (k + 0);
738:            w = complex_exp(ang);
739:            a[0] = complex_mul(a[0], w);
740:            ang = ang1 * (k + 16);
741:            w = complex_exp(ang);
742:            a[1] = complex_mul(a[1], w);
743:            ang = ang1 * (k + 32);
744:            w = complex_exp(ang);
745:            a[2] = complex_mul(a[2], w);
746:            ang = ang1 * (k + 48);
747:            w = complex_exp(ang);
748:            a[3] = complex_mul(a[3], w);
749:            ang = ang1 * (k + 64);
750:            w = complex_exp(ang);
751:            a[4] = complex_mul(a[4], w);
752:            ang = ang1 * (k + 80);
753:            w = complex_exp(ang);
754:            a[5] = complex_mul(a[5], w);
755:            ang = ang1 * (k + 96);
756:            w = complex_exp(ang);
757:            a[6] = complex_mul(a[6], w);
758:            ang = ang1 * (k + 112);
759:            w = complex_exp(ang);
760:            a[7] = complex_mul(a[7], w);
761:            ang = ang1 * (k + 1);
762:            w = complex_exp(ang);
763:            a[8] = complex_mul(a[8], w);
764:            ang = ang1 * (k + 17);
765:            w = complex_exp(ang);
766:            a[9] = complex_mul(a[9], w);
767:            ang = ang1 * (k + 33);
768:            w = complex_exp(ang);
769:            a[10] = complex_mul(a[10], w);
770:            ang = ang1 * (k + 49);
771:            w = complex_exp(ang);
772:            a[11] = complex_mul(a[11], w);
773:            ang = ang1 * (k + 65);
774:            w = complex_exp(ang);
775:            a[12] = complex_mul(a[12], w);
776:            ang = ang1 * (k + 81);
777:            w = complex_exp(ang);
778:            a[13] = complex_mul(a[13], w);
779:            ang = ang1 * (k + 97);
780:            w = complex_exp(ang);
781:            a[14] = complex_mul(a[14], w);
782:            ang = ang1 * (k + 113);
783:            w = complex_exp(ang);
784:            a[15] = complex_mul(a[15], w);
785:    }
786:
787:        lmem_store_index = mad24(thread_in_xform, 129, xform_local * 2);
788:        lmem_load_index = mad24(thread_id / 128, 129, thread_id % 128);
789:
790:                    lmem[lmem_store_index + 0] = a[0].x;
791:                    lmem[lmem_store_index + 16] = a[1].x;
792:                    lmem[lmem_store_index + 32] = a[2].x;
793:                    lmem[lmem_store_index + 48] = a[3].x;
794:                    lmem[lmem_store_index + 64] = a[4].x;
795:                    lmem[lmem_store_index + 80] = a[5].x;
796:                    lmem[lmem_store_index + 96] = a[6].x;
797:                    lmem[lmem_store_index + 112] = a[7].x;
798:                    lmem[lmem_store_index + 1] = a[8].x;
799:                    lmem[lmem_store_index + 17] = a[9].x;
800:                    lmem[lmem_store_index + 33] = a[10].x;
801:                    lmem[lmem_store_index + 49] = a[11].x;
802:                    lmem[lmem_store_index + 65] = a[12].x;
803:                    lmem[lmem_store_index + 81] = a[13].x;
804:                    lmem[lmem_store_index + 97] = a[14].x;
805:                    lmem[lmem_store_index + 113] = a[15].x;
806:            LOCAL_BARRIER;
807:
808:                
809:                        a[0].x = lmem[lmem_load_index + 0];
810:                        a[1].x = lmem[lmem_load_index + 64];
811:                        a[2].x = lmem[lmem_load_index + 129];
812:                        a[3].x = lmem[lmem_load_index + 193];
813:                        a[4].x = lmem[lmem_load_index + 258];
814:                        a[5].x = lmem[lmem_load_index + 322];
815:                        a[6].x = lmem[lmem_load_index + 387];
816:                        a[7].x = lmem[lmem_load_index + 451];
817:                        a[8].x = lmem[lmem_load_index + 516];
818:                        a[9].x = lmem[lmem_load_index + 580];
819:                        a[10].x = lmem[lmem_load_index + 645];
820:                        a[11].x = lmem[lmem_load_index + 709];
821:                        a[12].x = lmem[lmem_load_index + 774];
822:                        a[13].x = lmem[lmem_load_index + 838];
823:                        a[14].x = lmem[lmem_load_index + 903];
824:                        a[15].x = lmem[lmem_load_index + 967];
825:            LOCAL_BARRIER;
826:                    lmem[lmem_store_index + 0] = a[0].y;
827:                    lmem[lmem_store_index + 16] = a[1].y;
828:                    lmem[lmem_store_index + 32] = a[2].y;
829:                    lmem[lmem_store_index + 48] = a[3].y;
830:                    lmem[lmem_store_index + 64] = a[4].y;
831:                    lmem[lmem_store_index + 80] = a[5].y;
832:                    lmem[lmem_store_index + 96] = a[6].y;
833:                    lmem[lmem_store_index + 112] = a[7].y;
834:                    lmem[lmem_store_index + 1] = a[8].y;
835:                    lmem[lmem_store_index + 17] = a[9].y;
836:                    lmem[lmem_store_index + 33] = a[10].y;
837:                    lmem[lmem_store_index + 49] = a[11].y;
838:                    lmem[lmem_store_index + 65] = a[12].y;
839:                    lmem[lmem_store_index + 81] = a[13].y;
840:                    lmem[lmem_store_index + 97] = a[14].y;
841:                    lmem[lmem_store_index + 113] = a[15].y;
842:            LOCAL_BARRIER;
843:
844:                
845:                        a[0].y = lmem[lmem_load_index + 0];
846:                        a[1].y = lmem[lmem_load_index + 64];
847:                        a[2].y = lmem[lmem_load_index + 129];
848:                        a[3].y = lmem[lmem_load_index + 193];
849:                        a[4].y = lmem[lmem_load_index + 258];
850:                        a[5].y = lmem[lmem_load_index + 322];
851:                        a[6].y = lmem[lmem_load_index + 387];
852:                        a[7].y = lmem[lmem_load_index + 451];
853:                        a[8].y = lmem[lmem_load_index + 516];
854:                        a[9].y = lmem[lmem_load_index + 580];
855:                        a[10].y = lmem[lmem_load_index + 645];
856:                        a[11].y = lmem[lmem_load_index + 709];
857:                        a[12].y = lmem[lmem_load_index + 774];
858:                        a[13].y = lmem[lmem_load_index + 838];
859:                        a[14].y = lmem[lmem_load_index + 903];
860:                        a[15].y = lmem[lmem_load_index + 967];
861:            LOCAL_BARRIER;
862:
863:        int position_in_stride_out = (group_in_xform * 8) % 1;
864:        int stride_out_number = (group_in_xform * 8) / 1;
865:        int idx = stride_out_number * 128 + position_in_stride_out + thread_id +
866:            65536 * xform_number;
867:
868:        {
869:            int position = stride_out_number * 128 + 0 +
870:                position_in_stride_out + thread_id;
871:                _STORE__temp0(position + 65536 * xform_number,
872:                    complex_div_scalar(a[0], norm_coeff));
873:        }
874:        {
875:            int position = stride_out_number * 128 + 64 +
876:                position_in_stride_out + thread_id;
877:                _STORE__temp0(position + 65536 * xform_number,
878:                    complex_div_scalar(a[1], norm_coeff));
879:        }
880:        {
881:            int position = stride_out_number * 128 + 128 +
882:                position_in_stride_out + thread_id;
883:                _STORE__temp0(position + 65536 * xform_number,
884:                    complex_div_scalar(a[2], norm_coeff));
885:        }
886:        {
887:            int position = stride_out_number * 128 + 192 +
888:                position_in_stride_out + thread_id;
889:                _STORE__temp0(position + 65536 * xform_number,
890:                    complex_div_scalar(a[3], norm_coeff));
891:        }
892:        {
893:            int position = stride_out_number * 128 + 256 +
894:                position_in_stride_out + thread_id;
895:                _STORE__temp0(position + 65536 * xform_number,
896:                    complex_div_scalar(a[4], norm_coeff));
897:        }
898:        {
899:            int position = stride_out_number * 128 + 320 +
900:                position_in_stride_out + thread_id;
901:                _STORE__temp0(position + 65536 * xform_number,
902:                    complex_div_scalar(a[5], norm_coeff));
903:        }
904:        {
905:            int position = stride_out_number * 128 + 384 +
906:                position_in_stride_out + thread_id;
907:                _STORE__temp0(position + 65536 * xform_number,
908:                    complex_div_scalar(a[6], norm_coeff));
909:        }
910:        {
911:            int position = stride_out_number * 128 + 448 +
912:                position_in_stride_out + thread_id;
913:                _STORE__temp0(position + 65536 * xform_number,
914:                    complex_div_scalar(a[7], norm_coeff));
915:        }
916:        {
917:            int position = stride_out_number * 128 + 512 +
918:                position_in_stride_out + thread_id;
919:                _STORE__temp0(position + 65536 * xform_number,
920:                    complex_div_scalar(a[8], norm_coeff));
921:        }
922:        {
923:            int position = stride_out_number * 128 + 576 +
924:                position_in_stride_out + thread_id;
925:                _STORE__temp0(position + 65536 * xform_number,
926:                    complex_div_scalar(a[9], norm_coeff));
927:        }
928:        {
929:            int position = stride_out_number * 128 + 640 +
930:                position_in_stride_out + thread_id;
931:                _STORE__temp0(position + 65536 * xform_number,
932:                    complex_div_scalar(a[10], norm_coeff));
933:        }
934:        {
935:            int position = stride_out_number * 128 + 704 +
936:                position_in_stride_out + thread_id;
937:                _STORE__temp0(position + 65536 * xform_number,
938:                    complex_div_scalar(a[11], norm_coeff));
939:        }
940:        {
941:            int position = stride_out_number * 128 + 768 +
942:                position_in_stride_out + thread_id;
943:                _STORE__temp0(position + 65536 * xform_number,
944:                    complex_div_scalar(a[12], norm_coeff));
945:        }
946:        {
947:            int position = stride_out_number * 128 + 832 +
948:                position_in_stride_out + thread_id;
949:                _STORE__temp0(position + 65536 * xform_number,
950:                    complex_div_scalar(a[13], norm_coeff));
951:        }
952:        {
953:            int position = stride_out_number * 128 + 896 +
954:                position_in_stride_out + thread_id;
955:                _STORE__temp0(position + 65536 * xform_number,
956:                    complex_div_scalar(a[14], norm_coeff));
957:        }
958:        {
959:            int position = stride_out_number * 128 + 960 +
960:                position_in_stride_out + thread_id;
961:                _STORE__temp0(position + 65536 * xform_number,
962:                    complex_div_scalar(a[15], norm_coeff));
963:        }
964:}
965:
966:
Traceback (most recent call last):
  File "/home/tnorth/exp_sim/confparser.py", line 157, in <module>
    exp.run(iter)
  File "/home/tnorth/exp_sim/experiment.py", line 128, in run
    previous_device.run()
  File "/home/tnorth/exp_sim/sparam.py", line 73, in add
    result = method(self, *args, **kw)
  File "/home/tnorth/exp_sim/sparam.py", line 82, in timed
    result = method(self, *args, **kw)
  File "/home/tnorth/exp_sim/fiber.py", line 95, in run
    self.propag_functions[with_gpu][coupled][self.propag_func](self.EtIn['fiber_in'])
  File "/home/tnorth/exp_sim/fiber.py", line 898, in RK4IP_method_GPU
    NLOpsAf.prepare_for(Af__gpu, A_gpu, 1)
  File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/core/computation.py", line 207, in prepare_for
    self._operations = self._construct_operations(self._basis, self._ctx.device_params)
  File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/fft.py", line 554, in _construct_operations
    inplace=([(mem_out, mem_in)] if kernel.inplace_possible else None))
  File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/core/operation.py", line 99, in add_kernel
    op.prepare(self._ctx, self._tr_tree)
  File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/core/operation.py", line 195, in prepare
    self.global_size, local_size=self.local_size)
  File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/cluda/ocl.py", line 153, in compile_static
    local_size=local_size, render_kwds=render_kwds)
  File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/cluda/ocl.py", line 264, in __init__
    stub_module = ctx._compile(str(prelude + stub_vsize_funcs + src))
  File "/usr/lib/python2.7/site-packages/tigger-0.2.0dev_b46bc14-py2.7.egg/tigger/cluda/ocl.py", line 140, in _compile
    module = cl.Program(self._context, src).build(options=options)
  File "/usr/lib64/python2.7/site-packages/pyopencl-2011.2-py2.7-linux-x86_64.egg/pyopencl/__init__.py", line 124, in build
    cache_dir=cache_dir)
  File "/usr/lib64/python2.7/site-packages/pyopencl-2011.2-py2.7-linux-x86_64.egg/pyopencl/cache.py", line 460, in create_built_program_from_source_cached
    ctx, src, options, devices, cache_dir)
  File "/usr/lib64/python2.7/site-packages/pyopencl-2011.2-py2.7-linux-x86_64.egg/pyopencl/cache.py", line 384, in _create_built_program_from_source_cached
    prg.build(options, [devices[i] for i in to_be_built_indices])
  File "/usr/lib64/python2.7/site-packages/pyopencl-2011.2-py2.7-linux-x86_64.egg/pyopencl/__init__.py", line 377, in program_build
    raise err
pyopencl.RuntimeError: clBuildProgram failed: build program failure - 

Build on <pyopencl.Device 'GeForce GTX 550 Ti' on 'NVIDIA CUDA' at 0x2cf9020>:

:135:13: error: call to 'native_cos' is ambiguous
    res.x = native_cos(ang);
            ^~~~~~~~~~
<built-in>:870:24: note: candidate function
float __OVERLOADABLE__ native_cos(float);
                       ^
<built-in>:871:25: note: candidate function
float2 __OVERLOADABLE__ native_cos(float2); 
                        ^
<built-in>:873:25: note: candidate function
float3 __OVERLOADABLE__ native_cos(float3); 
                        ^
<built-in>:875:25: note: candidate function
float4 __OVERLOADABLE__ native_cos(float4); 
                        ^
<built-in>:876:25: note: candidate function
float8 __OVERLOADABLE__ native_cos(float8); 
                        ^
<built-in>:877:26: note: candidate function
float16 __OVERLOADABLE__ native_cos(float16); 
                         ^
:136:13: error: call to 'native_sin' is ambiguous
    res.y = native_sin(ang);
            ^~~~~~~~~~
<built-in>:950:24: note: candidate function
float __OVERLOADABLE__ native_sin(float);
                       ^
<built-in>:951:25: note: candidate function
float2 __OVERLOADABLE__ native_sin(float2); 
                        ^
<built-in>:953:25: note: candidate function
float3 __OVERLOADABLE__ native_sin(float3); 
                        ^
<built-in>:955:25: note: candidate function
float4 __OVERLOADABLE__ native_sin(float4); 
                        ^
<built-in>:956:25: note: candidate function
float8 __OVERLOADABLE__ native_sin(float8); 
                        ^
<built-in>:957:26: note: candidate function
float16 __OVERLOADABLE__ native_sin(float16); 
fjarri commented 11 years ago

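That's why I should run tests after bugfixes. It seems that native_sin() and native_cos() do not work with double, only with float (and float vectors, as the candidate list above shows), so calling them with a double argument makes the overload resolution ambiguous.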

tnorth commented 11 years ago

Thank you! (I didn't know that.)