rs_cl.c revision 9cbc99ba45126a6a30ba13fc6d4e75e51ca14ea7
1#include "rs_core.rsh"
2
3extern float2 __attribute__((overloadable)) convert_float2(int2 c);
4extern float3 __attribute__((overloadable)) convert_float3(int3 c);
5extern float4 __attribute__((overloadable)) convert_float4(int4 c);
6
7extern int2 __attribute__((overloadable)) convert_int2(float2 c);
8extern int3 __attribute__((overloadable)) convert_int3(float3 c);
9extern int4 __attribute__((overloadable)) convert_int4(float4 c);
10
11
12extern float __attribute__((overloadable)) fmin(float v, float v2);
13extern float2 __attribute__((overloadable)) fmin(float2 v, float v2);
14extern float3 __attribute__((overloadable)) fmin(float3 v, float v2);
15extern float4 __attribute__((overloadable)) fmin(float4 v, float v2);
16
17extern float __attribute__((overloadable)) fmax(float v, float v2);
18extern float2 __attribute__((overloadable)) fmax(float2 v, float v2);
19extern float3 __attribute__((overloadable)) fmax(float3 v, float v2);
20extern float4 __attribute__((overloadable)) fmax(float4 v, float v2);
21
22// Float ops, 6.11.2
23
// Defines the float2/float3/float4 overloads of a one-argument function by
// applying fnc to each component in turn (x, then y, z, w).
#define FN_FUNC_FN(fnc)                                         \
extern float2 __attribute__((overloadable)) fnc(float2 v) {     \
    float2 out;                                                 \
    out.x = fnc(v.x);                                           \
    out.y = fnc(v.y);                                           \
    return out;                                                 \
}                                                               \
extern float3 __attribute__((overloadable)) fnc(float3 v) {     \
    float3 out;                                                 \
    out.x = fnc(v.x);                                           \
    out.y = fnc(v.y);                                           \
    out.z = fnc(v.z);                                           \
    return out;                                                 \
}                                                               \
extern float4 __attribute__((overloadable)) fnc(float4 v) {     \
    float4 out;                                                 \
    out.x = fnc(v.x);                                           \
    out.y = fnc(v.y);                                           \
    out.z = fnc(v.z);                                           \
    out.w = fnc(v.w);                                           \
    return out;                                                 \
}
46
// Defines the int2/int3/int4-returning overloads of a one-argument function
// taking float2/float3/float4, applying fnc to each component in turn.
#define IN_FUNC_FN(fnc)                                         \
extern int2 __attribute__((overloadable)) fnc(float2 v) {       \
    int2 out;                                                   \
    out.x = fnc(v.x);                                           \
    out.y = fnc(v.y);                                           \
    return out;                                                 \
}                                                               \
extern int3 __attribute__((overloadable)) fnc(float3 v) {       \
    int3 out;                                                   \
    out.x = fnc(v.x);                                           \
    out.y = fnc(v.y);                                           \
    out.z = fnc(v.z);                                           \
    return out;                                                 \
}                                                               \
extern int4 __attribute__((overloadable)) fnc(float4 v) {       \
    int4 out;                                                   \
    out.x = fnc(v.x);                                           \
    out.y = fnc(v.y);                                           \
    out.z = fnc(v.z);                                           \
    out.w = fnc(v.w);                                           \
    return out;                                                 \
}
69
// Defines the float2/float3/float4 overloads of a two-argument function where
// both arguments are vectors; fnc is applied to matching components.
#define FN_FUNC_FN_FN(fnc)                                                  \
extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) {     \
    float2 out;                                                             \
    out.x = fnc(v1.x, v2.x);                                                \
    out.y = fnc(v1.y, v2.y);                                                \
    return out;                                                             \
}                                                                           \
extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) {     \
    float3 out;                                                             \
    out.x = fnc(v1.x, v2.x);                                                \
    out.y = fnc(v1.y, v2.y);                                                \
    out.z = fnc(v1.z, v2.z);                                                \
    return out;                                                             \
}                                                                           \
extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) {     \
    float4 out;                                                             \
    out.x = fnc(v1.x, v2.x);                                                \
    out.y = fnc(v1.y, v2.y);                                                \
    out.z = fnc(v1.z, v2.z);                                                \
    out.w = fnc(v1.w, v2.w);                                                \
    return out;                                                             \
}
92
// Defines the float2/float3/float4 overloads of a two-argument function whose
// second argument is a single float shared by every component.
#define FN_FUNC_FN_F(fnc)                                                   \
extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) {      \
    float2 out;                                                             \
    out.x = fnc(v1.x, v2);                                                  \
    out.y = fnc(v1.y, v2);                                                  \
    return out;                                                             \
}                                                                           \
extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) {      \
    float3 out;                                                             \
    out.x = fnc(v1.x, v2);                                                  \
    out.y = fnc(v1.y, v2);                                                  \
    out.z = fnc(v1.z, v2);                                                  \
    return out;                                                             \
}                                                                           \
extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) {      \
    float4 out;                                                             \
    out.x = fnc(v1.x, v2);                                                  \
    out.y = fnc(v1.y, v2);                                                  \
    out.z = fnc(v1.z, v2);                                                  \
    out.w = fnc(v1.w, v2);                                                  \
    return out;                                                             \
}
115
// Defines the float2/float3/float4 overloads of a (float vector, int vector)
// function; fnc is applied to matching components of the two vectors.
#define FN_FUNC_FN_IN(fnc)                                                  \
extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) {       \
    float2 out;                                                             \
    out.x = fnc(v1.x, v2.x);                                                \
    out.y = fnc(v1.y, v2.y);                                                \
    return out;                                                             \
}                                                                           \
extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) {       \
    float3 out;                                                             \
    out.x = fnc(v1.x, v2.x);                                                \
    out.y = fnc(v1.y, v2.y);                                                \
    out.z = fnc(v1.z, v2.z);                                                \
    return out;                                                             \
}                                                                           \
extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) {       \
    float4 out;                                                             \
    out.x = fnc(v1.x, v2.x);                                                \
    out.y = fnc(v1.y, v2.y);                                                \
    out.z = fnc(v1.z, v2.z);                                                \
    out.w = fnc(v1.w, v2.w);                                                \
    return out;                                                             \
}
138
// Defines the float2/float3/float4 overloads of a (float vector, int) function;
// the single int argument is passed unchanged for every component.
#define FN_FUNC_FN_I(fnc)                                                   \
extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) {        \
    float2 out;                                                             \
    out.x = fnc(v1.x, v2);                                                  \
    out.y = fnc(v1.y, v2);                                                  \
    return out;                                                             \
}                                                                           \
extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) {        \
    float3 out;                                                             \
    out.x = fnc(v1.x, v2);                                                  \
    out.y = fnc(v1.y, v2);                                                  \
    out.z = fnc(v1.z, v2);                                                  \
    return out;                                                             \
}                                                                           \
extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) {        \
    float4 out;                                                             \
    out.x = fnc(v1.x, v2);                                                  \
    out.y = fnc(v1.y, v2);                                                  \
    out.z = fnc(v1.z, v2);                                                  \
    out.w = fnc(v1.w, v2);                                                  \
    return out;                                                             \
}
161
// Defines the float2/float3/float4 overloads of a function that returns one
// float and writes a second float through a pointer. Each component's pointer
// result is captured in a local first and copied into *v2 after all calls.
#define FN_FUNC_FN_PFN(fnc)                                                 \
extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 *v2) {    \
    float2 out;                                                             \
    float aux_x, aux_y;                                                     \
    out.x = fnc(v1.x, &aux_x);                                              \
    out.y = fnc(v1.y, &aux_y);                                              \
    v2->x = aux_x;                                                          \
    v2->y = aux_y;                                                          \
    return out;                                                             \
}                                                                           \
extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 *v2) {    \
    float3 out;                                                             \
    float aux_x, aux_y, aux_z;                                              \
    out.x = fnc(v1.x, &aux_x);                                              \
    out.y = fnc(v1.y, &aux_y);                                              \
    out.z = fnc(v1.z, &aux_z);                                              \
    v2->x = aux_x;                                                          \
    v2->y = aux_y;                                                          \
    v2->z = aux_z;                                                          \
    return out;                                                             \
}                                                                           \
extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 *v2) {    \
    float4 out;                                                             \
    float aux_x, aux_y, aux_z, aux_w;                                       \
    out.x = fnc(v1.x, &aux_x);                                              \
    out.y = fnc(v1.y, &aux_y);                                              \
    out.z = fnc(v1.z, &aux_z);                                              \
    out.w = fnc(v1.w, &aux_w);                                              \
    v2->x = aux_x;                                                          \
    v2->y = aux_y;                                                          \
    v2->z = aux_z;                                                          \
    v2->w = aux_w;                                                          \
    return out;                                                             \
}
199
// Defines the float2/float3/float4 overloads of a function that returns one
// float and writes an int through a pointer. Each component's int result is
// captured in a local first and copied into *v2 after all calls.
#define FN_FUNC_FN_PIN(fnc)                                                 \
extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) {      \
    float2 out;                                                             \
    int aux_x, aux_y;                                                       \
    out.x = fnc(v1.x, &aux_x);                                              \
    out.y = fnc(v1.y, &aux_y);                                              \
    v2->x = aux_x;                                                          \
    v2->y = aux_y;                                                          \
    return out;                                                             \
}                                                                           \
extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) {      \
    float3 out;                                                             \
    int aux_x, aux_y, aux_z;                                                \
    out.x = fnc(v1.x, &aux_x);                                              \
    out.y = fnc(v1.y, &aux_y);                                              \
    out.z = fnc(v1.z, &aux_z);                                              \
    v2->x = aux_x;                                                          \
    v2->y = aux_y;                                                          \
    v2->z = aux_z;                                                          \
    return out;                                                             \
}                                                                           \
extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) {      \
    float4 out;                                                             \
    int aux_x, aux_y, aux_z, aux_w;                                         \
    out.x = fnc(v1.x, &aux_x);                                              \
    out.y = fnc(v1.y, &aux_y);                                              \
    out.z = fnc(v1.z, &aux_z);                                              \
    out.w = fnc(v1.w, &aux_w);                                              \
    v2->x = aux_x;                                                          \
    v2->y = aux_y;                                                          \
    v2->z = aux_z;                                                          \
    v2->w = aux_w;                                                          \
    return out;                                                             \
}
234
// Defines the float2/float3/float4 overloads of a three-argument function;
// fnc is applied to matching components of the three vectors.
#define FN_FUNC_FN_FN_FN(fnc)                                               \
extern float2 __attribute__((overloadable))                                 \
        fnc(float2 v1, float2 v2, float2 v3) {                              \
    float2 out;                                                             \
    out.x = fnc(v1.x, v2.x, v3.x);                                          \
    out.y = fnc(v1.y, v2.y, v3.y);                                          \
    return out;                                                             \
}                                                                           \
extern float3 __attribute__((overloadable))                                 \
        fnc(float3 v1, float3 v2, float3 v3) {                              \
    float3 out;                                                             \
    out.x = fnc(v1.x, v2.x, v3.x);                                          \
    out.y = fnc(v1.y, v2.y, v3.y);                                          \
    out.z = fnc(v1.z, v2.z, v3.z);                                          \
    return out;                                                             \
}                                                                           \
extern float4 __attribute__((overloadable))                                 \
        fnc(float4 v1, float4 v2, float4 v3) {                              \
    float4 out;                                                             \
    out.x = fnc(v1.x, v2.x, v3.x);                                          \
    out.y = fnc(v1.y, v2.y, v3.y);                                          \
    out.z = fnc(v1.z, v2.z, v3.z);                                          \
    out.w = fnc(v1.w, v2.w, v3.w);                                          \
    return out;                                                             \
}
260
// Defines the float2/float3/float4 overloads of a (float, float, int *)
// function. Each component's int result is captured in a local first and
// copied into *v3 after all calls.
#define FN_FUNC_FN_FN_PIN(fnc)                                              \
extern float2 __attribute__((overloadable))                                 \
        fnc(float2 v1, float2 v2, int2 *v3) {                               \
    float2 out;                                                             \
    int aux_x, aux_y;                                                       \
    out.x = fnc(v1.x, v2.x, &aux_x);                                        \
    out.y = fnc(v1.y, v2.y, &aux_y);                                        \
    v3->x = aux_x;                                                          \
    v3->y = aux_y;                                                          \
    return out;                                                             \
}                                                                           \
extern float3 __attribute__((overloadable))                                 \
        fnc(float3 v1, float3 v2, int3 *v3) {                               \
    float3 out;                                                             \
    int aux_x, aux_y, aux_z;                                                \
    out.x = fnc(v1.x, v2.x, &aux_x);                                        \
    out.y = fnc(v1.y, v2.y, &aux_y);                                        \
    out.z = fnc(v1.z, v2.z, &aux_z);                                        \
    v3->x = aux_x;                                                          \
    v3->y = aux_y;                                                          \
    v3->z = aux_z;                                                          \
    return out;                                                             \
}                                                                           \
extern float4 __attribute__((overloadable))                                 \
        fnc(float4 v1, float4 v2, int4 *v3) {                               \
    float4 out;                                                             \
    int aux_x, aux_y, aux_z, aux_w;                                         \
    out.x = fnc(v1.x, v2.x, &aux_x);                                        \
    out.y = fnc(v1.y, v2.y, &aux_y);                                        \
    out.z = fnc(v1.z, v2.z, &aux_z);                                        \
    out.w = fnc(v1.w, v2.w, &aux_w);                                        \
    v3->x = aux_x;                                                          \
    v3->y = aux_y;                                                          \
    v3->z = aux_z;                                                          \
    v3->w = aux_w;                                                          \
    return out;                                                             \
}
298
// Bit patterns that posinf()/neginf() reinterpret as float and that isinf()
// compares against.
static const int iposinf = 0x7f800000;
static const int ineginf = 0xff800000;

// Returns the float whose bit pattern is iposinf. A union is used for the
// reinterpretation so that no const-discarding, aliasing-violating pointer
// cast is needed.
static const float posinf() {
    union { int bits; float value; } pun;
    pun.bits = iposinf;
    return pun.value;
}

// Returns the float whose bit pattern is ineginf (see posinf()).
static const float neginf() {
    union { int bits; float value; } pun;
    pun.bits = ineginf;
    return pun.value;
}
311
312static bool isinf(float f) {
313    int i = *((int*)(void*)&f);
314    return (i == iposinf) || (i == ineginf);
315}
316
// Returns true when f has all bits of the 0x7f800000 field set and at least
// one bit of the 0x007fffff field set. The bits are read through a union
// rather than an aliasing-violating pointer cast.
static bool isnan(float f) {
    union { float value; int bits; } pun;
    pun.value = f;
    return ((pun.bits & 0x7f800000) == 0x7f800000) &&
           ((pun.bits & 0x007fffff) != 0);
}
321
// Returns true only when every bit of f is zero (so -0.0f is rejected).
// The bits are read through a union rather than an aliasing-violating pointer
// cast.
static bool isposzero(float f) {
    union { float value; int bits; } pun;
    pun.value = f;
    return pun.bits == 0x00000000;
}
326
// Returns true only when the bit pattern of f is exactly 0x80000000.
// The bits are read as an unsigned value through a union, which avoids both
// the aliasing-violating pointer cast and a signed/unsigned comparison.
static bool isnegzero(float f) {
    union { float value; unsigned int bits; } pun;
    pun.value = f;
    return pun.bits == 0x80000000u;
}
331
// True for either signed zero.
static bool iszero(float f) {
    if (isposzero(f)) {
        return true;
    }
    return isnegzero(f);
}
335
336
337extern float __attribute__((overloadable)) acos(float);
338FN_FUNC_FN(acos)
339
340extern float __attribute__((overloadable)) acosh(float);
341FN_FUNC_FN(acosh)
342
343
344extern float __attribute__((overloadable)) acospi(float v) {
345    return acos(v) / M_PI;
346}
347FN_FUNC_FN(acospi)
348
349extern float __attribute__((overloadable)) asin(float);
350FN_FUNC_FN(asin)
351
352extern float __attribute__((overloadable)) asinh(float);
353FN_FUNC_FN(asinh)
354
355extern float __attribute__((overloadable)) asinpi(float v) {
356    return asin(v) / M_PI;
357}
358FN_FUNC_FN(asinpi)
359
360extern float __attribute__((overloadable)) atan(float);
361FN_FUNC_FN(atan)
362
363extern float __attribute__((overloadable)) atan2(float, float);
364FN_FUNC_FN_FN(atan2)
365
366extern float __attribute__((overloadable)) atanh(float);
367FN_FUNC_FN(atanh)
368
369extern float __attribute__((overloadable)) atanpi(float v) {
370    return atan(v) / M_PI;
371}
372FN_FUNC_FN(atanpi)
373
374
375extern float __attribute__((overloadable)) atan2pi(float y, float x) {
376    return atan2(y, x) / M_PI;
377}
378FN_FUNC_FN_FN(atan2pi)
379
380extern float __attribute__((overloadable)) cbrt(float);
381FN_FUNC_FN(cbrt)
382
383extern float __attribute__((overloadable)) ceil(float);
384FN_FUNC_FN(ceil)
385
386extern float __attribute__((overloadable)) copysign(float, float);
387FN_FUNC_FN_FN(copysign)
388
389extern float __attribute__((overloadable)) cos(float);
390FN_FUNC_FN(cos)
391
392extern float __attribute__((overloadable)) cosh(float);
393FN_FUNC_FN(cosh)
394
395extern float __attribute__((overloadable)) cospi(float v) {
396    return cos(v * M_PI);
397}
398FN_FUNC_FN(cospi)
399
400extern float __attribute__((overloadable)) erfc(float);
401FN_FUNC_FN(erfc)
402
403extern float __attribute__((overloadable)) erf(float);
404FN_FUNC_FN(erf)
405
406extern float __attribute__((overloadable)) exp(float);
407FN_FUNC_FN(exp)
408
409extern float __attribute__((overloadable)) exp2(float);
410FN_FUNC_FN(exp2)
411
412extern float __attribute__((overloadable)) pow(float, float);
413
414extern float __attribute__((overloadable)) exp10(float v) {
415    return exp2(v * 3.321928095f);
416}
417FN_FUNC_FN(exp10)
418
419extern float __attribute__((overloadable)) expm1(float);
420FN_FUNC_FN(expm1)
421
// Returns v with its top (sign) bit cleared. The bits are manipulated through
// a union rather than pointer casts, which would violate the strict-aliasing
// rule.
extern float __attribute__((overloadable)) fabs(float v) {
    union { float value; int bits; } pun;
    pun.value = v;
    pun.bits &= 0x7fffffff;
    return pun.value;
}
426FN_FUNC_FN(fabs)
427
428extern float __attribute__((overloadable)) fdim(float, float);
429FN_FUNC_FN_FN(fdim)
430
431extern float __attribute__((overloadable)) floor(float);
432FN_FUNC_FN(floor)
433
434extern float __attribute__((overloadable)) fma(float, float, float);
435FN_FUNC_FN_FN_FN(fma)
436
437extern float __attribute__((overloadable)) fmin(float, float);
438
439extern float __attribute__((overloadable)) fmod(float, float);
440FN_FUNC_FN_FN(fmod)
441
442extern float __attribute__((overloadable)) fract(float v, float *iptr) {
443    int i = (int)floor(v);
444    if (iptr) {
445        iptr[0] = i;
446    }
447    return fmin(v - i, 0x1.fffffep-1f);
448}
449FN_FUNC_FN_PFN(fract)
450
451extern float __attribute__((const, overloadable)) fract(float v) {
452    float unused;
453    return fract(v, &unused);
454}
455FN_FUNC_FN(fract)
456
457extern float __attribute__((overloadable)) frexp(float, int *);
458FN_FUNC_FN_PIN(frexp)
459
460extern float __attribute__((overloadable)) hypot(float, float);
461FN_FUNC_FN_FN(hypot)
462
463extern int __attribute__((overloadable)) ilogb(float);
464IN_FUNC_FN(ilogb)
465
466extern float __attribute__((overloadable)) ldexp(float, int);
467FN_FUNC_FN_IN(ldexp)
468FN_FUNC_FN_I(ldexp)
469
470extern float __attribute__((overloadable)) lgamma(float);
471FN_FUNC_FN(lgamma)
472extern float __attribute__((overloadable)) lgamma(float, int*);
473FN_FUNC_FN_PIN(lgamma)
474
475extern float __attribute__((overloadable)) log(float);
476FN_FUNC_FN(log)
477
478extern float __attribute__((overloadable)) log10(float);
479FN_FUNC_FN(log10)
480
481
482extern float __attribute__((overloadable)) log2(float v) {
483    return log10(v) * 3.321928095f;
484}
485FN_FUNC_FN(log2)
486
487extern float __attribute__((overloadable)) log1p(float);
488FN_FUNC_FN(log1p)
489
490extern float __attribute__((overloadable)) logb(float);
491FN_FUNC_FN(logb)
492
493extern float __attribute__((overloadable)) mad(float a, float b, float c) {
494    return a * b + c;
495}
496extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) {
497    return a * b + c;
498}
499extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) {
500    return a * b + c;
501}
502extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) {
503    return a * b + c;
504}
505
506extern float __attribute__((overloadable)) modf(float, float *);
507FN_FUNC_FN_PFN(modf);
508
509extern float __attribute__((overloadable)) nan(uint v) {
510    float f[1];
511    uint32_t *ip = (uint32_t *)f;
512    *ip = v | 0x7fc00000;
513    return f[0];
514}
515
516extern float __attribute__((overloadable)) nextafter(float, float);
517FN_FUNC_FN_FN(nextafter)
518
519FN_FUNC_FN_FN(pow)
520
521extern float __attribute__((overloadable)) pown(float v, int p) {
522    /* The mantissa of a float has fewer bits than an int (24 effective vs. 31).
523     * For very large ints, we'll lose whether the exponent is even or odd, making
524     * the selection of a correct sign incorrect.  We correct this.  Use copysign
525     * to handle the negative zero case.
526     */
527    float sign = (p & 0x1) ? copysign(1.f, v) : 1.f;
528    float f = pow(v, (float)p);
529    return copysign(f, sign);
530}
531FN_FUNC_FN_IN(pown)
532
533extern float __attribute__((overloadable)) powr(float v, float p) {
534    return pow(v, p);
535}
536extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) {
537    return pow(v, p);
538}
539extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) {
540    return pow(v, p);
541}
542extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) {
543    return pow(v, p);
544}
545
546extern float __attribute__((overloadable)) remainder(float, float);
547FN_FUNC_FN_FN(remainder)
548
549extern float __attribute__((overloadable)) remquo(float, float, int *);
550FN_FUNC_FN_FN_PIN(remquo)
551
552extern float __attribute__((overloadable)) rint(float);
553FN_FUNC_FN(rint)
554
// rootn: r-th root of v, with the special cases handled before falling back
// to pow(v, 1.f / r).
//   r == 0               -> +infinity
//   v == +/-0, r < 0     -> infinity (signed like v when r is odd)
//   v == +/-0, r > 0     -> zero (signed like v when r is odd)
//   finite v < 0, odd r  -> -(pow(-v, 1.f / r))
//   finite v < 0, even r -> nan(0)
// NOTE(review): v == -infinity skips the negative branch and reaches
// pow(v, 1.f / r) directly; confirm that result is the intended one for odd r.
extern float __attribute__((overloadable)) rootn(float v, int r) {
    const bool odd_root = (r & 1) != 0;

    if (r == 0) {
        return posinf();
    }

    if (iszero(v)) {
        if (r < 0) {
            return odd_root ? copysign(posinf(), v) : posinf();
        }
        return odd_root ? copysign(0.f, v) : 0.f;
    }

    if ((v < 0.f) && !isnan(v) && !isinf(v)) {
        if (!odd_root) {
            return nan(0);
        }
        return (-1.f * pow(-1.f * v, 1.f / r));
    }

    return pow(v, 1.f / r);
}
586FN_FUNC_FN_IN(rootn);
587
588extern float __attribute__((overloadable)) round(float);
589FN_FUNC_FN(round)
590
591
// Scalar sqrt is declared only; rsqrt is its reciprocal.
extern float __attribute__((overloadable)) sqrt(float v);
extern float __attribute__((overloadable)) rsqrt(float v) {
    const float root = sqrt(v);
    return 1.f / root;
}
596
597#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME)
598// These functions must be defined here if we are not using the SSE
599// implementation, which includes when we are built as part of the
600// debug runtime (libclcore_debug.bc).
601FN_FUNC_FN(sqrt)
602#else
603extern float2 __attribute__((overloadable)) sqrt(float2);
604extern float3 __attribute__((overloadable)) sqrt(float3);
605extern float4 __attribute__((overloadable)) sqrt(float4);
606#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME)
607
608FN_FUNC_FN(rsqrt)
609
610extern float __attribute__((overloadable)) sin(float);
611FN_FUNC_FN(sin)
612
613extern float __attribute__((overloadable)) sincos(float v, float *cosptr) {
614    *cosptr = cos(v);
615    return sin(v);
616}
617extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) {
618    *cosptr = cos(v);
619    return sin(v);
620}
621extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) {
622    *cosptr = cos(v);
623    return sin(v);
624}
625extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) {
626    *cosptr = cos(v);
627    return sin(v);
628}
629
630extern float __attribute__((overloadable)) sinh(float);
631FN_FUNC_FN(sinh)
632
633extern float __attribute__((overloadable)) sinpi(float v) {
634    return sin(v * M_PI);
635}
636FN_FUNC_FN(sinpi)
637
638extern float __attribute__((overloadable)) tan(float);
639FN_FUNC_FN(tan)
640
641extern float __attribute__((overloadable)) tanh(float);
642FN_FUNC_FN(tanh)
643
644extern float __attribute__((overloadable)) tanpi(float v) {
645    return tan(v * M_PI);
646}
647FN_FUNC_FN(tanpi)
648
649
650extern float __attribute__((overloadable)) tgamma(float);
651FN_FUNC_FN(tgamma)
652
653extern float __attribute__((overloadable)) trunc(float);
654FN_FUNC_FN(trunc)
655
656// Int ops (partial), 6.11.3
657
// Declares the scalar overload typeout fnc(typein) and defines the 2-, 3- and
// 4-wide vector overloads by applying fnc to each component in turn.
#define XN_FUNC_YN(typeout, fnc, typein)                                \
extern typeout __attribute__((overloadable)) fnc(typein);               \
extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) {      \
    typeout##2 out;                                                     \
    out.x = fnc(v.x);                                                   \
    out.y = fnc(v.y);                                                   \
    return out;                                                         \
}                                                                       \
extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) {      \
    typeout##3 out;                                                     \
    out.x = fnc(v.x);                                                   \
    out.y = fnc(v.y);                                                   \
    out.z = fnc(v.z);                                                   \
    return out;                                                         \
}                                                                       \
extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) {      \
    typeout##4 out;                                                     \
    out.x = fnc(v.x);                                                   \
    out.y = fnc(v.y);                                                   \
    out.z = fnc(v.z);                                                   \
    out.w = fnc(v.w);                                                   \
    return out;                                                         \
}
681
682
// Instantiates XN_FUNC_YN for signed char/short/int inputs, each returning
// the unsigned type of the same width.
#define UIN_FUNC_IN(fnc)            \
XN_FUNC_YN(uchar,  fnc, char)       \
XN_FUNC_YN(ushort, fnc, short)      \
XN_FUNC_YN(uint,   fnc, int)
687
// Instantiates XN_FUNC_YN for the six 8/16/32-bit integer types, each
// returning the same type it takes.
#define IN_FUNC_IN(fnc)             \
XN_FUNC_YN(uchar,  fnc, uchar)      \
XN_FUNC_YN(char,   fnc, char)       \
XN_FUNC_YN(ushort, fnc, ushort)     \
XN_FUNC_YN(short,  fnc, short)      \
XN_FUNC_YN(uint,   fnc, uint)       \
XN_FUNC_YN(int,    fnc, int)
695
696
// Defines a two-argument function for one element type. The scalar overload
// returns `body`, an expression written in terms of v1 and v2; the 2-, 3- and
// 4-wide overloads call the scalar overload on matching components.
#define XN_FUNC_XN_XN_BODY(type, fnc, body)                                 \
extern type __attribute__((overloadable)) fnc(type v1, type v2) {           \
    return body;                                                            \
}                                                                           \
extern type##2 __attribute__((overloadable)) fnc(type##2 v1, type##2 v2) {  \
    type##2 out;                                                            \
    out.x = fnc(v1.x, v2.x);                                                \
    out.y = fnc(v1.y, v2.y);                                                \
    return out;                                                             \
}                                                                           \
extern type##3 __attribute__((overloadable)) fnc(type##3 v1, type##3 v2) {  \
    type##3 out;                                                            \
    out.x = fnc(v1.x, v2.x);                                                \
    out.y = fnc(v1.y, v2.y);                                                \
    out.z = fnc(v1.z, v2.z);                                                \
    return out;                                                             \
}                                                                           \
extern type##4 __attribute__((overloadable)) fnc(type##4 v1, type##4 v2) {  \
    type##4 out;                                                            \
    out.x = fnc(v1.x, v2.x);                                                \
    out.y = fnc(v1.y, v2.y);                                                \
    out.z = fnc(v1.z, v2.z);                                                \
    out.w = fnc(v1.w, v2.w);                                                \
    return out;                                                             \
}
726
/* Instantiate XN_FUNC_XN_XN_BODY with the same body for the 8/16/32-bit
 * integer types (signed and unsigned) and for float.
 */
#define IN_FUNC_IN_IN_BODY(fnc, body)     \
    XN_FUNC_XN_XN_BODY(char, fnc, body)   \
    XN_FUNC_XN_XN_BODY(short, fnc, body)  \
    XN_FUNC_XN_XN_BODY(int, fnc, body)    \
    XN_FUNC_XN_XN_BODY(uchar, fnc, body)  \
    XN_FUNC_XN_XN_BODY(ushort, fnc, body) \
    XN_FUNC_XN_XN_BODY(uint, fnc, body)   \
    XN_FUNC_XN_XN_BODY(float, fnc, body)
735
736
737/**
738 * abs
739 */
740extern uint32_t __attribute__((overloadable)) abs(int32_t v) {
741    if (v < 0)
742        return -v;
743    return v;
744}
745extern uint16_t __attribute__((overloadable)) abs(int16_t v) {
746    if (v < 0)
747        return -v;
748    return v;
749}
750extern uint8_t __attribute__((overloadable)) abs(int8_t v) {
751    if (v < 0)
752        return -v;
753    return v;
754}
755
756/**
757 * clz
758 * __builtin_clz only accepts a 32-bit unsigned int, so every input will be
759 * expanded to 32 bits. For our smaller data types, we need to subtract off
760 * these unused top bits (that will be always be composed of zeros).
761 */
762extern uint32_t __attribute__((overloadable)) clz(uint32_t v) {
763    return __builtin_clz(v);
764}
765extern uint16_t __attribute__((overloadable)) clz(uint16_t v) {
766    return __builtin_clz(v) - 16;
767}
768extern uint8_t __attribute__((overloadable)) clz(uint8_t v) {
769    return __builtin_clz(v) - 24;
770}
771extern int32_t __attribute__((overloadable)) clz(int32_t v) {
772    return __builtin_clz(v);
773}
774extern int16_t __attribute__((overloadable)) clz(int16_t v) {
775    return __builtin_clz(((uint32_t)v) & 0x0000ffff) - 16;
776}
777extern int8_t __attribute__((overloadable)) clz(int8_t v) {
778    return __builtin_clz(((uint32_t)v) & 0x000000ff) - 24;
779}
780
781
782UIN_FUNC_IN(abs)
783IN_FUNC_IN(clz)
784
785
786// 6.11.4
787
788
789extern float __attribute__((overloadable)) degrees(float radians) {
790    return radians * (180.f / M_PI);
791}
792extern float2 __attribute__((overloadable)) degrees(float2 radians) {
793    return radians * (180.f / M_PI);
794}
795extern float3 __attribute__((overloadable)) degrees(float3 radians) {
796    return radians * (180.f / M_PI);
797}
798extern float4 __attribute__((overloadable)) degrees(float4 radians) {
799    return radians * (180.f / M_PI);
800}
801
802extern float __attribute__((overloadable)) mix(float start, float stop, float amount) {
803    return start + (stop - start) * amount;
804}
805extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) {
806    return start + (stop - start) * amount;
807}
808extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) {
809    return start + (stop - start) * amount;
810}
811extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) {
812    return start + (stop - start) * amount;
813}
814extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) {
815    return start + (stop - start) * amount;
816}
817extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) {
818    return start + (stop - start) * amount;
819}
820extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) {
821    return start + (stop - start) * amount;
822}
823
824extern float __attribute__((overloadable)) radians(float degrees) {
825    return degrees * (M_PI / 180.f);
826}
827extern float2 __attribute__((overloadable)) radians(float2 degrees) {
828    return degrees * (M_PI / 180.f);
829}
830extern float3 __attribute__((overloadable)) radians(float3 degrees) {
831    return degrees * (M_PI / 180.f);
832}
833extern float4 __attribute__((overloadable)) radians(float4 degrees) {
834    return degrees * (M_PI / 180.f);
835}
836
837extern float __attribute__((overloadable)) step(float edge, float v) {
838    return (v < edge) ? 0.f : 1.f;
839}
840extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) {
841    float2 r;
842    r.x = (v.x < edge.x) ? 0.f : 1.f;
843    r.y = (v.y < edge.y) ? 0.f : 1.f;
844    return r;
845}
846extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) {
847    float3 r;
848    r.x = (v.x < edge.x) ? 0.f : 1.f;
849    r.y = (v.y < edge.y) ? 0.f : 1.f;
850    r.z = (v.z < edge.z) ? 0.f : 1.f;
851    return r;
852}
853extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) {
854    float4 r;
855    r.x = (v.x < edge.x) ? 0.f : 1.f;
856    r.y = (v.y < edge.y) ? 0.f : 1.f;
857    r.z = (v.z < edge.z) ? 0.f : 1.f;
858    r.w = (v.w < edge.w) ? 0.f : 1.f;
859    return r;
860}
861extern float2 __attribute__((overloadable)) step(float2 edge, float v) {
862    float2 r;
863    r.x = (v < edge.x) ? 0.f : 1.f;
864    r.y = (v < edge.y) ? 0.f : 1.f;
865    return r;
866}
867extern float3 __attribute__((overloadable)) step(float3 edge, float v) {
868    float3 r;
869    r.x = (v < edge.x) ? 0.f : 1.f;
870    r.y = (v < edge.y) ? 0.f : 1.f;
871    r.z = (v < edge.z) ? 0.f : 1.f;
872    return r;
873}
874extern float4 __attribute__((overloadable)) step(float4 edge, float v) {
875    float4 r;
876    r.x = (v < edge.x) ? 0.f : 1.f;
877    r.y = (v < edge.y) ? 0.f : 1.f;
878    r.z = (v < edge.z) ? 0.f : 1.f;
879    r.w = (v < edge.w) ? 0.f : 1.f;
880    return r;
881}
882extern float2 __attribute__((overloadable)) step(float edge, float2 v) {
883    float2 r;
884    r.x = (v.x < edge) ? 0.f : 1.f;
885    r.y = (v.y < edge) ? 0.f : 1.f;
886    return r;
887}
888extern float3 __attribute__((overloadable)) step(float edge, float3 v) {
889    float3 r;
890    r.x = (v.x < edge) ? 0.f : 1.f;
891    r.y = (v.y < edge) ? 0.f : 1.f;
892    r.z = (v.z < edge) ? 0.f : 1.f;
893    return r;
894}
895extern float4 __attribute__((overloadable)) step(float edge, float4 v) {
896    float4 r;
897    r.x = (v.x < edge) ? 0.f : 1.f;
898    r.y = (v.y < edge) ? 0.f : 1.f;
899    r.z = (v.z < edge) ? 0.f : 1.f;
900    r.w = (v.w < edge) ? 0.f : 1.f;
901    return r;
902}
903
904extern float __attribute__((overloadable)) sign(float v) {
905    if (v > 0) return 1.f;
906    if (v < 0) return -1.f;
907    return v;
908}
909FN_FUNC_FN(sign)
910
911
912// 6.11.5
913extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) {
914    float3 r;
915    r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
916    r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
917    r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
918    return r;
919}
920
921extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) {
922    float4 r;
923    r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
924    r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
925    r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
926    r.w = 0.f;
927    return r;
928}
929
930#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME)
931// These functions must be defined here if we are not using the SSE
932// implementation, which includes when we are built as part of the
933// debug runtime (libclcore_debug.bc).
934
935extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
936    return lhs * rhs;
937}
938extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
939    return lhs.x*rhs.x + lhs.y*rhs.y;
940}
941extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
942    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
943}
944extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
945    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
946}
947
948extern float __attribute__((overloadable)) length(float v) {
949    return fabs(v);
950}
951extern float __attribute__((overloadable)) length(float2 v) {
952    return sqrt(v.x*v.x + v.y*v.y);
953}
954extern float __attribute__((overloadable)) length(float3 v) {
955    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
956}
957extern float __attribute__((overloadable)) length(float4 v) {
958    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
959}
960
961#else
962
963extern float __attribute__((overloadable)) length(float v);
964extern float __attribute__((overloadable)) length(float2 v);
965extern float __attribute__((overloadable)) length(float3 v);
966extern float __attribute__((overloadable)) length(float4 v);
967
968#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME)
969
970extern float __attribute__((overloadable)) distance(float lhs, float rhs) {
971    return length(lhs - rhs);
972}
973extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) {
974    return length(lhs - rhs);
975}
976extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) {
977    return length(lhs - rhs);
978}
979extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) {
980    return length(lhs - rhs);
981}
982
983/* For the normalization functions, vectors of length 0 should simply be
984 * returned (i.e. all the components of that vector are 0).
985 */
986extern float __attribute__((overloadable)) normalize(float v) {
987    if (v == 0.0f) {
988        return 0.0f;
989    } else if (v < 0.0f) {
990        return -1.0f;
991    } else {
992        return 1.0f;
993    }
994}
995extern float2 __attribute__((overloadable)) normalize(float2 v) {
996    float l = length(v);
997    return l == 0.0f ? v : v / l;
998}
999extern float3 __attribute__((overloadable)) normalize(float3 v) {
1000    float l = length(v);
1001    return l == 0.0f ? v : v / l;
1002}
1003extern float4 __attribute__((overloadable)) normalize(float4 v) {
1004    float l = length(v);
1005    return l == 0.0f ? v : v / l;
1006}
1007
1008extern float __attribute__((overloadable)) half_sqrt(float v) {
1009    return sqrt(v);
1010}
1011FN_FUNC_FN(half_sqrt)
1012
1013extern float __attribute__((overloadable)) fast_length(float v) {
1014    return fabs(v);
1015}
1016extern float __attribute__((overloadable)) fast_length(float2 v) {
1017    return half_sqrt(v.x*v.x + v.y*v.y);
1018}
1019extern float __attribute__((overloadable)) fast_length(float3 v) {
1020    return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
1021}
1022extern float __attribute__((overloadable)) fast_length(float4 v) {
1023    return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
1024}
1025
1026extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) {
1027    return fast_length(lhs - rhs);
1028}
1029extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) {
1030    return fast_length(lhs - rhs);
1031}
1032extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) {
1033    return fast_length(lhs - rhs);
1034}
1035extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) {
1036    return fast_length(lhs - rhs);
1037}
1038
1039extern float __attribute__((overloadable)) half_rsqrt(float);
1040
1041/* For the normalization functions, vectors of length 0 should simply be
1042 * returned (i.e. all the components of that vector are 0).
1043 */
1044extern float __attribute__((overloadable)) fast_normalize(float v) {
1045    if (v == 0.0f) {
1046        return 0.0f;
1047    } else if (v < 0.0f) {
1048        return -1.0f;
1049    } else {
1050        return 1.0f;
1051    }
1052}
1053// If the length is 0, then rlength should be NaN.
1054extern float2 __attribute__((overloadable)) fast_normalize(float2 v) {
1055    float rlength = half_rsqrt(v.x*v.x + v.y*v.y);
1056    return (rlength == rlength) ? v * rlength : v;
1057}
1058extern float3 __attribute__((overloadable)) fast_normalize(float3 v) {
1059    float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z);
1060    return (rlength == rlength) ? v * rlength : v;
1061}
1062extern float4 __attribute__((overloadable)) fast_normalize(float4 v) {
1063    float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
1064    return (rlength == rlength) ? v * rlength : v;
1065}
1066
/* half_recip: reciprocal of v, computed as a plain float division. */
extern float __attribute__((overloadable)) half_recip(float v) {
    const float reciprocal = 1.f / v;
    return reciprocal;
}
1070
1071/*
1072extern float __attribute__((overloadable)) approx_atan(float x) {
1073    if (x == 0.f)
1074        return 0.f;
1075    if (x < 0.f)
1076        return -1.f * approx_atan(-1.f * x);
1077    if (x > 1.f)
1078        return M_PI_2 - approx_atan(approx_recip(x));
1079    return x * approx_recip(1.f + 0.28f * x*x);
1080}
1081FN_FUNC_FN(approx_atan)
1082*/
1083
/* Overlay of a float and a 32-bit integer, used to read or write the raw bit
 * pattern of a float. */
typedef union
{
    float fv;     /* value viewed as a float */
    int32_t iv;   /* the same storage viewed as a 32-bit integer */
} ieee_float_shape_type;

/* Get a 32 bit int from a float: (i) receives the bit pattern of (d). */
#define GET_FLOAT_WORD(i,d)                     \
do {                                            \
    ieee_float_shape_type get_word_tmp;         \
    get_word_tmp.fv = (d);                      \
    (i) = get_word_tmp.iv;                      \
} while (0)

/* Set a float from a 32 bit int: (d) receives the float whose bit pattern
 * is (i). */
#define SET_FLOAT_WORD(d,i)                     \
do {                                            \
    ieee_float_shape_type set_word_tmp;         \
    set_word_tmp.iv = (i);                      \
    (d) = set_word_tmp.fv;                      \
} while (0)
1107
1108
1109
1110// Valid -125 to 125
1111extern float __attribute__((overloadable)) native_exp2(float v) {
1112    int32_t iv = (int)v;
1113    int32_t x = iv + (iv >> 31); // ~floor(v)
1114    float r = (v - x);
1115
1116    float fo;
1117    SET_FLOAT_WORD(fo, (x + 127) << 23);
1118
1119    r *= 0.694f; // ~ log(e) / log(2)
1120    float r2 = r*r;
1121    float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
1122    return fo * adj;
1123}
1124
1125extern float2 __attribute__((overloadable)) native_exp2(float2 v) {
1126    int2 iv = convert_int2(v);
1127    int2 x = iv + (iv >> (int2)31);//floor(v);
1128    float2 r = (v - convert_float2(x));
1129
1130    x += 127;
1131
1132    float2 fo = (float2)(x << (int2)23);
1133
1134    r *= 0.694f; // ~ log(e) / log(2)
1135    float2 r2 = r*r;
1136    float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
1137    return fo * adj;
1138}
1139
1140extern float4 __attribute__((overloadable)) native_exp2(float4 v) {
1141    int4 iv = convert_int4(v);
1142    int4 x = iv + (iv >> (int4)31);//floor(v);
1143    float4 r = (v - convert_float4(x));
1144
1145    x += 127;
1146
1147    float4 fo = (float4)(x << (int4)23);
1148
1149    r *= 0.694f; // ~ log(e) / log(2)
1150    float4 r2 = r*r;
1151    float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
1152    return fo * adj;
1153}
1154
1155extern float3 __attribute__((overloadable)) native_exp2(float3 v) {
1156    float4 t = 1.f;
1157    t.xyz = v;
1158    return native_exp2(t).xyz;
1159}
1160
1161
1162extern float __attribute__((overloadable)) native_exp(float v) {
1163    return native_exp2(v * 1.442695041f);
1164}
1165extern float2 __attribute__((overloadable)) native_exp(float2 v) {
1166    return native_exp2(v * 1.442695041f);
1167}
1168extern float3 __attribute__((overloadable)) native_exp(float3 v) {
1169    return native_exp2(v * 1.442695041f);
1170}
1171extern float4 __attribute__((overloadable)) native_exp(float4 v) {
1172    return native_exp2(v * 1.442695041f);
1173}
1174
1175extern float __attribute__((overloadable)) native_exp10(float v) {
1176    return native_exp2(v * 3.321928095f);
1177}
1178extern float2 __attribute__((overloadable)) native_exp10(float2 v) {
1179    return native_exp2(v * 3.321928095f);
1180}
1181extern float3 __attribute__((overloadable)) native_exp10(float3 v) {
1182    return native_exp2(v * 3.321928095f);
1183}
1184extern float4 __attribute__((overloadable)) native_exp10(float4 v) {
1185    return native_exp2(v * 3.321928095f);
1186}
1187
1188extern float __attribute__((overloadable)) native_log2(float v) {
1189    int32_t ibits;
1190    GET_FLOAT_WORD(ibits, v);
1191
1192    int32_t e = (ibits >> 23) & 0xff;
1193
1194    ibits &= 0x7fffff;
1195    ibits |= 127 << 23;
1196
1197    float ir;
1198    SET_FLOAT_WORD(ir, ibits);
1199    ir -= 1.5f;
1200    float ir2 = ir*ir;
1201    float adj2 = (0.405465108f / 0.693147181f) +
1202                 ((0.666666667f / 0.693147181f) * ir) -
1203                 ((0.222222222f / 0.693147181f) * ir2) +
1204                 ((0.098765432f / 0.693147181f) * ir*ir2) -
1205                 ((0.049382716f / 0.693147181f) * ir2*ir2) +
1206                 ((0.026337449f / 0.693147181f) * ir*ir2*ir2) -
1207                 ((0.014631916f / 0.693147181f) * ir2*ir2*ir2);
1208    return (float)(e - 127) + adj2;
1209}
1210extern float2 __attribute__((overloadable)) native_log2(float2 v) {
1211    float2 v2 = {native_log2(v.x), native_log2(v.y)};
1212    return v2;
1213}
1214extern float3 __attribute__((overloadable)) native_log2(float3 v) {
1215    float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)};
1216    return v2;
1217}
1218extern float4 __attribute__((overloadable)) native_log2(float4 v) {
1219    float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)};
1220    return v2;
1221}
1222
1223extern float __attribute__((overloadable)) native_log(float v) {
1224    return native_log2(v) * (1.f / 1.442695041f);
1225}
1226extern float2 __attribute__((overloadable)) native_log(float2 v) {
1227    return native_log2(v) * (1.f / 1.442695041f);
1228}
1229extern float3 __attribute__((overloadable)) native_log(float3 v) {
1230    return native_log2(v) * (1.f / 1.442695041f);
1231}
1232extern float4 __attribute__((overloadable)) native_log(float4 v) {
1233    return native_log2(v) * (1.f / 1.442695041f);
1234}
1235
1236extern float __attribute__((overloadable)) native_log10(float v) {
1237    return native_log2(v) * (1.f / 3.321928095f);
1238}
1239extern float2 __attribute__((overloadable)) native_log10(float2 v) {
1240    return native_log2(v) * (1.f / 3.321928095f);
1241}
1242extern float3 __attribute__((overloadable)) native_log10(float3 v) {
1243    return native_log2(v) * (1.f / 3.321928095f);
1244}
1245extern float4 __attribute__((overloadable)) native_log10(float4 v) {
1246    return native_log2(v) * (1.f / 3.321928095f);
1247}
1248
1249
1250extern float __attribute__((overloadable)) native_powr(float v, float y) {
1251    float v2 = native_log2(v);
1252    v2 = fmax(v2 * y, -125.f);
1253    return native_exp2(v2);
1254}
1255extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) {
1256    float2 v2 = native_log2(v);
1257    v2 = fmax(v2 * y, -125.f);
1258    return native_exp2(v2);
1259}
1260extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) {
1261    float3 v2 = native_log2(v);
1262    v2 = fmax(v2 * y, -125.f);
1263    return native_exp2(v2);
1264}
1265extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) {
1266    float4 v2 = native_log2(v);
1267    v2 = fmax(v2 * y, -125.f);
1268    return native_exp2(v2);
1269}
1270
1271extern double __attribute__((overloadable)) min(double v1, double v2) {
1272    return v1 < v2 ? v1 : v2;
1273}
1274
1275extern double2 __attribute__((overloadable)) min(double2 v1, double2 v2) {
1276    double2 r;
1277    r.x = v1.x < v2.x ? v1.x : v2.x;
1278    r.y = v1.y < v2.y ? v1.y : v2.y;
1279    return r;
1280}
1281
1282extern double3 __attribute__((overloadable)) min(double3 v1, double3 v2) {
1283    double3 r;
1284    r.x = v1.x < v2.x ? v1.x : v2.x;
1285    r.y = v1.y < v2.y ? v1.y : v2.y;
1286    r.z = v1.z < v2.z ? v1.z : v2.z;
1287    return r;
1288}
1289
1290extern double4 __attribute__((overloadable)) min(double4 v1, double4 v2) {
1291    double4 r;
1292    r.x = v1.x < v2.x ? v1.x : v2.x;
1293    r.y = v1.y < v2.y ? v1.y : v2.y;
1294    r.z = v1.z < v2.z ? v1.z : v2.z;
1295    r.w = v1.w < v2.w ? v1.w : v2.w;
1296    return r;
1297}
1298
1299extern long __attribute__((overloadable)) min(long v1, long v2) {
1300    return v1 < v2 ? v1 : v2;
1301}
1302extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
1303    long2 r;
1304    r.x = v1.x < v2.x ? v1.x : v2.x;
1305    r.y = v1.y < v2.y ? v1.y : v2.y;
1306    return r;
1307}
1308extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
1309    long3 r;
1310    r.x = v1.x < v2.x ? v1.x : v2.x;
1311    r.y = v1.y < v2.y ? v1.y : v2.y;
1312    r.z = v1.z < v2.z ? v1.z : v2.z;
1313    return r;
1314}
1315extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
1316    long4 r;
1317    r.x = v1.x < v2.x ? v1.x : v2.x;
1318    r.y = v1.y < v2.y ? v1.y : v2.y;
1319    r.z = v1.z < v2.z ? v1.z : v2.z;
1320    r.w = v1.w < v2.w ? v1.w : v2.w;
1321    return r;
1322}
1323
1324extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
1325    return v1 < v2 ? v1 : v2;
1326}
1327extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
1328    ulong2 r;
1329    r.x = v1.x < v2.x ? v1.x : v2.x;
1330    r.y = v1.y < v2.y ? v1.y : v2.y;
1331    return r;
1332}
1333extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
1334    ulong3 r;
1335    r.x = v1.x < v2.x ? v1.x : v2.x;
1336    r.y = v1.y < v2.y ? v1.y : v2.y;
1337    r.z = v1.z < v2.z ? v1.z : v2.z;
1338    return r;
1339}
1340extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
1341    ulong4 r;
1342    r.x = v1.x < v2.x ? v1.x : v2.x;
1343    r.y = v1.y < v2.y ? v1.y : v2.y;
1344    r.z = v1.z < v2.z ? v1.z : v2.z;
1345    r.w = v1.w < v2.w ? v1.w : v2.w;
1346    return r;
1347}
1348
1349extern double __attribute__((overloadable)) max(double v1, double v2) {
1350    return v1 > v2 ? v1 : v2;
1351}
1352
1353extern double2 __attribute__((overloadable)) max(double2 v1, double2 v2) {
1354    double2 r;
1355    r.x = v1.x > v2.x ? v1.x : v2.x;
1356    r.y = v1.y > v2.y ? v1.y : v2.y;
1357    return r;
1358}
1359
1360extern double3 __attribute__((overloadable)) max(double3 v1, double3 v2) {
1361    double3 r;
1362    r.x = v1.x > v2.x ? v1.x : v2.x;
1363    r.y = v1.y > v2.y ? v1.y : v2.y;
1364    r.z = v1.z > v2.z ? v1.z : v2.z;
1365    return r;
1366}
1367
1368extern double4 __attribute__((overloadable)) max(double4 v1, double4 v2) {
1369    double4 r;
1370    r.x = v1.x > v2.x ? v1.x : v2.x;
1371    r.y = v1.y > v2.y ? v1.y : v2.y;
1372    r.z = v1.z > v2.z ? v1.z : v2.z;
1373    r.w = v1.w > v2.w ? v1.w : v2.w;
1374    return r;
1375}
1376
1377extern long __attribute__((overloadable)) max(long v1, long v2) {
1378    return v1 > v2 ? v1 : v2;
1379}
1380extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
1381    long2 r;
1382    r.x = v1.x > v2.x ? v1.x : v2.x;
1383    r.y = v1.y > v2.y ? v1.y : v2.y;
1384    return r;
1385}
1386extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
1387    long3 r;
1388    r.x = v1.x > v2.x ? v1.x : v2.x;
1389    r.y = v1.y > v2.y ? v1.y : v2.y;
1390    r.z = v1.z > v2.z ? v1.z : v2.z;
1391    return r;
1392}
1393extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
1394    long4 r;
1395    r.x = v1.x > v2.x ? v1.x : v2.x;
1396    r.y = v1.y > v2.y ? v1.y : v2.y;
1397    r.z = v1.z > v2.z ? v1.z : v2.z;
1398    r.w = v1.w > v2.w ? v1.w : v2.w;
1399    return r;
1400}
1401
1402extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
1403    return v1 > v2 ? v1 : v2;
1404}
1405extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
1406    ulong2 r;
1407    r.x = v1.x > v2.x ? v1.x : v2.x;
1408    r.y = v1.y > v2.y ? v1.y : v2.y;
1409    return r;
1410}
1411extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
1412    ulong3 r;
1413    r.x = v1.x > v2.x ? v1.x : v2.x;
1414    r.y = v1.y > v2.y ? v1.y : v2.y;
1415    r.z = v1.z > v2.z ? v1.z : v2.z;
1416    return r;
1417}
1418extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
1419    ulong4 r;
1420    r.x = v1.x > v2.x ? v1.x : v2.x;
1421    r.y = v1.y > v2.y ? v1.y : v2.y;
1422    r.z = v1.z > v2.z ? v1.z : v2.z;
1423    r.w = v1.w > v2.w ? v1.w : v2.w;
1424    return r;
1425}
1426
1427#define THUNK_NATIVE_F(fn) \
1428    float __attribute__((overloadable)) native_##fn(float v) { return fn(v);} \
1429    float2 __attribute__((overloadable)) native_##fn(float2 v) { return fn(v);} \
1430    float3 __attribute__((overloadable)) native_##fn(float3 v) { return fn(v);} \
1431    float4 __attribute__((overloadable)) native_##fn(float4 v) { return fn(v);}
1432
1433#define THUNK_NATIVE_F_F(fn) \
1434    float __attribute__((overloadable)) native_##fn(float v1, float v2) { return fn(v1, v2);} \
1435    float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 v2) { return fn(v1, v2);} \
1436    float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 v2) { return fn(v1, v2);} \
1437    float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 v2) { return fn(v1, v2);}
1438
1439#define THUNK_NATIVE_F_FP(fn) \
1440    float __attribute__((overloadable)) native_##fn(float v1, float *v2) { return fn(v1, v2);} \
1441    float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 *v2) { return fn(v1, v2);} \
1442    float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 *v2) { return fn(v1, v2);} \
1443    float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 *v2) { return fn(v1, v2);}
1444
1445#define THUNK_NATIVE_F_I(fn) \
1446    float __attribute__((overloadable)) native_##fn(float v1, int v2) { return fn(v1, v2);} \
1447    float2 __attribute__((overloadable)) native_##fn(float2 v1, int2 v2) { return fn(v1, v2);} \
1448    float3 __attribute__((overloadable)) native_##fn(float3 v1, int3 v2) { return fn(v1, v2);} \
1449    float4 __attribute__((overloadable)) native_##fn(float4 v1, int4 v2) { return fn(v1, v2);}
1450
1451THUNK_NATIVE_F(acos)
1452THUNK_NATIVE_F(acosh)
1453THUNK_NATIVE_F(acospi)
1454THUNK_NATIVE_F(asin)
1455THUNK_NATIVE_F(asinh)
1456THUNK_NATIVE_F(asinpi)
1457THUNK_NATIVE_F(atan)
1458THUNK_NATIVE_F_F(atan2)
1459THUNK_NATIVE_F(atanh)
1460THUNK_NATIVE_F(atanpi)
1461THUNK_NATIVE_F_F(atan2pi)
1462THUNK_NATIVE_F(cbrt)
1463THUNK_NATIVE_F(cos)
1464THUNK_NATIVE_F(cosh)
1465THUNK_NATIVE_F(cospi)
1466THUNK_NATIVE_F(expm1)
1467THUNK_NATIVE_F_F(hypot)
1468THUNK_NATIVE_F(log1p)
1469THUNK_NATIVE_F_I(rootn)
1470THUNK_NATIVE_F(rsqrt)
1471THUNK_NATIVE_F(sqrt)
1472THUNK_NATIVE_F(sin)
1473THUNK_NATIVE_F_FP(sincos)
1474THUNK_NATIVE_F(sinh)
1475THUNK_NATIVE_F(sinpi)
1476THUNK_NATIVE_F(tan)
1477THUNK_NATIVE_F(tanh)
1478THUNK_NATIVE_F(tanpi)
1479
1480#undef THUNK_NATIVE_F
1481#undef THUNK_NATIVE_F_F
1482#undef THUNK_NATIVE_F_I
1483#undef THUNK_NATIVE_F_FP
1484
1485float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);}
1486float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);}
1487float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);}
1488float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);}
1489
1490float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);}
1491float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);}
1492float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);}
1493float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);}
1494
1495float __attribute__((overloadable)) native_length(float v) { return fast_length(v);}
1496float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);}
1497float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);}
1498float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);}
1499
1500float __attribute__((overloadable)) native_divide(float v1, float v2) { return v1 / v2;}
1501float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;}
1502float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;}
1503float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;}
1504
1505float __attribute__((overloadable)) native_recip(float v) { return 1.f / v;}
1506float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;}
1507float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;}
1508float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;}
1509
1510
1511
1512
1513
/* Retire the function-generator helper macros so that they are not visible
 * to the rest of the file. */

/* Float generators. */
#undef FN_FUNC_FN
#undef FN_FUNC_FN_F
#undef FN_FUNC_FN_I
#undef FN_FUNC_FN_FN
#undef FN_FUNC_FN_IN
#undef FN_FUNC_FN_PFN
#undef FN_FUNC_FN_PIN
#undef FN_FUNC_FN_FN_FN
#undef FN_FUNC_FN_FN_PIN

/* Integer and mixed-type generators. */
#undef IN_FUNC_FN
#undef IN_FUNC_IN
#undef UIN_FUNC_IN
#undef XN_FUNC_YN
#undef IN_FUNC_IN_IN_BODY
#undef XN_FUNC_XN_XN_BODY
1529
1530typedef union {
1531  half hval;
1532  short sval;
1533} fp16_shape_type;
1534
1535/* half h = unsigned short s; */
1536#define SET_HALF_WORD(h, s) \
1537do {                        \
1538  fp16_shape_type fp16_u;   \
1539  fp16_u.sval = (s);        \
1540  (h) = fp16_u.hval;        \
1541} while (0)
1542
1543static const unsigned short kHalfPositiveInfinity = 0x7c00;
1544
/* Define f16 functions of the form
 *     HN output = fn(HN input)
 * where HN is scalar or vector half type.
 * Each overload widens its argument to float, calls the float overload of
 * fn, and narrows the result back to half.
 */
#define HN_FUNC_HN(fn)                                                    \
extern half __attribute__((overloadable)) fn(half h) {                    \
    const float wide = (float) h;                                         \
    return (half) fn(wide);                                               \
}                                                                         \
extern half2 __attribute__((overloadable)) fn(half2 v) {                  \
    const float2 wide = convert_float2(v);                                \
    return convert_half2(fn(wide));                                       \
}                                                                         \
extern half3 __attribute__((overloadable)) fn(half3 v) {                  \
    const float3 wide = convert_float3(v);                                \
    return convert_half3(fn(wide));                                       \
}                                                                         \
extern half4 __attribute__((overloadable)) fn(half4 v) {                  \
    const float4 wide = convert_float4(v);                                \
    return convert_half4(fn(wide));                                       \
}
1562
/* Define f16 functions of the form
 *     HN output = fn(HN input1, HN input2)
 * where HN is scalar or vector half type.
 * Each overload widens both arguments to float, calls the float overload of
 * fn, and narrows the result back to half.
 */
#define HN_FUNC_HN_HN(fn)                                                 \
extern half __attribute__((overloadable)) fn(half h1, half h2) {          \
    const float wide1 = (float) h1;                                       \
    const float wide2 = (float) h2;                                       \
    return (half) fn(wide1, wide2);                                       \
}                                                                         \
extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) {       \
    const float2 wide1 = convert_float2(v1);                              \
    const float2 wide2 = convert_float2(v2);                              \
    return convert_half2(fn(wide1, wide2));                               \
}                                                                         \
extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) {       \
    const float3 wide1 = convert_float3(v1);                              \
    const float3 wide2 = convert_float3(v2);                              \
    return convert_half3(fn(wide1, wide2));                               \
}                                                                         \
extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) {       \
    const float4 wide1 = convert_float4(v1);                              \
    const float4 wide2 = convert_float4(v2);                              \
    return convert_half4(fn(wide1, wide2));                               \
}
1583
/* Define f16 functions of the form
 *     HN output = fn(HN input1, half input2)
 * where HN is a vector half type (no scalar overload is generated).
 * The vector argument is widened with convert_floatN and the scalar with a
 * float cast; the float overload of fn is called and the result is narrowed
 * back to half.
 */
#define HN_FUNC_HN_H(fn)                                                  \
extern half2 __attribute__((overloadable)) fn(half2 v1, half v2) {        \
    const float2 wide1 = convert_float2(v1);                              \
    const float wide2 = (float) v2;                                       \
    return convert_half2(fn(wide1, wide2));                               \
}                                                                         \
extern half3 __attribute__((overloadable)) fn(half3 v1, half v2) {        \
    const float3 wide1 = convert_float3(v1);                              \
    const float wide2 = (float) v2;                                       \
    return convert_half3(fn(wide1, wide2));                               \
}                                                                         \
extern half4 __attribute__((overloadable)) fn(half4 v1, half v2) {        \
    const float4 wide1 = convert_float4(v1);                              \
    const float wide2 = (float) v2;                                       \
    return convert_half4(fn(wide1, wide2));                               \
}
1598
/* Generates the f16 overloads
 *     HN output = fn(HN input1, HN input2, HN input3)
 * for HN in {half, half2, half3, half4}.  All three operands are widened to
 * the float type of the same width before fn is called, and the result is
 * narrowed back to the half type.
 */
#define HN_FUNC_HN_HN_HN(fn)                                                   \
extern half __attribute__((overloadable)) fn(half a, half b, half c) {         \
    float wide_a = (float) a;                                                  \
    float wide_b = (float) b;                                                  \
    float wide_c = (float) c;                                                  \
    return (half) fn(wide_a, wide_b, wide_c);                                  \
}                                                                              \
extern half2 __attribute__((overloadable)) fn(half2 a, half2 b, half2 c) {     \
    float2 wide_a = convert_float2(a);                                         \
    float2 wide_b = convert_float2(b);                                         \
    float2 wide_c = convert_float2(c);                                         \
    return convert_half2(fn(wide_a, wide_b, wide_c));                          \
}                                                                              \
extern half3 __attribute__((overloadable)) fn(half3 a, half3 b, half3 c) {     \
    float3 wide_a = convert_float3(a);                                         \
    float3 wide_b = convert_float3(b);                                         \
    float3 wide_c = convert_float3(c);                                         \
    return convert_half3(fn(wide_a, wide_b, wide_c));                          \
}                                                                              \
extern half4 __attribute__((overloadable)) fn(half4 a, half4 b, half4 c) {     \
    float4 wide_a = convert_float4(a);                                         \
    float4 wide_b = convert_float4(b);                                         \
    float4 wide_c = convert_float4(c);                                         \
    return convert_half4(fn(wide_a, wide_b, wide_c));                          \
}
1622
/* Generates the f16 overloads
 *     HN output = fn(HN input1, IN input2)
 * for HN in {half, half2, half3, half4}, with IN the int type of the same
 * vector length.  Only the half operand is widened to float; the integer
 * operand is passed to fn untouched.
 */
#define HN_FUNC_HN_IN(fn)                                                 \
extern half __attribute__((overloadable)) fn(half value, int n) {         \
    float wide = (float) value;                                           \
    return (half) fn(wide, n);                                            \
}                                                                         \
extern half2 __attribute__((overloadable)) fn(half2 value, int2 n) {      \
    float2 wide = convert_float2(value);                                  \
    return convert_half2(fn(wide, n));                                    \
}                                                                         \
extern half3 __attribute__((overloadable)) fn(half3 value, int3 n) {      \
    float3 wide = convert_float3(value);                                  \
    return convert_half3(fn(wide, n));                                    \
}                                                                         \
extern half4 __attribute__((overloadable)) fn(half4 value, int4 n) {      \
    float4 wide = convert_float4(value);                                  \
    return convert_half4(fn(wide, n));                                    \
}
1641
/* Generates the f16 overloads
 *     half output = fn(HN input1)
 * for HN in {half, half2, half3, half4}.  The operand is widened to the
 * float type of the same width, fn is called on it, and the result of that
 * call is narrowed to a scalar half.
 */
#define H_FUNC_HN(fn)                                                     \
extern half __attribute__((overloadable)) fn(half value) {                \
    float wide = (float) value;                                           \
    return (half) fn(wide);                                               \
}                                                                         \
extern half __attribute__((overloadable)) fn(half2 value) {               \
    float2 wide = convert_float2(value);                                  \
    return (half) fn(wide);                                               \
}                                                                         \
extern half __attribute__((overloadable)) fn(half3 value) {               \
    float3 wide = convert_float3(value);                                  \
    return (half) fn(wide);                                               \
}                                                                         \
extern half __attribute__((overloadable)) fn(half4 value) {               \
    float4 wide = convert_float4(value);                                  \
    return (half) fn(wide);                                               \
}
1659
/* Generates the f16 overloads
 *     half output = fn(HN input1, HN input2)
 * for HN in {half, half2, half3, half4}.  Both operands are widened to the
 * float type of the same width, fn is called on them, and the result of that
 * call is narrowed to a scalar half.
 */
#define H_FUNC_HN_HN(fn)                                                  \
extern half __attribute__((overloadable)) fn(half lhs, half rhs) {        \
    float wide_lhs = (float) lhs;                                         \
    float wide_rhs = (float) rhs;                                         \
    return (half) fn(wide_lhs, wide_rhs);                                 \
}                                                                         \
extern half __attribute__((overloadable)) fn(half2 lhs, half2 rhs) {      \
    float2 wide_lhs = convert_float2(lhs);                                \
    float2 wide_rhs = convert_float2(rhs);                                \
    return (half) fn(wide_lhs, wide_rhs);                                 \
}                                                                         \
extern half __attribute__((overloadable)) fn(half3 lhs, half3 rhs) {      \
    float3 wide_lhs = convert_float3(lhs);                                \
    float3 wide_rhs = convert_float3(rhs);                                \
    return (half) fn(wide_lhs, wide_rhs);                                 \
}                                                                         \
extern half __attribute__((overloadable)) fn(half4 lhs, half4 rhs) {      \
    float4 wide_lhs = convert_float4(lhs);                                \
    float4 wide_rhs = convert_float4(rhs);                                \
    return (half) fn(wide_lhs, wide_rhs);                                 \
}
1677
/* Generates the f16 vector overloads
 *     HN output = fn(HN input1, HN input2)
 * for HN in {half2, half3, half4}.  Each overload applies the scalar
 * overload of the same name lane by lane, so a scalar fn(half, half) must be
 * declared before this macro is instantiated.
 *
 * The final line deliberately carries no line continuation: a trailing
 * backslash would splice whatever line follows the macro into its body.
 */
#define SCALARIZE_HN_FUNC_HN_HN(fn)                                       \
extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) {       \
  half2 ret;                                                              \
  ret.x = fn(v1.x, v2.x);                                                 \
  ret.y = fn(v1.y, v2.y);                                                 \
  return ret;                                                             \
}                                                                         \
extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) {       \
  half3 ret;                                                              \
  ret.x = fn(v1.x, v2.x);                                                 \
  ret.y = fn(v1.y, v2.y);                                                 \
  ret.z = fn(v1.z, v2.z);                                                 \
  return ret;                                                             \
}                                                                         \
extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) {       \
  half4 ret;                                                              \
  ret.x = fn(v1.x, v2.x);                                                 \
  ret.y = fn(v1.y, v2.y);                                                 \
  ret.z = fn(v1.z, v2.z);                                                 \
  ret.w = fn(v1.w, v2.w);                                                 \
  return ret;                                                             \
}
1705
1706HN_FUNC_HN(acos);
1707HN_FUNC_HN(acosh);
1708HN_FUNC_HN(acospi);
1709HN_FUNC_HN(asin);
1710HN_FUNC_HN(asinh);
1711HN_FUNC_HN(asinpi);
1712HN_FUNC_HN(atan);
1713HN_FUNC_HN(atanh);
1714HN_FUNC_HN(atanpi);
1715HN_FUNC_HN_HN(atan2);
1716HN_FUNC_HN_HN(atan2pi);
1717
1718HN_FUNC_HN(cbrt);
1719HN_FUNC_HN(ceil);
1720
1721// TODO Add copysign
1722
1723HN_FUNC_HN(cos);
1724HN_FUNC_HN(cosh);
1725HN_FUNC_HN(cospi);
1726
1727extern half3 __attribute__((overloadable)) cross(half3 lhs, half3 rhs) {
1728    half3 r;
1729    r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
1730    r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
1731    r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
1732    return r;
1733}
1734
1735extern half4 __attribute__((overloadable)) cross(half4 lhs, half4 rhs) {
1736    half4 r;
1737    r.x = lhs.y * rhs.z  - lhs.z * rhs.y;
1738    r.y = lhs.z * rhs.x  - lhs.x * rhs.z;
1739    r.z = lhs.x * rhs.y  - lhs.y * rhs.x;
1740    r.w = 0.f;
1741    return r;
1742}
1743
1744HN_FUNC_HN(degrees);
1745H_FUNC_HN_HN(distance);
1746H_FUNC_HN_HN(dot);
1747
1748HN_FUNC_HN(erf);
1749HN_FUNC_HN(erfc);
1750HN_FUNC_HN(exp);
1751HN_FUNC_HN(exp10);
1752HN_FUNC_HN(exp2);
1753HN_FUNC_HN(expm1);
1754
1755HN_FUNC_HN(fabs);
1756HN_FUNC_HN_HN(fdim);
1757HN_FUNC_HN(floor);
1758HN_FUNC_HN_HN_HN(fma);
1759HN_FUNC_HN_HN(fmax);
1760HN_FUNC_HN_H(fmax);
1761HN_FUNC_HN_HN(fmin);
1762HN_FUNC_HN_H(fmin);
1763HN_FUNC_HN_HN(fmod);
1764
1765// TODO Add (both variants) of fract
1766// TODO Add frexp
1767
1768HN_FUNC_HN_HN(hypot);
1769
1770// TODO Add ilogb
1771
1772HN_FUNC_HN_IN(ldexp);
1773extern half2 __attribute__((overloadable)) ldexp(half2 v, int exponent) {
1774    return convert_half2(ldexp(convert_float2(v), exponent));
1775}
1776extern half3 __attribute__((overloadable)) ldexp(half3 v, int exponent) {
1777    return convert_half3(ldexp(convert_float3(v), exponent));
1778}
1779extern half4 __attribute__((overloadable)) ldexp(half4 v, int exponent) {
1780    return convert_half4(ldexp(convert_float4(v), exponent));
1781}
1782
1783H_FUNC_HN(length);
1784HN_FUNC_HN(lgamma);
1785
1786extern half __attribute__((overloadable)) lgamma(half h, int *signp) {
1787    return (half) lgamma((float) h, signp);
1788}
1789extern half2 __attribute__((overloadable)) lgamma(half2 v, int2 *signp) {
1790    return convert_half2(lgamma(convert_float2(v), signp));
1791}
1792extern half3 __attribute__((overloadable)) lgamma(half3 v, int3 *signp) {
1793    return convert_half3(lgamma(convert_float3(v), signp));
1794}
1795extern half4 __attribute__((overloadable)) lgamma(half4 v, int4 *signp) {
1796    return convert_half4(lgamma(convert_float4(v), signp));
1797}
1798
1799HN_FUNC_HN(log);
1800HN_FUNC_HN(log10);
1801HN_FUNC_HN(log1p);
1802HN_FUNC_HN(log2);
1803HN_FUNC_HN(logb);
1804
1805HN_FUNC_HN_HN_HN(mad);
1806HN_FUNC_HN_HN(max);
1807HN_FUNC_HN_H(max); // TODO can this be arch-specific similar to _Z3maxDv2_ff?
1808HN_FUNC_HN_HN(min);
1809HN_FUNC_HN_H(min); // TODO can this be arch-specific similar to _Z3minDv2_ff?
1810
1811extern half __attribute__((overloadable)) mix(half start, half stop, half amount) {
1812    return start + (stop - start) * amount;
1813}
1814extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half2 amount) {
1815    return start + (stop - start) * amount;
1816}
1817extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half3 amount) {
1818    return start + (stop - start) * amount;
1819}
1820extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half4 amount) {
1821    return start + (stop - start) * amount;
1822}
1823extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half amount) {
1824    return start + (stop - start) * amount;
1825}
1826extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half amount) {
1827    return start + (stop - start) * amount;
1828}
1829extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half amount) {
1830    return start + (stop - start) * amount;
1831}
1832
1833// TODO Define modf.  Does it make sense to delegate to the float?
1834
1835half __attribute__((overloadable)) nan_half() {
1836  unsigned short nan_short = kHalfPositiveInfinity | 0x0200;
1837  half nan;
1838  SET_HALF_WORD(nan, nan_short);
1839  return nan;
1840}
1841
1842// TODO Add nextafter
1843
1844HN_FUNC_HN(normalize);
1845
1846HN_FUNC_HN_HN(pow);
1847HN_FUNC_HN_IN(pown);
1848HN_FUNC_HN_HN(powr);
1849HN_FUNC_HN(radians);
1850HN_FUNC_HN_HN(remainder);
1851
1852extern half __attribute__((overloadable)) remquo(half n, half d, int *quo) {
1853    return (float) remquo((float) n, (float) d, quo);
1854}
1855extern half2 __attribute__((overloadable)) remquo(half2 n, half2 d, int2 *quo) {
1856    return convert_half2(remquo(convert_float2(d), convert_float2(n), quo));
1857}
1858extern half3 __attribute__((overloadable)) remquo(half3 n, half3 d, int3 *quo) {
1859    return convert_half3(remquo(convert_float3(d), convert_float3(n), quo));
1860}
1861extern half4 __attribute__((overloadable)) remquo(half4 n, half4 d, int4 *quo) {
1862    return convert_half4(remquo(convert_float4(d), convert_float4(n), quo));
1863}
1864
1865HN_FUNC_HN(rint);
1866HN_FUNC_HN_IN(rootn);
1867HN_FUNC_HN(round);
1868HN_FUNC_HN(rsqrt);
1869
1870extern half __attribute__((overloadable)) sign(half h) {
1871    if (h > 0) return (half) 1.f;
1872    if (h < 0) return (half) -1.f;
1873    return h;
1874}
1875extern half2 __attribute__((overloadable)) sign(half2 v) {
1876    half2 ret;
1877    ret.x = sign(v.x);
1878    ret.y = sign(v.y);
1879    return ret;
1880}
1881extern half3 __attribute__((overloadable)) sign(half3 v) {
1882    half3 ret;
1883    ret.x = sign(v.x);
1884    ret.y = sign(v.y);
1885    ret.z = sign(v.z);
1886    return ret;
1887}
1888extern half4 __attribute__((overloadable)) sign(half4 v) {
1889    half4 ret;
1890    ret.x = sign(v.x);
1891    ret.y = sign(v.y);
1892    ret.z = sign(v.z);
1893    ret.w = sign(v.w);
1894    return ret;
1895}
1896
1897HN_FUNC_HN(sin);
1898
1899extern half __attribute__((overloadable)) sincos(half v, half *cosptr) {
1900    *cosptr = cos(v);
1901    return sin(v);
1902}
1903// TODO verify if LLVM eliminates the duplicate convert_float2
1904extern half2 __attribute__((overloadable)) sincos(half2 v, half2 *cosptr) {
1905    *cosptr = cos(v);
1906    return sin(v);
1907}
1908extern half3 __attribute__((overloadable)) sincos(half3 v, half3 *cosptr) {
1909    *cosptr = cos(v);
1910    return sin(v);
1911}
1912extern half4 __attribute__((overloadable)) sincos(half4 v, half4 *cosptr) {
1913    *cosptr = cos(v);
1914    return sin(v);
1915}
1916
1917HN_FUNC_HN(sinh);
1918HN_FUNC_HN(sinpi);
1919HN_FUNC_HN(sqrt);
1920
1921extern half __attribute__((overloadable)) step(half edge, half v) {
1922    return (v < edge) ? 0.f : 1.f;
1923}
1924extern half2 __attribute__((overloadable)) step(half2 edge, half2 v) {
1925    half2 r;
1926    r.x = (v.x < edge.x) ? 0.f : 1.f;
1927    r.y = (v.y < edge.y) ? 0.f : 1.f;
1928    return r;
1929}
1930extern half3 __attribute__((overloadable)) step(half3 edge, half3 v) {
1931    half3 r;
1932    r.x = (v.x < edge.x) ? 0.f : 1.f;
1933    r.y = (v.y < edge.y) ? 0.f : 1.f;
1934    r.z = (v.z < edge.z) ? 0.f : 1.f;
1935    return r;
1936}
1937extern half4 __attribute__((overloadable)) step(half4 edge, half4 v) {
1938    half4 r;
1939    r.x = (v.x < edge.x) ? 0.f : 1.f;
1940    r.y = (v.y < edge.y) ? 0.f : 1.f;
1941    r.z = (v.z < edge.z) ? 0.f : 1.f;
1942    r.w = (v.w < edge.w) ? 0.f : 1.f;
1943    return r;
1944}
1945extern half2 __attribute__((overloadable)) step(half2 edge, half v) {
1946    half2 r;
1947    r.x = (v < edge.x) ? 0.f : 1.f;
1948    r.y = (v < edge.y) ? 0.f : 1.f;
1949    return r;
1950}
1951extern half3 __attribute__((overloadable)) step(half3 edge, half v) {
1952    half3 r;
1953    r.x = (v < edge.x) ? 0.f : 1.f;
1954    r.y = (v < edge.y) ? 0.f : 1.f;
1955    r.z = (v < edge.z) ? 0.f : 1.f;
1956    return r;
1957}
1958extern half4 __attribute__((overloadable)) step(half4 edge, half v) {
1959    half4 r;
1960    r.x = (v < edge.x) ? 0.f : 1.f;
1961    r.y = (v < edge.y) ? 0.f : 1.f;
1962    r.z = (v < edge.z) ? 0.f : 1.f;
1963    r.w = (v < edge.w) ? 0.f : 1.f;
1964    return r;
1965}
1966extern half2 __attribute__((overloadable)) step(half edge, half2 v) {
1967    half2 r;
1968    r.x = (v.x < edge) ? 0.f : 1.f;
1969    r.y = (v.y < edge) ? 0.f : 1.f;
1970    return r;
1971}
1972extern half3 __attribute__((overloadable)) step(half edge, half3 v) {
1973    half3 r;
1974    r.x = (v.x < edge) ? 0.f : 1.f;
1975    r.y = (v.y < edge) ? 0.f : 1.f;
1976    r.z = (v.z < edge) ? 0.f : 1.f;
1977    return r;
1978}
1979extern half4 __attribute__((overloadable)) step(half edge, half4 v) {
1980    half4 r;
1981    r.x = (v.x < edge) ? 0.f : 1.f;
1982    r.y = (v.y < edge) ? 0.f : 1.f;
1983    r.z = (v.z < edge) ? 0.f : 1.f;
1984    r.w = (v.w < edge) ? 0.f : 1.f;
1985    return r;
1986}
1987
1988HN_FUNC_HN(tan);
1989HN_FUNC_HN(tanh);
1990HN_FUNC_HN(tanpi);
1991HN_FUNC_HN(tgamma);
1992HN_FUNC_HN(trunc); // TODO: rethink: needs half-specific implementation?
1993
1994HN_FUNC_HN(native_acos);
1995HN_FUNC_HN(native_acosh);
1996HN_FUNC_HN(native_acospi);
1997HN_FUNC_HN(native_asin);
1998HN_FUNC_HN(native_asinh);
1999HN_FUNC_HN(native_asinpi);
2000HN_FUNC_HN(native_atan);
2001HN_FUNC_HN(native_atanh);
2002HN_FUNC_HN(native_atanpi);
2003HN_FUNC_HN_HN(native_atan2);
2004HN_FUNC_HN_HN(native_atan2pi);
2005
2006HN_FUNC_HN(native_cbrt);
2007HN_FUNC_HN(native_cos);
2008HN_FUNC_HN(native_cosh);
2009HN_FUNC_HN(native_cospi);
2010
2011H_FUNC_HN_HN(native_distance);
2012HN_FUNC_HN_HN(native_divide);
2013
2014HN_FUNC_HN(native_exp);
2015HN_FUNC_HN(native_exp10);
2016HN_FUNC_HN(native_exp2);
2017HN_FUNC_HN(native_expm1);
2018
2019HN_FUNC_HN_HN(native_hypot);
2020H_FUNC_HN(native_length);
2021
2022HN_FUNC_HN(native_log);
2023HN_FUNC_HN(native_log10);
2024HN_FUNC_HN(native_log1p);
2025HN_FUNC_HN(native_log2);
2026
2027HN_FUNC_HN(native_normalize);
2028
2029HN_FUNC_HN_HN(native_powr); // TODO are parameter limits different for half?
2030
2031HN_FUNC_HN(native_recip);
2032HN_FUNC_HN_IN(native_rootn);
2033HN_FUNC_HN(native_rsqrt);
2034
2035HN_FUNC_HN(native_sin);
2036
2037extern half __attribute__((overloadable)) native_sincos(half v, half *cosptr) {
2038    return sincos(v, cosptr);
2039}
2040extern half2 __attribute__((overloadable)) native_sincos(half2 v, half2 *cosptr) {
2041    return sincos(v, cosptr);
2042}
2043extern half3 __attribute__((overloadable)) native_sincos(half3 v, half3 *cosptr) {
2044    return sincos(v, cosptr);
2045}
2046extern half4 __attribute__((overloadable)) native_sincos(half4 v, half4 *cosptr) {
2047    return sincos(v, cosptr);
2048}
2049
2050HN_FUNC_HN(native_sinh);
2051HN_FUNC_HN(native_sinpi);
2052HN_FUNC_HN(native_sqrt);
2053
2054HN_FUNC_HN(native_tan);
2055HN_FUNC_HN(native_tanh);
2056HN_FUNC_HN(native_tanpi);
2057
/* Drop the f16 generator macros so they do not leak into the rest of the
 * file.  Listed in reverse order of their definition.
 */
#undef SCALARIZE_HN_FUNC_HN_HN
#undef H_FUNC_HN_HN
#undef H_FUNC_HN
#undef HN_FUNC_HN_IN
#undef HN_FUNC_HN_HN_HN
#undef HN_FUNC_HN_H
#undef HN_FUNC_HN_HN
#undef HN_FUNC_HN
2066
2067