rs_cl.c revision 9cbc99ba45126a6a30ba13fc6d4e75e51ca14ea7
1#include "rs_core.rsh" 2 3extern float2 __attribute__((overloadable)) convert_float2(int2 c); 4extern float3 __attribute__((overloadable)) convert_float3(int3 c); 5extern float4 __attribute__((overloadable)) convert_float4(int4 c); 6 7extern int2 __attribute__((overloadable)) convert_int2(float2 c); 8extern int3 __attribute__((overloadable)) convert_int3(float3 c); 9extern int4 __attribute__((overloadable)) convert_int4(float4 c); 10 11 12extern float __attribute__((overloadable)) fmin(float v, float v2); 13extern float2 __attribute__((overloadable)) fmin(float2 v, float v2); 14extern float3 __attribute__((overloadable)) fmin(float3 v, float v2); 15extern float4 __attribute__((overloadable)) fmin(float4 v, float v2); 16 17extern float __attribute__((overloadable)) fmax(float v, float v2); 18extern float2 __attribute__((overloadable)) fmax(float2 v, float v2); 19extern float3 __attribute__((overloadable)) fmax(float3 v, float v2); 20extern float4 __attribute__((overloadable)) fmax(float4 v, float v2); 21 22// Float ops, 6.11.2 23 24#define FN_FUNC_FN(fnc) \ 25extern float2 __attribute__((overloadable)) fnc(float2 v) { \ 26 float2 r; \ 27 r.x = fnc(v.x); \ 28 r.y = fnc(v.y); \ 29 return r; \ 30} \ 31extern float3 __attribute__((overloadable)) fnc(float3 v) { \ 32 float3 r; \ 33 r.x = fnc(v.x); \ 34 r.y = fnc(v.y); \ 35 r.z = fnc(v.z); \ 36 return r; \ 37} \ 38extern float4 __attribute__((overloadable)) fnc(float4 v) { \ 39 float4 r; \ 40 r.x = fnc(v.x); \ 41 r.y = fnc(v.y); \ 42 r.z = fnc(v.z); \ 43 r.w = fnc(v.w); \ 44 return r; \ 45} 46 47#define IN_FUNC_FN(fnc) \ 48extern int2 __attribute__((overloadable)) fnc(float2 v) { \ 49 int2 r; \ 50 r.x = fnc(v.x); \ 51 r.y = fnc(v.y); \ 52 return r; \ 53} \ 54extern int3 __attribute__((overloadable)) fnc(float3 v) { \ 55 int3 r; \ 56 r.x = fnc(v.x); \ 57 r.y = fnc(v.y); \ 58 r.z = fnc(v.z); \ 59 return r; \ 60} \ 61extern int4 __attribute__((overloadable)) fnc(float4 v) { \ 62 int4 r; \ 63 r.x = fnc(v.x); \ 64 r.y = fnc(v.y); \ 65 r.z = fnc(v.z); \ 66 r.w = fnc(v.w); \ 67 return r; \ 68} 69 70#define FN_FUNC_FN_FN(fnc) \ 71extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \ 72 float2 r; \ 73 r.x = fnc(v1.x, v2.x); \ 74 r.y = fnc(v1.y, v2.y); \ 75 return r; \ 76} \ 77extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \ 78 float3 r; \ 79 r.x = fnc(v1.x, v2.x); \ 80 r.y = fnc(v1.y, v2.y); \ 81 r.z = fnc(v1.z, v2.z); \ 82 return r; \ 83} \ 84extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \ 85 float4 r; \ 86 r.x = fnc(v1.x, v2.x); \ 87 r.y = fnc(v1.y, v2.y); \ 88 r.z = fnc(v1.z, v2.z); \ 89 r.w = fnc(v1.w, v2.w); \ 90 return r; \ 91} 92 93#define FN_FUNC_FN_F(fnc) \ 94extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) { \ 95 float2 r; \ 96 r.x = fnc(v1.x, v2); \ 97 r.y = fnc(v1.y, v2); \ 98 return r; \ 99} \ 100extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) { \ 101 float3 r; \ 102 r.x = fnc(v1.x, v2); \ 103 r.y = fnc(v1.y, v2); \ 104 r.z = fnc(v1.z, v2); \ 105 return r; \ 106} \ 107extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) { \ 108 float4 r; \ 109 r.x = fnc(v1.x, v2); \ 110 r.y = fnc(v1.y, v2); \ 111 r.z = fnc(v1.z, v2); \ 112 r.w = fnc(v1.w, v2); \ 113 return r; \ 114} 115 116#define FN_FUNC_FN_IN(fnc) \ 117extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) { \ 118 float2 r; \ 119 r.x = fnc(v1.x, v2.x); \ 120 r.y = fnc(v1.y, v2.y); \ 121 return r; \ 122} \ 123extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) { \ 124 float3 r; \ 125 r.x = fnc(v1.x, v2.x); \ 126 r.y = fnc(v1.y, v2.y); \ 127 r.z = fnc(v1.z, v2.z); \ 128 return r; \ 129} \ 130extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) { \ 131 float4 r; \ 132 r.x = fnc(v1.x, v2.x); \ 133 r.y = fnc(v1.y, v2.y); \ 134 r.z = fnc(v1.z, v2.z); \ 135 r.w = fnc(v1.w, v2.w); \ 136 return r; \ 137} 138 139#define FN_FUNC_FN_I(fnc) \ 140extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) { \ 141 float2 r; \ 142 r.x = fnc(v1.x, v2); \ 143 r.y = fnc(v1.y, v2); \ 144 return r; \ 145} \ 146extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) { \ 147 float3 r; \ 148 r.x = fnc(v1.x, v2); \ 149 r.y = fnc(v1.y, v2); \ 150 r.z = fnc(v1.z, v2); \ 151 return r; \ 152} \ 153extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) { \ 154 float4 r; \ 155 r.x = fnc(v1.x, v2); \ 156 r.y = fnc(v1.y, v2); \ 157 r.z = fnc(v1.z, v2); \ 158 r.w = fnc(v1.w, v2); \ 159 return r; \ 160} 161 162#define FN_FUNC_FN_PFN(fnc) \ 163extern float2 __attribute__((overloadable)) \ 164 fnc(float2 v1, float2 *v2) { \ 165 float2 r; \ 166 float t[2]; \ 167 r.x = fnc(v1.x, &t[0]); \ 168 r.y = fnc(v1.y, &t[1]); \ 169 v2->x = t[0]; \ 170 v2->y = t[1]; \ 171 return r; \ 172} \ 173extern float3 __attribute__((overloadable)) \ 174 fnc(float3 v1, float3 *v2) { \ 175 float3 r; \ 176 float t[3]; \ 177 r.x = fnc(v1.x, &t[0]); \ 178 r.y = fnc(v1.y, &t[1]); \ 179 r.z = fnc(v1.z, &t[2]); \ 180 v2->x = t[0]; \ 181 v2->y = t[1]; \ 182 v2->z = t[2]; \ 183 return r; \ 184} \ 185extern float4 __attribute__((overloadable)) \ 186 fnc(float4 v1, float4 *v2) { \ 187 float4 r; \ 188 float t[4]; \ 189 r.x = fnc(v1.x, &t[0]); \ 190 r.y = fnc(v1.y, &t[1]); \ 191 r.z = fnc(v1.z, &t[2]); \ 192 r.w = fnc(v1.w, &t[3]); \ 193 v2->x = t[0]; \ 194 v2->y = t[1]; \ 195 v2->z = t[2]; \ 196 v2->w = t[3]; \ 197 return r; \ 198} 199 200#define FN_FUNC_FN_PIN(fnc) \ 201extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) { \ 202 float2 r; \ 203 int t[2]; \ 204 r.x = fnc(v1.x, &t[0]); \ 205 r.y = fnc(v1.y, &t[1]); \ 206 v2->x = t[0]; \ 207 v2->y = t[1]; \ 208 return r; \ 209} \ 210extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) { \ 211 float3 r; \ 212 int t[3]; \ 213 r.x = fnc(v1.x, &t[0]); \ 214 r.y = fnc(v1.y, &t[1]); \ 215 r.z = fnc(v1.z, &t[2]); \ 216 v2->x = t[0]; \ 217 v2->y = t[1]; \ 218 v2->z = t[2]; \ 219 return r; \ 220} \ 221extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) { \ 222 float4 r; \ 223 int t[4]; \ 224 r.x = fnc(v1.x, &t[0]); \ 225 r.y = fnc(v1.y, &t[1]); \ 226 r.z = fnc(v1.z, &t[2]); \ 227 r.w = fnc(v1.w, &t[3]); \ 228 v2->x = t[0]; \ 229 v2->y = t[1]; \ 230 v2->z = t[2]; \ 231 v2->w = t[3]; \ 232 return r; \ 233} 234 235#define FN_FUNC_FN_FN_FN(fnc) \ 236extern float2 __attribute__((overloadable)) \ 237 fnc(float2 v1, float2 v2, float2 v3) { \ 238 float2 r; \ 239 r.x = fnc(v1.x, v2.x, v3.x); \ 240 r.y = fnc(v1.y, v2.y, v3.y); \ 241 return r; \ 242} \ 243extern float3 __attribute__((overloadable)) \ 244 fnc(float3 v1, float3 v2, float3 v3) { \ 245 float3 r; \ 246 r.x = fnc(v1.x, v2.x, v3.x); \ 247 r.y = fnc(v1.y, v2.y, v3.y); \ 248 r.z = fnc(v1.z, v2.z, v3.z); \ 249 return r; \ 250} \ 251extern float4 __attribute__((overloadable)) \ 252 fnc(float4 v1, float4 v2, float4 v3) { \ 253 float4 r; \ 254 r.x = fnc(v1.x, v2.x, v3.x); \ 255 r.y = fnc(v1.y, v2.y, v3.y); \ 256 r.z = fnc(v1.z, v2.z, v3.z); \ 257 r.w = fnc(v1.w, v2.w, v3.w); \ 258 return r; \ 259} 260 261#define FN_FUNC_FN_FN_PIN(fnc) \ 262extern float2 __attribute__((overloadable)) \ 263 fnc(float2 v1, float2 v2, int2 *v3) { \ 264 float2 r; \ 265 int t[2]; \ 266 r.x = fnc(v1.x, v2.x, &t[0]); \ 267 r.y = fnc(v1.y, v2.y, &t[1]); \ 268 v3->x = t[0]; \ 269 v3->y = t[1]; \ 270 return r; \ 271} \ 272extern float3 __attribute__((overloadable)) \ 273 fnc(float3 v1, float3 v2, int3 *v3) { \ 274 float3 r; \ 275 int t[3]; \ 276 r.x = fnc(v1.x, v2.x, &t[0]); \ 277 r.y = fnc(v1.y, v2.y, &t[1]); \ 278 r.z = fnc(v1.z, v2.z, &t[2]); \ 279 v3->x = t[0]; \ 280 v3->y = t[1]; \ 281 v3->z = t[2]; \ 282 return r; \ 283} \ 284extern float4 __attribute__((overloadable)) \ 285 fnc(float4 v1, float4 v2, int4 *v3) { \ 286 float4 r; \ 287 int t[4]; \ 288 r.x = fnc(v1.x, v2.x, &t[0]); \ 289 r.y = fnc(v1.y, v2.y, &t[1]); \ 290 r.z = fnc(v1.z, v2.z, &t[2]); \ 291 r.w = fnc(v1.w, v2.w, &t[3]); \ 292 v3->x = t[0]; \ 293 v3->y = t[1]; \ 294 v3->z = t[2]; \ 295 v3->w = t[3]; \ 296 return r; \ 297} 298 299static const int iposinf = 0x7f800000; 300static const int ineginf = 0xff800000; 301 302static const float posinf() { 303 float f = *((float*)&iposinf); 304 return f; 305} 306 307static const float neginf() { 308 float f = *((float*)&ineginf); 309 return f; 310} 311 312static bool isinf(float f) { 313 int i = *((int*)(void*)&f); 314 return (i == iposinf) || (i == ineginf); 315} 316 317static bool isnan(float f) { 318 int i = *((int*)(void*)&f); 319 return (((i & 0x7f800000) == 0x7f800000) && (i & 0x007fffff)); 320} 321 322static bool isposzero(float f) { 323 int i = *((int*)(void*)&f); 324 return (i == 0x00000000); 325} 326 327static bool isnegzero(float f) { 328 int i = *((int*)(void*)&f); 329 return (i == 0x80000000); 330} 331 332static bool iszero(float f) { 333 return isposzero(f) || isnegzero(f); 334} 335 336 337extern float __attribute__((overloadable)) acos(float); 338FN_FUNC_FN(acos) 339 340extern float __attribute__((overloadable)) acosh(float); 341FN_FUNC_FN(acosh) 342 343 344extern float __attribute__((overloadable)) acospi(float v) { 345 return acos(v) / M_PI; 346} 347FN_FUNC_FN(acospi) 348 349extern float __attribute__((overloadable)) asin(float); 350FN_FUNC_FN(asin) 351 352extern float __attribute__((overloadable)) asinh(float); 353FN_FUNC_FN(asinh) 354 355extern float __attribute__((overloadable)) asinpi(float v) { 356 return asin(v) / M_PI; 357} 358FN_FUNC_FN(asinpi) 359 360extern float __attribute__((overloadable)) atan(float); 361FN_FUNC_FN(atan) 362 363extern float __attribute__((overloadable)) atan2(float, float); 364FN_FUNC_FN_FN(atan2) 365 366extern float __attribute__((overloadable)) atanh(float); 367FN_FUNC_FN(atanh) 368 369extern float __attribute__((overloadable)) atanpi(float v) { 370 return atan(v) / M_PI; 371} 372FN_FUNC_FN(atanpi) 373 374 375extern float __attribute__((overloadable)) atan2pi(float y, float x) { 376 return atan2(y, x) / M_PI; 377} 378FN_FUNC_FN_FN(atan2pi) 379 380extern float __attribute__((overloadable)) cbrt(float); 381FN_FUNC_FN(cbrt) 382 383extern float __attribute__((overloadable)) ceil(float); 384FN_FUNC_FN(ceil) 385 386extern float __attribute__((overloadable)) copysign(float, float); 387FN_FUNC_FN_FN(copysign) 388 389extern float __attribute__((overloadable)) cos(float); 390FN_FUNC_FN(cos) 391 392extern float __attribute__((overloadable)) cosh(float); 393FN_FUNC_FN(cosh) 394 395extern float __attribute__((overloadable)) cospi(float v) { 396 return cos(v * M_PI); 397} 398FN_FUNC_FN(cospi) 399 400extern float __attribute__((overloadable)) erfc(float); 401FN_FUNC_FN(erfc) 402 403extern float __attribute__((overloadable)) erf(float); 404FN_FUNC_FN(erf) 405 406extern float __attribute__((overloadable)) exp(float); 407FN_FUNC_FN(exp) 408 409extern float __attribute__((overloadable)) exp2(float); 410FN_FUNC_FN(exp2) 411 412extern float __attribute__((overloadable)) pow(float, float); 413 414extern float __attribute__((overloadable)) exp10(float v) { 415 return exp2(v * 3.321928095f); 416} 417FN_FUNC_FN(exp10) 418 419extern float __attribute__((overloadable)) expm1(float); 420FN_FUNC_FN(expm1) 421 422extern float __attribute__((overloadable)) fabs(float v) { 423 int i = *((int*)(void*)&v) & 0x7fffffff; 424 return *((float*)(void*)&i); 425} 426FN_FUNC_FN(fabs) 427 428extern float __attribute__((overloadable)) fdim(float, float); 429FN_FUNC_FN_FN(fdim) 430 431extern float __attribute__((overloadable)) floor(float); 432FN_FUNC_FN(floor) 433 434extern float __attribute__((overloadable)) fma(float, float, float); 435FN_FUNC_FN_FN_FN(fma) 436 437extern float __attribute__((overloadable)) fmin(float, float); 438 439extern float __attribute__((overloadable)) fmod(float, float); 440FN_FUNC_FN_FN(fmod) 441 442extern float __attribute__((overloadable)) fract(float v, float *iptr) { 443 int i = (int)floor(v); 444 if (iptr) { 445 iptr[0] = i; 446 } 447 return fmin(v - i, 0x1.fffffep-1f); 448} 449FN_FUNC_FN_PFN(fract) 450 451extern float __attribute__((const, overloadable)) fract(float v) { 452 float unused; 453 return fract(v, &unused); 454} 455FN_FUNC_FN(fract) 456 457extern float __attribute__((overloadable)) frexp(float, int *); 458FN_FUNC_FN_PIN(frexp) 459 460extern float __attribute__((overloadable)) hypot(float, float); 461FN_FUNC_FN_FN(hypot) 462 463extern int __attribute__((overloadable)) ilogb(float); 464IN_FUNC_FN(ilogb) 465 466extern float __attribute__((overloadable)) ldexp(float, int); 467FN_FUNC_FN_IN(ldexp) 468FN_FUNC_FN_I(ldexp) 469 470extern float __attribute__((overloadable)) lgamma(float); 471FN_FUNC_FN(lgamma) 472extern float __attribute__((overloadable)) lgamma(float, int*); 473FN_FUNC_FN_PIN(lgamma) 474 475extern float __attribute__((overloadable)) log(float); 476FN_FUNC_FN(log) 477 478extern float __attribute__((overloadable)) log10(float); 479FN_FUNC_FN(log10) 480 481 482extern float __attribute__((overloadable)) log2(float v) { 483 return log10(v) * 3.321928095f; 484} 485FN_FUNC_FN(log2) 486 487extern float __attribute__((overloadable)) log1p(float); 488FN_FUNC_FN(log1p) 489 490extern float __attribute__((overloadable)) logb(float); 491FN_FUNC_FN(logb) 492 493extern float __attribute__((overloadable)) mad(float a, float b, float c) { 494 return a * b + c; 495} 496extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) { 497 return a * b + c; 498} 499extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) { 500 return a * b + c; 501} 502extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) { 503 return a * b + c; 504} 505 506extern float __attribute__((overloadable)) modf(float, float *); 507FN_FUNC_FN_PFN(modf); 508 509extern float __attribute__((overloadable)) nan(uint v) { 510 float f[1]; 511 uint32_t *ip = (uint32_t *)f; 512 *ip = v | 0x7fc00000; 513 return f[0]; 514} 515 516extern float __attribute__((overloadable)) nextafter(float, float); 517FN_FUNC_FN_FN(nextafter) 518 519FN_FUNC_FN_FN(pow) 520 521extern float __attribute__((overloadable)) pown(float v, int p) { 522 /* The mantissa of a float has fewer bits than an int (24 effective vs. 31). 523 * For very large ints, we'll lose whether the exponent is even or odd, making 524 * the selection of a correct sign incorrect. We correct this. Use copysign 525 * to handle the negative zero case. 526 */ 527 float sign = (p & 0x1) ? copysign(1.f, v) : 1.f; 528 float f = pow(v, (float)p); 529 return copysign(f, sign); 530} 531FN_FUNC_FN_IN(pown) 532 533extern float __attribute__((overloadable)) powr(float v, float p) { 534 return pow(v, p); 535} 536extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) { 537 return pow(v, p); 538} 539extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) { 540 return pow(v, p); 541} 542extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) { 543 return pow(v, p); 544} 545 546extern float __attribute__((overloadable)) remainder(float, float); 547FN_FUNC_FN_FN(remainder) 548 549extern float __attribute__((overloadable)) remquo(float, float, int *); 550FN_FUNC_FN_FN_PIN(remquo) 551 552extern float __attribute__((overloadable)) rint(float); 553FN_FUNC_FN(rint) 554 555extern float __attribute__((overloadable)) rootn(float v, int r) { 556 if (r == 0) { 557 return posinf(); 558 } 559 560 if (iszero(v)) { 561 if (r < 0) { 562 if (r & 1) { 563 return copysign(posinf(), v); 564 } else { 565 return posinf(); 566 } 567 } else { 568 if (r & 1) { 569 return copysign(0.f, v); 570 } else { 571 return 0.f; 572 } 573 } 574 } 575 576 if (!isinf(v) && !isnan(v) && (v < 0.f)) { 577 if (r & 1) { 578 return (-1.f * pow(-1.f * v, 1.f / r)); 579 } else { 580 return nan(0); 581 } 582 } 583 584 return pow(v, 1.f / r); 585} 586FN_FUNC_FN_IN(rootn); 587 588extern float __attribute__((overloadable)) round(float); 589FN_FUNC_FN(round) 590 591 592extern float __attribute__((overloadable)) sqrt(float); 593extern float __attribute__((overloadable)) rsqrt(float v) { 594 return 1.f / sqrt(v); 595} 596 597#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 598// These functions must be defined here if we are not using the SSE 599// implementation, which includes when we are built as part of the 600// debug runtime (libclcore_debug.bc). 601FN_FUNC_FN(sqrt) 602#else 603extern float2 __attribute__((overloadable)) sqrt(float2); 604extern float3 __attribute__((overloadable)) sqrt(float3); 605extern float4 __attribute__((overloadable)) sqrt(float4); 606#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 607 608FN_FUNC_FN(rsqrt) 609 610extern float __attribute__((overloadable)) sin(float); 611FN_FUNC_FN(sin) 612 613extern float __attribute__((overloadable)) sincos(float v, float *cosptr) { 614 *cosptr = cos(v); 615 return sin(v); 616} 617extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) { 618 *cosptr = cos(v); 619 return sin(v); 620} 621extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) { 622 *cosptr = cos(v); 623 return sin(v); 624} 625extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) { 626 *cosptr = cos(v); 627 return sin(v); 628} 629 630extern float __attribute__((overloadable)) sinh(float); 631FN_FUNC_FN(sinh) 632 633extern float __attribute__((overloadable)) sinpi(float v) { 634 return sin(v * M_PI); 635} 636FN_FUNC_FN(sinpi) 637 638extern float __attribute__((overloadable)) tan(float); 639FN_FUNC_FN(tan) 640 641extern float __attribute__((overloadable)) tanh(float); 642FN_FUNC_FN(tanh) 643 644extern float __attribute__((overloadable)) tanpi(float v) { 645 return tan(v * M_PI); 646} 647FN_FUNC_FN(tanpi) 648 649 650extern float __attribute__((overloadable)) tgamma(float); 651FN_FUNC_FN(tgamma) 652 653extern float __attribute__((overloadable)) trunc(float); 654FN_FUNC_FN(trunc) 655 656// Int ops (partial), 6.11.3 657 658#define XN_FUNC_YN(typeout, fnc, typein) \ 659extern typeout __attribute__((overloadable)) fnc(typein); \ 660extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) { \ 661 typeout##2 r; \ 662 r.x = fnc(v.x); \ 663 r.y = fnc(v.y); \ 664 return r; \ 665} \ 666extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) { \ 667 typeout##3 r; \ 668 r.x = fnc(v.x); \ 669 r.y = fnc(v.y); \ 670 r.z = fnc(v.z); \ 671 return r; \ 672} \ 673extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) { \ 674 typeout##4 r; \ 675 r.x = fnc(v.x); \ 676 r.y = fnc(v.y); \ 677 r.z = fnc(v.z); \ 678 r.w = fnc(v.w); \ 679 return r; \ 680} 681 682 683#define UIN_FUNC_IN(fnc) \ 684XN_FUNC_YN(uchar, fnc, char) \ 685XN_FUNC_YN(ushort, fnc, short) \ 686XN_FUNC_YN(uint, fnc, int) 687 688#define IN_FUNC_IN(fnc) \ 689XN_FUNC_YN(uchar, fnc, uchar) \ 690XN_FUNC_YN(char, fnc, char) \ 691XN_FUNC_YN(ushort, fnc, ushort) \ 692XN_FUNC_YN(short, fnc, short) \ 693XN_FUNC_YN(uint, fnc, uint) \ 694XN_FUNC_YN(int, fnc, int) 695 696 697#define XN_FUNC_XN_XN_BODY(type, fnc, body) \ 698extern type __attribute__((overloadable)) \ 699 fnc(type v1, type v2) { \ 700 return body; \ 701} \ 702extern type##2 __attribute__((overloadable)) \ 703 fnc(type##2 v1, type##2 v2) { \ 704 type##2 r; \ 705 r.x = fnc(v1.x, v2.x); \ 706 r.y = fnc(v1.y, v2.y); \ 707 return r; \ 708} \ 709extern type##3 __attribute__((overloadable)) \ 710 fnc(type##3 v1, type##3 v2) { \ 711 type##3 r; \ 712 r.x = fnc(v1.x, v2.x); \ 713 r.y = fnc(v1.y, v2.y); \ 714 r.z = fnc(v1.z, v2.z); \ 715 return r; \ 716} \ 717extern type##4 __attribute__((overloadable)) \ 718 fnc(type##4 v1, type##4 v2) { \ 719 type##4 r; \ 720 r.x = fnc(v1.x, v2.x); \ 721 r.y = fnc(v1.y, v2.y); \ 722 r.z = fnc(v1.z, v2.z); \ 723 r.w = fnc(v1.w, v2.w); \ 724 return r; \ 725} 726 727#define IN_FUNC_IN_IN_BODY(fnc, body) \ 728XN_FUNC_XN_XN_BODY(uchar, fnc, body) \ 729XN_FUNC_XN_XN_BODY(char, fnc, body) \ 730XN_FUNC_XN_XN_BODY(ushort, fnc, body) \ 731XN_FUNC_XN_XN_BODY(short, fnc, body) \ 732XN_FUNC_XN_XN_BODY(uint, fnc, body) \ 733XN_FUNC_XN_XN_BODY(int, fnc, body) \ 734XN_FUNC_XN_XN_BODY(float, fnc, body) 735 736 737/** 738 * abs 739 */ 740extern uint32_t __attribute__((overloadable)) abs(int32_t v) { 741 if (v < 0) 742 return -v; 743 return v; 744} 745extern uint16_t __attribute__((overloadable)) abs(int16_t v) { 746 if (v < 0) 747 return -v; 748 return v; 749} 750extern uint8_t __attribute__((overloadable)) abs(int8_t v) { 751 if (v < 0) 752 return -v; 753 return v; 754} 755 756/** 757 * clz 758 * __builtin_clz only accepts a 32-bit unsigned int, so every input will be 759 * expanded to 32 bits. For our smaller data types, we need to subtract off 760 * these unused top bits (that will be always be composed of zeros). 761 */ 762extern uint32_t __attribute__((overloadable)) clz(uint32_t v) { 763 return __builtin_clz(v); 764} 765extern uint16_t __attribute__((overloadable)) clz(uint16_t v) { 766 return __builtin_clz(v) - 16; 767} 768extern uint8_t __attribute__((overloadable)) clz(uint8_t v) { 769 return __builtin_clz(v) - 24; 770} 771extern int32_t __attribute__((overloadable)) clz(int32_t v) { 772 return __builtin_clz(v); 773} 774extern int16_t __attribute__((overloadable)) clz(int16_t v) { 775 return __builtin_clz(((uint32_t)v) & 0x0000ffff) - 16; 776} 777extern int8_t __attribute__((overloadable)) clz(int8_t v) { 778 return __builtin_clz(((uint32_t)v) & 0x000000ff) - 24; 779} 780 781 782UIN_FUNC_IN(abs) 783IN_FUNC_IN(clz) 784 785 786// 6.11.4 787 788 789extern float __attribute__((overloadable)) degrees(float radians) { 790 return radians * (180.f / M_PI); 791} 792extern float2 __attribute__((overloadable)) degrees(float2 radians) { 793 return radians * (180.f / M_PI); 794} 795extern float3 __attribute__((overloadable)) degrees(float3 radians) { 796 return radians * (180.f / M_PI); 797} 798extern float4 __attribute__((overloadable)) degrees(float4 radians) { 799 return radians * (180.f / M_PI); 800} 801 802extern float __attribute__((overloadable)) mix(float start, float stop, float amount) { 803 return start + (stop - start) * amount; 804} 805extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) { 806 return start + (stop - start) * amount; 807} 808extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) { 809 return start + (stop - start) * amount; 810} 811extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) { 812 return start + (stop - start) * amount; 813} 814extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) { 815 return start + (stop - start) * amount; 816} 817extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) { 818 return start + (stop - start) * amount; 819} 820extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) { 821 return start + (stop - start) * amount; 822} 823 824extern float __attribute__((overloadable)) radians(float degrees) { 825 return degrees * (M_PI / 180.f); 826} 827extern float2 __attribute__((overloadable)) radians(float2 degrees) { 828 return degrees * (M_PI / 180.f); 829} 830extern float3 __attribute__((overloadable)) radians(float3 degrees) { 831 return degrees * (M_PI / 180.f); 832} 833extern float4 __attribute__((overloadable)) radians(float4 degrees) { 834 return degrees * (M_PI / 180.f); 835} 836 837extern float __attribute__((overloadable)) step(float edge, float v) { 838 return (v < edge) ? 0.f : 1.f; 839} 840extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) { 841 float2 r; 842 r.x = (v.x < edge.x) ? 0.f : 1.f; 843 r.y = (v.y < edge.y) ? 0.f : 1.f; 844 return r; 845} 846extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) { 847 float3 r; 848 r.x = (v.x < edge.x) ? 0.f : 1.f; 849 r.y = (v.y < edge.y) ? 0.f : 1.f; 850 r.z = (v.z < edge.z) ? 0.f : 1.f; 851 return r; 852} 853extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) { 854 float4 r; 855 r.x = (v.x < edge.x) ? 0.f : 1.f; 856 r.y = (v.y < edge.y) ? 0.f : 1.f; 857 r.z = (v.z < edge.z) ? 0.f : 1.f; 858 r.w = (v.w < edge.w) ? 0.f : 1.f; 859 return r; 860} 861extern float2 __attribute__((overloadable)) step(float2 edge, float v) { 862 float2 r; 863 r.x = (v < edge.x) ? 0.f : 1.f; 864 r.y = (v < edge.y) ? 0.f : 1.f; 865 return r; 866} 867extern float3 __attribute__((overloadable)) step(float3 edge, float v) { 868 float3 r; 869 r.x = (v < edge.x) ? 0.f : 1.f; 870 r.y = (v < edge.y) ? 0.f : 1.f; 871 r.z = (v < edge.z) ? 0.f : 1.f; 872 return r; 873} 874extern float4 __attribute__((overloadable)) step(float4 edge, float v) { 875 float4 r; 876 r.x = (v < edge.x) ? 0.f : 1.f; 877 r.y = (v < edge.y) ? 0.f : 1.f; 878 r.z = (v < edge.z) ? 0.f : 1.f; 879 r.w = (v < edge.w) ? 0.f : 1.f; 880 return r; 881} 882extern float2 __attribute__((overloadable)) step(float edge, float2 v) { 883 float2 r; 884 r.x = (v.x < edge) ? 0.f : 1.f; 885 r.y = (v.y < edge) ? 0.f : 1.f; 886 return r; 887} 888extern float3 __attribute__((overloadable)) step(float edge, float3 v) { 889 float3 r; 890 r.x = (v.x < edge) ? 0.f : 1.f; 891 r.y = (v.y < edge) ? 0.f : 1.f; 892 r.z = (v.z < edge) ? 0.f : 1.f; 893 return r; 894} 895extern float4 __attribute__((overloadable)) step(float edge, float4 v) { 896 float4 r; 897 r.x = (v.x < edge) ? 0.f : 1.f; 898 r.y = (v.y < edge) ? 0.f : 1.f; 899 r.z = (v.z < edge) ? 0.f : 1.f; 900 r.w = (v.w < edge) ? 0.f : 1.f; 901 return r; 902} 903 904extern float __attribute__((overloadable)) sign(float v) { 905 if (v > 0) return 1.f; 906 if (v < 0) return -1.f; 907 return v; 908} 909FN_FUNC_FN(sign) 910 911 912// 6.11.5 913extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) { 914 float3 r; 915 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 916 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 917 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 918 return r; 919} 920 921extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) { 922 float4 r; 923 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 924 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 925 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 926 r.w = 0.f; 927 return r; 928} 929 930#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 931// These functions must be defined here if we are not using the SSE 932// implementation, which includes when we are built as part of the 933// debug runtime (libclcore_debug.bc). 934 935extern float __attribute__((overloadable)) dot(float lhs, float rhs) { 936 return lhs * rhs; 937} 938extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) { 939 return lhs.x*rhs.x + lhs.y*rhs.y; 940} 941extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) { 942 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z; 943} 944extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) { 945 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w; 946} 947 948extern float __attribute__((overloadable)) length(float v) { 949 return fabs(v); 950} 951extern float __attribute__((overloadable)) length(float2 v) { 952 return sqrt(v.x*v.x + v.y*v.y); 953} 954extern float __attribute__((overloadable)) length(float3 v) { 955 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z); 956} 957extern float __attribute__((overloadable)) length(float4 v) { 958 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 959} 960 961#else 962 963extern float __attribute__((overloadable)) length(float v); 964extern float __attribute__((overloadable)) length(float2 v); 965extern float __attribute__((overloadable)) length(float3 v); 966extern float __attribute__((overloadable)) length(float4 v); 967 968#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 969 970extern float __attribute__((overloadable)) distance(float lhs, float rhs) { 971 return length(lhs - rhs); 972} 973extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) { 974 return length(lhs - rhs); 975} 976extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) { 977 return length(lhs - rhs); 978} 979extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) { 980 return length(lhs - rhs); 981} 982 983/* For the normalization functions, vectors of length 0 should simply be 984 * returned (i.e. all the components of that vector are 0). 985 */ 986extern float __attribute__((overloadable)) normalize(float v) { 987 if (v == 0.0f) { 988 return 0.0f; 989 } else if (v < 0.0f) { 990 return -1.0f; 991 } else { 992 return 1.0f; 993 } 994} 995extern float2 __attribute__((overloadable)) normalize(float2 v) { 996 float l = length(v); 997 return l == 0.0f ? v : v / l; 998} 999extern float3 __attribute__((overloadable)) normalize(float3 v) { 1000 float l = length(v); 1001 return l == 0.0f ? v : v / l; 1002} 1003extern float4 __attribute__((overloadable)) normalize(float4 v) { 1004 float l = length(v); 1005 return l == 0.0f ? v : v / l; 1006} 1007 1008extern float __attribute__((overloadable)) half_sqrt(float v) { 1009 return sqrt(v); 1010} 1011FN_FUNC_FN(half_sqrt) 1012 1013extern float __attribute__((overloadable)) fast_length(float v) { 1014 return fabs(v); 1015} 1016extern float __attribute__((overloadable)) fast_length(float2 v) { 1017 return half_sqrt(v.x*v.x + v.y*v.y); 1018} 1019extern float __attribute__((overloadable)) fast_length(float3 v) { 1020 return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1021} 1022extern float __attribute__((overloadable)) fast_length(float4 v) { 1023 return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1024} 1025 1026extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) { 1027 return fast_length(lhs - rhs); 1028} 1029extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) { 1030 return fast_length(lhs - rhs); 1031} 1032extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) { 1033 return fast_length(lhs - rhs); 1034} 1035extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) { 1036 return fast_length(lhs - rhs); 1037} 1038 1039extern float __attribute__((overloadable)) half_rsqrt(float); 1040 1041/* For the normalization functions, vectors of length 0 should simply be 1042 * returned (i.e. all the components of that vector are 0). 1043 */ 1044extern float __attribute__((overloadable)) fast_normalize(float v) { 1045 if (v == 0.0f) { 1046 return 0.0f; 1047 } else if (v < 0.0f) { 1048 return -1.0f; 1049 } else { 1050 return 1.0f; 1051 } 1052} 1053// If the length is 0, then rlength should be NaN. 1054extern float2 __attribute__((overloadable)) fast_normalize(float2 v) { 1055 float rlength = half_rsqrt(v.x*v.x + v.y*v.y); 1056 return (rlength == rlength) ? v * rlength : v; 1057} 1058extern float3 __attribute__((overloadable)) fast_normalize(float3 v) { 1059 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1060 return (rlength == rlength) ? v * rlength : v; 1061} 1062extern float4 __attribute__((overloadable)) fast_normalize(float4 v) { 1063 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1064 return (rlength == rlength) ? v * rlength : v; 1065} 1066 1067extern float __attribute__((overloadable)) half_recip(float v) { 1068 return 1.f / v; 1069} 1070 1071/* 1072extern float __attribute__((overloadable)) approx_atan(float x) { 1073 if (x == 0.f) 1074 return 0.f; 1075 if (x < 0.f) 1076 return -1.f * approx_atan(-1.f * x); 1077 if (x > 1.f) 1078 return M_PI_2 - approx_atan(approx_recip(x)); 1079 return x * approx_recip(1.f + 0.28f * x*x); 1080} 1081FN_FUNC_FN(approx_atan) 1082*/ 1083 1084typedef union 1085{ 1086 float fv; 1087 int32_t iv; 1088} ieee_float_shape_type; 1089 1090/* Get a 32 bit int from a float. */ 1091 1092#define GET_FLOAT_WORD(i,d) \ 1093do { \ 1094 ieee_float_shape_type gf_u; \ 1095 gf_u.fv = (d); \ 1096 (i) = gf_u.iv; \ 1097} while (0) 1098 1099/* Set a float from a 32 bit int. */ 1100 1101#define SET_FLOAT_WORD(d,i) \ 1102do { \ 1103 ieee_float_shape_type sf_u; \ 1104 sf_u.iv = (i); \ 1105 (d) = sf_u.fv; \ 1106} while (0) 1107 1108 1109 1110// Valid -125 to 125 1111extern float __attribute__((overloadable)) native_exp2(float v) { 1112 int32_t iv = (int)v; 1113 int32_t x = iv + (iv >> 31); // ~floor(v) 1114 float r = (v - x); 1115 1116 float fo; 1117 SET_FLOAT_WORD(fo, (x + 127) << 23); 1118 1119 r *= 0.694f; // ~ log(e) / log(2) 1120 float r2 = r*r; 1121 float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1122 return fo * adj; 1123} 1124 1125extern float2 __attribute__((overloadable)) native_exp2(float2 v) { 1126 int2 iv = convert_int2(v); 1127 int2 x = iv + (iv >> (int2)31);//floor(v); 1128 float2 r = (v - convert_float2(x)); 1129 1130 x += 127; 1131 1132 float2 fo = (float2)(x << (int2)23); 1133 1134 r *= 0.694f; // ~ log(e) / log(2) 1135 float2 r2 = r*r; 1136 float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1137 return fo * adj; 1138} 1139 1140extern float4 __attribute__((overloadable)) native_exp2(float4 v) { 1141 int4 iv = convert_int4(v); 1142 int4 x = iv + (iv >> (int4)31);//floor(v); 1143 float4 r = (v - convert_float4(x)); 1144 1145 x += 127; 1146 1147 float4 fo = (float4)(x << (int4)23); 1148 1149 r *= 0.694f; // ~ log(e) / log(2) 1150 float4 r2 = r*r; 1151 float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1152 return fo * adj; 1153} 1154 1155extern float3 __attribute__((overloadable)) native_exp2(float3 v) { 1156 float4 t = 1.f; 1157 t.xyz = v; 1158 return native_exp2(t).xyz; 1159} 1160 1161 1162extern float __attribute__((overloadable)) native_exp(float v) { 1163 return native_exp2(v * 1.442695041f); 1164} 1165extern float2 __attribute__((overloadable)) native_exp(float2 v) { 1166 return native_exp2(v * 1.442695041f); 1167} 1168extern float3 __attribute__((overloadable)) native_exp(float3 v) { 1169 return native_exp2(v * 1.442695041f); 1170} 1171extern float4 __attribute__((overloadable)) native_exp(float4 v) { 1172 return native_exp2(v * 1.442695041f); 1173} 1174 1175extern float __attribute__((overloadable)) native_exp10(float v) { 1176 return native_exp2(v * 3.321928095f); 1177} 1178extern float2 __attribute__((overloadable)) native_exp10(float2 v) { 1179 return native_exp2(v * 3.321928095f); 1180} 1181extern float3 __attribute__((overloadable)) native_exp10(float3 v) { 1182 return native_exp2(v * 3.321928095f); 1183} 1184extern float4 __attribute__((overloadable)) native_exp10(float4 v) { 1185 return native_exp2(v * 3.321928095f); 1186} 1187 1188extern float __attribute__((overloadable)) native_log2(float v) { 1189 int32_t ibits; 1190 GET_FLOAT_WORD(ibits, v); 1191 1192 int32_t e = (ibits >> 23) & 0xff; 1193 1194 ibits &= 0x7fffff; 1195 ibits |= 127 << 23; 1196 1197 float ir; 1198 SET_FLOAT_WORD(ir, ibits); 1199 ir -= 1.5f; 1200 float ir2 = ir*ir; 1201 float adj2 = (0.405465108f / 0.693147181f) + 1202 ((0.666666667f / 0.693147181f) * ir) - 1203 ((0.222222222f / 0.693147181f) * ir2) + 1204 ((0.098765432f / 0.693147181f) * ir*ir2) - 1205 ((0.049382716f / 0.693147181f) * ir2*ir2) + 1206 ((0.026337449f / 0.693147181f) * ir*ir2*ir2) - 1207 ((0.014631916f / 0.693147181f) * ir2*ir2*ir2); 1208 return (float)(e - 127) + adj2; 1209} 1210extern float2 __attribute__((overloadable)) native_log2(float2 v) { 1211 float2 v2 = {native_log2(v.x), native_log2(v.y)}; 1212 return v2; 1213} 1214extern float3 __attribute__((overloadable)) native_log2(float3 v) { 1215 float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)}; 1216 return v2; 1217} 1218extern float4 __attribute__((overloadable)) native_log2(float4 v) { 1219 float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)}; 1220 return v2; 1221} 1222 1223extern float __attribute__((overloadable)) native_log(float v) { 1224 return native_log2(v) * (1.f / 1.442695041f); 1225} 1226extern float2 __attribute__((overloadable)) native_log(float2 v) { 1227 return native_log2(v) * (1.f / 1.442695041f); 1228} 1229extern float3 __attribute__((overloadable)) native_log(float3 v) { 1230 return native_log2(v) * (1.f / 1.442695041f); 1231} 1232extern float4 __attribute__((overloadable)) native_log(float4 v) { 1233 return native_log2(v) * (1.f / 1.442695041f); 1234} 1235 1236extern float __attribute__((overloadable)) native_log10(float v) { 1237 return native_log2(v) * (1.f / 3.321928095f); 1238} 1239extern float2 __attribute__((overloadable)) native_log10(float2 v) { 1240 return native_log2(v) * (1.f / 3.321928095f); 1241} 1242extern float3 __attribute__((overloadable)) native_log10(float3 v) { 1243 return native_log2(v) * (1.f / 3.321928095f); 1244} 1245extern float4 __attribute__((overloadable)) native_log10(float4 v) { 1246 return native_log2(v) * (1.f / 3.321928095f); 1247} 1248 1249 1250extern float __attribute__((overloadable)) native_powr(float v, float y) { 1251 float v2 = native_log2(v); 1252 v2 = fmax(v2 * y, -125.f); 1253 return native_exp2(v2); 1254} 1255extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) { 1256 float2 v2 = native_log2(v); 1257 v2 = fmax(v2 * y, -125.f); 1258 return native_exp2(v2); 1259} 1260extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) { 1261 float3 v2 = native_log2(v); 1262 v2 = fmax(v2 * y, -125.f); 1263 return native_exp2(v2); 1264} 1265extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) { 1266 float4 v2 = native_log2(v); 1267 v2 = fmax(v2 * y, -125.f); 1268 return native_exp2(v2); 1269} 1270 1271extern double __attribute__((overloadable)) min(double v1, double v2) { 1272 return v1 < v2 ? v1 : v2; 1273} 1274 1275extern double2 __attribute__((overloadable)) min(double2 v1, double2 v2) { 1276 double2 r; 1277 r.x = v1.x < v2.x ? v1.x : v2.x; 1278 r.y = v1.y < v2.y ? v1.y : v2.y; 1279 return r; 1280} 1281 1282extern double3 __attribute__((overloadable)) min(double3 v1, double3 v2) { 1283 double3 r; 1284 r.x = v1.x < v2.x ? v1.x : v2.x; 1285 r.y = v1.y < v2.y ? v1.y : v2.y; 1286 r.z = v1.z < v2.z ? v1.z : v2.z; 1287 return r; 1288} 1289 1290extern double4 __attribute__((overloadable)) min(double4 v1, double4 v2) { 1291 double4 r; 1292 r.x = v1.x < v2.x ? v1.x : v2.x; 1293 r.y = v1.y < v2.y ? v1.y : v2.y; 1294 r.z = v1.z < v2.z ? v1.z : v2.z; 1295 r.w = v1.w < v2.w ? v1.w : v2.w; 1296 return r; 1297} 1298 1299extern long __attribute__((overloadable)) min(long v1, long v2) { 1300 return v1 < v2 ? v1 : v2; 1301} 1302extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) { 1303 long2 r; 1304 r.x = v1.x < v2.x ? v1.x : v2.x; 1305 r.y = v1.y < v2.y ? v1.y : v2.y; 1306 return r; 1307} 1308extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) { 1309 long3 r; 1310 r.x = v1.x < v2.x ? v1.x : v2.x; 1311 r.y = v1.y < v2.y ? v1.y : v2.y; 1312 r.z = v1.z < v2.z ? v1.z : v2.z; 1313 return r; 1314} 1315extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) { 1316 long4 r; 1317 r.x = v1.x < v2.x ? v1.x : v2.x; 1318 r.y = v1.y < v2.y ? v1.y : v2.y; 1319 r.z = v1.z < v2.z ? v1.z : v2.z; 1320 r.w = v1.w < v2.w ? v1.w : v2.w; 1321 return r; 1322} 1323 1324extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) { 1325 return v1 < v2 ? v1 : v2; 1326} 1327extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) { 1328 ulong2 r; 1329 r.x = v1.x < v2.x ? v1.x : v2.x; 1330 r.y = v1.y < v2.y ? v1.y : v2.y; 1331 return r; 1332} 1333extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) { 1334 ulong3 r; 1335 r.x = v1.x < v2.x ? v1.x : v2.x; 1336 r.y = v1.y < v2.y ? v1.y : v2.y; 1337 r.z = v1.z < v2.z ? v1.z : v2.z; 1338 return r; 1339} 1340extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) { 1341 ulong4 r; 1342 r.x = v1.x < v2.x ? v1.x : v2.x; 1343 r.y = v1.y < v2.y ? v1.y : v2.y; 1344 r.z = v1.z < v2.z ? v1.z : v2.z; 1345 r.w = v1.w < v2.w ? v1.w : v2.w; 1346 return r; 1347} 1348 1349extern double __attribute__((overloadable)) max(double v1, double v2) { 1350 return v1 > v2 ? v1 : v2; 1351} 1352 1353extern double2 __attribute__((overloadable)) max(double2 v1, double2 v2) { 1354 double2 r; 1355 r.x = v1.x > v2.x ? v1.x : v2.x; 1356 r.y = v1.y > v2.y ? v1.y : v2.y; 1357 return r; 1358} 1359 1360extern double3 __attribute__((overloadable)) max(double3 v1, double3 v2) { 1361 double3 r; 1362 r.x = v1.x > v2.x ? v1.x : v2.x; 1363 r.y = v1.y > v2.y ? v1.y : v2.y; 1364 r.z = v1.z > v2.z ? v1.z : v2.z; 1365 return r; 1366} 1367 1368extern double4 __attribute__((overloadable)) max(double4 v1, double4 v2) { 1369 double4 r; 1370 r.x = v1.x > v2.x ? v1.x : v2.x; 1371 r.y = v1.y > v2.y ? v1.y : v2.y; 1372 r.z = v1.z > v2.z ? v1.z : v2.z; 1373 r.w = v1.w > v2.w ? v1.w : v2.w; 1374 return r; 1375} 1376 1377extern long __attribute__((overloadable)) max(long v1, long v2) { 1378 return v1 > v2 ? v1 : v2; 1379} 1380extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) { 1381 long2 r; 1382 r.x = v1.x > v2.x ? v1.x : v2.x; 1383 r.y = v1.y > v2.y ? v1.y : v2.y; 1384 return r; 1385} 1386extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) { 1387 long3 r; 1388 r.x = v1.x > v2.x ? v1.x : v2.x; 1389 r.y = v1.y > v2.y ? v1.y : v2.y; 1390 r.z = v1.z > v2.z ? v1.z : v2.z; 1391 return r; 1392} 1393extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) { 1394 long4 r; 1395 r.x = v1.x > v2.x ? v1.x : v2.x; 1396 r.y = v1.y > v2.y ? v1.y : v2.y; 1397 r.z = v1.z > v2.z ? v1.z : v2.z; 1398 r.w = v1.w > v2.w ? v1.w : v2.w; 1399 return r; 1400} 1401 1402extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) { 1403 return v1 > v2 ? v1 : v2; 1404} 1405extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) { 1406 ulong2 r; 1407 r.x = v1.x > v2.x ? v1.x : v2.x; 1408 r.y = v1.y > v2.y ? v1.y : v2.y; 1409 return r; 1410} 1411extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) { 1412 ulong3 r; 1413 r.x = v1.x > v2.x ? v1.x : v2.x; 1414 r.y = v1.y > v2.y ? v1.y : v2.y; 1415 r.z = v1.z > v2.z ? v1.z : v2.z; 1416 return r; 1417} 1418extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) { 1419 ulong4 r; 1420 r.x = v1.x > v2.x ? v1.x : v2.x; 1421 r.y = v1.y > v2.y ? v1.y : v2.y; 1422 r.z = v1.z > v2.z ? v1.z : v2.z; 1423 r.w = v1.w > v2.w ? v1.w : v2.w; 1424 return r; 1425} 1426 1427#define THUNK_NATIVE_F(fn) \ 1428 float __attribute__((overloadable)) native_##fn(float v) { return fn(v);} \ 1429 float2 __attribute__((overloadable)) native_##fn(float2 v) { return fn(v);} \ 1430 float3 __attribute__((overloadable)) native_##fn(float3 v) { return fn(v);} \ 1431 float4 __attribute__((overloadable)) native_##fn(float4 v) { return fn(v);} 1432 1433#define THUNK_NATIVE_F_F(fn) \ 1434 float __attribute__((overloadable)) native_##fn(float v1, float v2) { return fn(v1, v2);} \ 1435 float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 v2) { return fn(v1, v2);} \ 1436 float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 v2) { return fn(v1, v2);} \ 1437 float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 v2) { return fn(v1, v2);} 1438 1439#define THUNK_NATIVE_F_FP(fn) \ 1440 float __attribute__((overloadable)) native_##fn(float v1, float *v2) { return fn(v1, v2);} \ 1441 float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 *v2) { return fn(v1, v2);} \ 1442 float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 *v2) { return fn(v1, v2);} \ 1443 float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 *v2) { return fn(v1, v2);} 1444 1445#define THUNK_NATIVE_F_I(fn) \ 1446 float __attribute__((overloadable)) native_##fn(float v1, int v2) { return fn(v1, v2);} \ 1447 float2 __attribute__((overloadable)) native_##fn(float2 v1, int2 v2) { return fn(v1, v2);} \ 1448 float3 __attribute__((overloadable)) native_##fn(float3 v1, int3 v2) { return fn(v1, v2);} \ 1449 float4 __attribute__((overloadable)) native_##fn(float4 v1, int4 v2) { return fn(v1, v2);} 1450 1451THUNK_NATIVE_F(acos) 1452THUNK_NATIVE_F(acosh) 1453THUNK_NATIVE_F(acospi) 1454THUNK_NATIVE_F(asin) 1455THUNK_NATIVE_F(asinh) 1456THUNK_NATIVE_F(asinpi) 1457THUNK_NATIVE_F(atan) 1458THUNK_NATIVE_F_F(atan2) 1459THUNK_NATIVE_F(atanh) 1460THUNK_NATIVE_F(atanpi) 1461THUNK_NATIVE_F_F(atan2pi) 1462THUNK_NATIVE_F(cbrt) 1463THUNK_NATIVE_F(cos) 1464THUNK_NATIVE_F(cosh) 1465THUNK_NATIVE_F(cospi) 1466THUNK_NATIVE_F(expm1) 1467THUNK_NATIVE_F_F(hypot) 1468THUNK_NATIVE_F(log1p) 1469THUNK_NATIVE_F_I(rootn) 1470THUNK_NATIVE_F(rsqrt) 1471THUNK_NATIVE_F(sqrt) 1472THUNK_NATIVE_F(sin) 1473THUNK_NATIVE_F_FP(sincos) 1474THUNK_NATIVE_F(sinh) 1475THUNK_NATIVE_F(sinpi) 1476THUNK_NATIVE_F(tan) 1477THUNK_NATIVE_F(tanh) 1478THUNK_NATIVE_F(tanpi) 1479 1480#undef THUNK_NATIVE_F 1481#undef THUNK_NATIVE_F_F 1482#undef THUNK_NATIVE_F_I 1483#undef THUNK_NATIVE_F_FP 1484 1485float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);} 1486float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);} 1487float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);} 1488float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);} 1489 1490float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);} 1491float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);} 1492float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);} 1493float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);} 1494 1495float __attribute__((overloadable)) native_length(float v) { return fast_length(v);} 1496float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);} 1497float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);} 1498float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);} 1499 1500float __attribute__((overloadable)) native_divide(float v1, float v2) { return v1 / v2;} 1501float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;} 1502float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;} 1503float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;} 1504 1505float __attribute__((overloadable)) native_recip(float v) { return 1.f / v;} 1506float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;} 1507float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;} 1508float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;} 1509 1510 1511 1512 1513 1514#undef FN_FUNC_FN 1515#undef IN_FUNC_FN 1516#undef FN_FUNC_FN_FN 1517#undef FN_FUNC_FN_F 1518#undef FN_FUNC_FN_IN 1519#undef FN_FUNC_FN_I 1520#undef FN_FUNC_FN_PFN 1521#undef FN_FUNC_FN_PIN 1522#undef FN_FUNC_FN_FN_FN 1523#undef FN_FUNC_FN_FN_PIN 1524#undef XN_FUNC_YN 1525#undef UIN_FUNC_IN 1526#undef IN_FUNC_IN 1527#undef XN_FUNC_XN_XN_BODY 1528#undef IN_FUNC_IN_IN_BODY 1529 1530typedef union { 1531 half hval; 1532 short sval; 1533} fp16_shape_type; 1534 1535/* half h = unsigned short s; */ 1536#define SET_HALF_WORD(h, s) \ 1537do { \ 1538 fp16_shape_type fp16_u; \ 1539 fp16_u.sval = (s); \ 1540 (h) = fp16_u.hval; \ 1541} while (0) 1542 1543static const unsigned short kHalfPositiveInfinity = 0x7c00; 1544 1545/* Define f16 functions of the form 1546 * HN output = fn(HN input) 1547 * where HN is scalar or vector half type 1548 */ 1549#define HN_FUNC_HN(fn) \ 1550extern half __attribute__((overloadable)) fn(half h) { \ 1551 return (half) fn((float) h); \ 1552} \ 1553extern half2 __attribute__((overloadable)) fn(half2 v) { \ 1554 return convert_half2(fn(convert_float2(v))); \ 1555} \ 1556extern half3 __attribute__((overloadable)) fn(half3 v) { \ 1557 return convert_half3(fn(convert_float3(v))); \ 1558} \ 1559extern half4 __attribute__((overloadable)) fn(half4 v) { \ 1560 return convert_half4(fn(convert_float4(v))); \ 1561} 1562 1563/* Define f16 functions of the form 1564 * HN output = fn(HN input1, HN input2) 1565 * where HN is scalar or vector half type 1566 */ 1567#define HN_FUNC_HN_HN(fn) \ 1568extern half __attribute__((overloadable)) fn(half h1, half h2) { \ 1569 return (half) fn((float) h1, (float) h2); \ 1570} \ 1571extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1572 return convert_half2(fn(convert_float2(v1), \ 1573 convert_float2(v2))); \ 1574} \ 1575extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1576 return convert_half3(fn(convert_float3(v1), \ 1577 convert_float3(v2))); \ 1578} \ 1579extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1580 return convert_half4(fn(convert_float4(v1), \ 1581 convert_float4(v2))); \ 1582} 1583 1584/* Define f16 functions of the form 1585 * HN output = fn(HN input1, half input2) 1586 * where HN is scalar or vector half type 1587 */ 1588#define HN_FUNC_HN_H(fn) \ 1589extern half2 __attribute__((overloadable)) fn(half2 v1, half v2) { \ 1590 return convert_half2(fn(convert_float2(v1), (float) v2)); \ 1591} \ 1592extern half3 __attribute__((overloadable)) fn(half3 v1, half v2) { \ 1593 return convert_half3(fn(convert_float3(v1), (float) v2)); \ 1594} \ 1595extern half4 __attribute__((overloadable)) fn(half4 v1, half v2) { \ 1596 return convert_half4(fn(convert_float4(v1), (float) v2)); \ 1597} 1598 1599/* Define f16 functions of the form 1600 * HN output = fn(HN input1, HN input2, HN input3) 1601 * where HN is scalar or vector half type 1602 */ 1603#define HN_FUNC_HN_HN_HN(fn) \ 1604extern half __attribute__((overloadable)) fn(half h1, half h2, half h3) { \ 1605 return (half) fn((float) h1, (float) h2, (float) h3); \ 1606} \ 1607extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2, half2 v3) { \ 1608 return convert_half2(fn(convert_float2(v1), \ 1609 convert_float2(v2), \ 1610 convert_float2(v3))); \ 1611} \ 1612extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2, half3 v3) { \ 1613 return convert_half3(fn(convert_float3(v1), \ 1614 convert_float3(v2), \ 1615 convert_float3(v3))); \ 1616} \ 1617extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2, half4 v3) { \ 1618 return convert_half4(fn(convert_float4(v1), \ 1619 convert_float4(v2), \ 1620 convert_float4(v3))); \ 1621} 1622 1623/* Define f16 functions of the form 1624 * HN output = fn(HN input1, IN input2) 1625 * where HN is scalar or vector half type and IN the equivalent integer type 1626 * of same vector length. 1627 */ 1628#define HN_FUNC_HN_IN(fn) \ 1629extern half __attribute__((overloadable)) fn(half h1, int v) { \ 1630 return (half) fn((float) h1, v); \ 1631} \ 1632extern half2 __attribute__((overloadable)) fn(half2 v1, int2 v2) { \ 1633 return convert_half2(fn(convert_float2(v1), v2)); \ 1634} \ 1635extern half3 __attribute__((overloadable)) fn(half3 v1, int3 v2) { \ 1636 return convert_half3(fn(convert_float3(v1), v2)); \ 1637} \ 1638extern half4 __attribute__((overloadable)) fn(half4 v1, int4 v2) { \ 1639 return convert_half4(fn(convert_float4(v1), v2)); \ 1640} 1641 1642/* Define f16 functions of the form 1643 * half output = fn(HN input1) 1644 * where HN is a scalar or vector half type. 1645 */ 1646#define H_FUNC_HN(fn) \ 1647extern half __attribute__((overloadable)) fn(half h) { \ 1648 return (half) fn((float) h); \ 1649} \ 1650extern half __attribute__((overloadable)) fn(half2 v) { \ 1651 return fn(convert_float2(v)); \ 1652} \ 1653extern half __attribute__((overloadable)) fn(half3 v) { \ 1654 return fn(convert_float3(v)); \ 1655} \ 1656extern half __attribute__((overloadable)) fn(half4 v) { \ 1657 return fn(convert_float4(v)); \ 1658} 1659 1660/* Define f16 functions of the form 1661 * half output = fn(HN input1, HN input2) 1662 * where HN is a scalar or vector half type. 1663 */ 1664#define H_FUNC_HN_HN(fn) \ 1665extern half __attribute__((overloadable)) fn(half h1, half h2) { \ 1666 return (half) fn((float) h1, (float) h2); \ 1667} \ 1668extern half __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1669 return fn(convert_float2(v1), convert_float2(v2)); \ 1670} \ 1671extern half __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1672 return fn(convert_float3(v1), convert_float3(v2)); \ 1673} \ 1674extern half __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1675 return fn(convert_float4(v1), convert_float4(v2)); \ 1676} 1677 1678/* Define f16 functions of the form 1679 * HN output = fn(HN input1, HN input2) 1680 * where HN is a vector half type. The functions are defined to call the 1681 * scalar function of the same name. 1682 */ 1683#define SCALARIZE_HN_FUNC_HN_HN(fn) \ 1684extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1685 half2 ret; \ 1686 ret.x = fn(v1.x, v2.x); \ 1687 ret.y = fn(v1.y, v2.y); \ 1688 return ret; \ 1689} \ 1690extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1691 half3 ret; \ 1692 ret.x = fn(v1.x, v2.x); \ 1693 ret.y = fn(v1.y, v2.y); \ 1694 ret.z = fn(v1.z, v2.z); \ 1695 return ret; \ 1696} \ 1697extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1698 half4 ret; \ 1699 ret.x = fn(v1.x, v2.x); \ 1700 ret.y = fn(v1.y, v2.y); \ 1701 ret.z = fn(v1.z, v2.z); \ 1702 ret.w = fn(v1.w, v2.w); \ 1703 return ret; \ 1704} \ 1705 1706HN_FUNC_HN(acos); 1707HN_FUNC_HN(acosh); 1708HN_FUNC_HN(acospi); 1709HN_FUNC_HN(asin); 1710HN_FUNC_HN(asinh); 1711HN_FUNC_HN(asinpi); 1712HN_FUNC_HN(atan); 1713HN_FUNC_HN(atanh); 1714HN_FUNC_HN(atanpi); 1715HN_FUNC_HN_HN(atan2); 1716HN_FUNC_HN_HN(atan2pi); 1717 1718HN_FUNC_HN(cbrt); 1719HN_FUNC_HN(ceil); 1720 1721// TODO Add copysign 1722 1723HN_FUNC_HN(cos); 1724HN_FUNC_HN(cosh); 1725HN_FUNC_HN(cospi); 1726 1727extern half3 __attribute__((overloadable)) cross(half3 lhs, half3 rhs) { 1728 half3 r; 1729 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1730 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1731 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1732 return r; 1733} 1734 1735extern half4 __attribute__((overloadable)) cross(half4 lhs, half4 rhs) { 1736 half4 r; 1737 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1738 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1739 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1740 r.w = 0.f; 1741 return r; 1742} 1743 1744HN_FUNC_HN(degrees); 1745H_FUNC_HN_HN(distance); 1746H_FUNC_HN_HN(dot); 1747 1748HN_FUNC_HN(erf); 1749HN_FUNC_HN(erfc); 1750HN_FUNC_HN(exp); 1751HN_FUNC_HN(exp10); 1752HN_FUNC_HN(exp2); 1753HN_FUNC_HN(expm1); 1754 1755HN_FUNC_HN(fabs); 1756HN_FUNC_HN_HN(fdim); 1757HN_FUNC_HN(floor); 1758HN_FUNC_HN_HN_HN(fma); 1759HN_FUNC_HN_HN(fmax); 1760HN_FUNC_HN_H(fmax); 1761HN_FUNC_HN_HN(fmin); 1762HN_FUNC_HN_H(fmin); 1763HN_FUNC_HN_HN(fmod); 1764 1765// TODO Add (both variants) of fract 1766// TODO Add frexp 1767 1768HN_FUNC_HN_HN(hypot); 1769 1770// TODO Add ilogb 1771 1772HN_FUNC_HN_IN(ldexp); 1773extern half2 __attribute__((overloadable)) ldexp(half2 v, int exponent) { 1774 return convert_half2(ldexp(convert_float2(v), exponent)); 1775} 1776extern half3 __attribute__((overloadable)) ldexp(half3 v, int exponent) { 1777 return convert_half3(ldexp(convert_float3(v), exponent)); 1778} 1779extern half4 __attribute__((overloadable)) ldexp(half4 v, int exponent) { 1780 return convert_half4(ldexp(convert_float4(v), exponent)); 1781} 1782 1783H_FUNC_HN(length); 1784HN_FUNC_HN(lgamma); 1785 1786extern half __attribute__((overloadable)) lgamma(half h, int *signp) { 1787 return (half) lgamma((float) h, signp); 1788} 1789extern half2 __attribute__((overloadable)) lgamma(half2 v, int2 *signp) { 1790 return convert_half2(lgamma(convert_float2(v), signp)); 1791} 1792extern half3 __attribute__((overloadable)) lgamma(half3 v, int3 *signp) { 1793 return convert_half3(lgamma(convert_float3(v), signp)); 1794} 1795extern half4 __attribute__((overloadable)) lgamma(half4 v, int4 *signp) { 1796 return convert_half4(lgamma(convert_float4(v), signp)); 1797} 1798 1799HN_FUNC_HN(log); 1800HN_FUNC_HN(log10); 1801HN_FUNC_HN(log1p); 1802HN_FUNC_HN(log2); 1803HN_FUNC_HN(logb); 1804 1805HN_FUNC_HN_HN_HN(mad); 1806HN_FUNC_HN_HN(max); 1807HN_FUNC_HN_H(max); // TODO can this be arch-specific similar to _Z3maxDv2_ff? 1808HN_FUNC_HN_HN(min); 1809HN_FUNC_HN_H(min); // TODO can this be arch-specific similar to _Z3minDv2_ff? 1810 1811extern half __attribute__((overloadable)) mix(half start, half stop, half amount) { 1812 return start + (stop - start) * amount; 1813} 1814extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half2 amount) { 1815 return start + (stop - start) * amount; 1816} 1817extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half3 amount) { 1818 return start + (stop - start) * amount; 1819} 1820extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half4 amount) { 1821 return start + (stop - start) * amount; 1822} 1823extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half amount) { 1824 return start + (stop - start) * amount; 1825} 1826extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half amount) { 1827 return start + (stop - start) * amount; 1828} 1829extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half amount) { 1830 return start + (stop - start) * amount; 1831} 1832 1833// TODO Define modf. Does it make sense to delegate to the float? 1834 1835half __attribute__((overloadable)) nan_half() { 1836 unsigned short nan_short = kHalfPositiveInfinity | 0x0200; 1837 half nan; 1838 SET_HALF_WORD(nan, nan_short); 1839 return nan; 1840} 1841 1842// TODO Add nextafter 1843 1844HN_FUNC_HN(normalize); 1845 1846HN_FUNC_HN_HN(pow); 1847HN_FUNC_HN_IN(pown); 1848HN_FUNC_HN_HN(powr); 1849HN_FUNC_HN(radians); 1850HN_FUNC_HN_HN(remainder); 1851 1852extern half __attribute__((overloadable)) remquo(half n, half d, int *quo) { 1853 return (float) remquo((float) n, (float) d, quo); 1854} 1855extern half2 __attribute__((overloadable)) remquo(half2 n, half2 d, int2 *quo) { 1856 return convert_half2(remquo(convert_float2(d), convert_float2(n), quo)); 1857} 1858extern half3 __attribute__((overloadable)) remquo(half3 n, half3 d, int3 *quo) { 1859 return convert_half3(remquo(convert_float3(d), convert_float3(n), quo)); 1860} 1861extern half4 __attribute__((overloadable)) remquo(half4 n, half4 d, int4 *quo) { 1862 return convert_half4(remquo(convert_float4(d), convert_float4(n), quo)); 1863} 1864 1865HN_FUNC_HN(rint); 1866HN_FUNC_HN_IN(rootn); 1867HN_FUNC_HN(round); 1868HN_FUNC_HN(rsqrt); 1869 1870extern half __attribute__((overloadable)) sign(half h) { 1871 if (h > 0) return (half) 1.f; 1872 if (h < 0) return (half) -1.f; 1873 return h; 1874} 1875extern half2 __attribute__((overloadable)) sign(half2 v) { 1876 half2 ret; 1877 ret.x = sign(v.x); 1878 ret.y = sign(v.y); 1879 return ret; 1880} 1881extern half3 __attribute__((overloadable)) sign(half3 v) { 1882 half3 ret; 1883 ret.x = sign(v.x); 1884 ret.y = sign(v.y); 1885 ret.z = sign(v.z); 1886 return ret; 1887} 1888extern half4 __attribute__((overloadable)) sign(half4 v) { 1889 half4 ret; 1890 ret.x = sign(v.x); 1891 ret.y = sign(v.y); 1892 ret.z = sign(v.z); 1893 ret.w = sign(v.w); 1894 return ret; 1895} 1896 1897HN_FUNC_HN(sin); 1898 1899extern half __attribute__((overloadable)) sincos(half v, half *cosptr) { 1900 *cosptr = cos(v); 1901 return sin(v); 1902} 1903// TODO verify if LLVM eliminates the duplicate convert_float2 1904extern half2 __attribute__((overloadable)) sincos(half2 v, half2 *cosptr) { 1905 *cosptr = cos(v); 1906 return sin(v); 1907} 1908extern half3 __attribute__((overloadable)) sincos(half3 v, half3 *cosptr) { 1909 *cosptr = cos(v); 1910 return sin(v); 1911} 1912extern half4 __attribute__((overloadable)) sincos(half4 v, half4 *cosptr) { 1913 *cosptr = cos(v); 1914 return sin(v); 1915} 1916 1917HN_FUNC_HN(sinh); 1918HN_FUNC_HN(sinpi); 1919HN_FUNC_HN(sqrt); 1920 1921extern half __attribute__((overloadable)) step(half edge, half v) { 1922 return (v < edge) ? 0.f : 1.f; 1923} 1924extern half2 __attribute__((overloadable)) step(half2 edge, half2 v) { 1925 half2 r; 1926 r.x = (v.x < edge.x) ? 0.f : 1.f; 1927 r.y = (v.y < edge.y) ? 0.f : 1.f; 1928 return r; 1929} 1930extern half3 __attribute__((overloadable)) step(half3 edge, half3 v) { 1931 half3 r; 1932 r.x = (v.x < edge.x) ? 0.f : 1.f; 1933 r.y = (v.y < edge.y) ? 0.f : 1.f; 1934 r.z = (v.z < edge.z) ? 0.f : 1.f; 1935 return r; 1936} 1937extern half4 __attribute__((overloadable)) step(half4 edge, half4 v) { 1938 half4 r; 1939 r.x = (v.x < edge.x) ? 0.f : 1.f; 1940 r.y = (v.y < edge.y) ? 0.f : 1.f; 1941 r.z = (v.z < edge.z) ? 0.f : 1.f; 1942 r.w = (v.w < edge.w) ? 0.f : 1.f; 1943 return r; 1944} 1945extern half2 __attribute__((overloadable)) step(half2 edge, half v) { 1946 half2 r; 1947 r.x = (v < edge.x) ? 0.f : 1.f; 1948 r.y = (v < edge.y) ? 0.f : 1.f; 1949 return r; 1950} 1951extern half3 __attribute__((overloadable)) step(half3 edge, half v) { 1952 half3 r; 1953 r.x = (v < edge.x) ? 0.f : 1.f; 1954 r.y = (v < edge.y) ? 0.f : 1.f; 1955 r.z = (v < edge.z) ? 0.f : 1.f; 1956 return r; 1957} 1958extern half4 __attribute__((overloadable)) step(half4 edge, half v) { 1959 half4 r; 1960 r.x = (v < edge.x) ? 0.f : 1.f; 1961 r.y = (v < edge.y) ? 0.f : 1.f; 1962 r.z = (v < edge.z) ? 0.f : 1.f; 1963 r.w = (v < edge.w) ? 0.f : 1.f; 1964 return r; 1965} 1966extern half2 __attribute__((overloadable)) step(half edge, half2 v) { 1967 half2 r; 1968 r.x = (v.x < edge) ? 0.f : 1.f; 1969 r.y = (v.y < edge) ? 0.f : 1.f; 1970 return r; 1971} 1972extern half3 __attribute__((overloadable)) step(half edge, half3 v) { 1973 half3 r; 1974 r.x = (v.x < edge) ? 0.f : 1.f; 1975 r.y = (v.y < edge) ? 0.f : 1.f; 1976 r.z = (v.z < edge) ? 0.f : 1.f; 1977 return r; 1978} 1979extern half4 __attribute__((overloadable)) step(half edge, half4 v) { 1980 half4 r; 1981 r.x = (v.x < edge) ? 0.f : 1.f; 1982 r.y = (v.y < edge) ? 0.f : 1.f; 1983 r.z = (v.z < edge) ? 0.f : 1.f; 1984 r.w = (v.w < edge) ? 0.f : 1.f; 1985 return r; 1986} 1987 1988HN_FUNC_HN(tan); 1989HN_FUNC_HN(tanh); 1990HN_FUNC_HN(tanpi); 1991HN_FUNC_HN(tgamma); 1992HN_FUNC_HN(trunc); // TODO: rethink: needs half-specific implementation? 1993 1994HN_FUNC_HN(native_acos); 1995HN_FUNC_HN(native_acosh); 1996HN_FUNC_HN(native_acospi); 1997HN_FUNC_HN(native_asin); 1998HN_FUNC_HN(native_asinh); 1999HN_FUNC_HN(native_asinpi); 2000HN_FUNC_HN(native_atan); 2001HN_FUNC_HN(native_atanh); 2002HN_FUNC_HN(native_atanpi); 2003HN_FUNC_HN_HN(native_atan2); 2004HN_FUNC_HN_HN(native_atan2pi); 2005 2006HN_FUNC_HN(native_cbrt); 2007HN_FUNC_HN(native_cos); 2008HN_FUNC_HN(native_cosh); 2009HN_FUNC_HN(native_cospi); 2010 2011H_FUNC_HN_HN(native_distance); 2012HN_FUNC_HN_HN(native_divide); 2013 2014HN_FUNC_HN(native_exp); 2015HN_FUNC_HN(native_exp10); 2016HN_FUNC_HN(native_exp2); 2017HN_FUNC_HN(native_expm1); 2018 2019HN_FUNC_HN_HN(native_hypot); 2020H_FUNC_HN(native_length); 2021 2022HN_FUNC_HN(native_log); 2023HN_FUNC_HN(native_log10); 2024HN_FUNC_HN(native_log1p); 2025HN_FUNC_HN(native_log2); 2026 2027HN_FUNC_HN(native_normalize); 2028 2029HN_FUNC_HN_HN(native_powr); // TODO are parameter limits different for half? 2030 2031HN_FUNC_HN(native_recip); 2032HN_FUNC_HN_IN(native_rootn); 2033HN_FUNC_HN(native_rsqrt); 2034 2035HN_FUNC_HN(native_sin); 2036 2037extern half __attribute__((overloadable)) native_sincos(half v, half *cosptr) { 2038 return sincos(v, cosptr); 2039} 2040extern half2 __attribute__((overloadable)) native_sincos(half2 v, half2 *cosptr) { 2041 return sincos(v, cosptr); 2042} 2043extern half3 __attribute__((overloadable)) native_sincos(half3 v, half3 *cosptr) { 2044 return sincos(v, cosptr); 2045} 2046extern half4 __attribute__((overloadable)) native_sincos(half4 v, half4 *cosptr) { 2047 return sincos(v, cosptr); 2048} 2049 2050HN_FUNC_HN(native_sinh); 2051HN_FUNC_HN(native_sinpi); 2052HN_FUNC_HN(native_sqrt); 2053 2054HN_FUNC_HN(native_tan); 2055HN_FUNC_HN(native_tanh); 2056HN_FUNC_HN(native_tanpi); 2057 2058#undef HN_FUNC_HN 2059#undef HN_FUNC_HN_HN 2060#undef HN_FUNC_HN_H 2061#undef HN_FUNC_HN_HN_HN 2062#undef HN_FUNC_HN_IN 2063#undef H_FUNC_HN 2064#undef H_FUNC_HN_HN 2065#undef SCALARIZE_HN_FUNC_HN_HN 2066 2067