1#include "rs_core.rsh" 2#include "rs_f16_util.h" 3 4extern float2 __attribute__((overloadable)) convert_float2(int2 c); 5extern float3 __attribute__((overloadable)) convert_float3(int3 c); 6extern float4 __attribute__((overloadable)) convert_float4(int4 c); 7 8extern int2 __attribute__((overloadable)) convert_int2(float2 c); 9extern int3 __attribute__((overloadable)) convert_int3(float3 c); 10extern int4 __attribute__((overloadable)) convert_int4(float4 c); 11 12 13extern float __attribute__((overloadable)) fmin(float v, float v2); 14extern float2 __attribute__((overloadable)) fmin(float2 v, float v2); 15extern float3 __attribute__((overloadable)) fmin(float3 v, float v2); 16extern float4 __attribute__((overloadable)) fmin(float4 v, float v2); 17 18extern float __attribute__((overloadable)) fmax(float v, float v2); 19extern float2 __attribute__((overloadable)) fmax(float2 v, float v2); 20extern float3 __attribute__((overloadable)) fmax(float3 v, float v2); 21extern float4 __attribute__((overloadable)) fmax(float4 v, float v2); 22 23// Float ops, 6.11.2 24 25#define FN_FUNC_FN(fnc) \ 26extern float2 __attribute__((overloadable)) fnc(float2 v) { \ 27 float2 r; \ 28 r.x = fnc(v.x); \ 29 r.y = fnc(v.y); \ 30 return r; \ 31} \ 32extern float3 __attribute__((overloadable)) fnc(float3 v) { \ 33 float3 r; \ 34 r.x = fnc(v.x); \ 35 r.y = fnc(v.y); \ 36 r.z = fnc(v.z); \ 37 return r; \ 38} \ 39extern float4 __attribute__((overloadable)) fnc(float4 v) { \ 40 float4 r; \ 41 r.x = fnc(v.x); \ 42 r.y = fnc(v.y); \ 43 r.z = fnc(v.z); \ 44 r.w = fnc(v.w); \ 45 return r; \ 46} 47 48#define IN_FUNC_FN(fnc) \ 49extern int2 __attribute__((overloadable)) fnc(float2 v) { \ 50 int2 r; \ 51 r.x = fnc(v.x); \ 52 r.y = fnc(v.y); \ 53 return r; \ 54} \ 55extern int3 __attribute__((overloadable)) fnc(float3 v) { \ 56 int3 r; \ 57 r.x = fnc(v.x); \ 58 r.y = fnc(v.y); \ 59 r.z = fnc(v.z); \ 60 return r; \ 61} \ 62extern int4 __attribute__((overloadable)) fnc(float4 v) { \ 63 int4 r; \ 64 r.x = fnc(v.x); \ 65 r.y = fnc(v.y); \ 66 r.z = fnc(v.z); \ 67 r.w = fnc(v.w); \ 68 return r; \ 69} 70 71#define FN_FUNC_FN_FN(fnc) \ 72extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \ 73 float2 r; \ 74 r.x = fnc(v1.x, v2.x); \ 75 r.y = fnc(v1.y, v2.y); \ 76 return r; \ 77} \ 78extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \ 79 float3 r; \ 80 r.x = fnc(v1.x, v2.x); \ 81 r.y = fnc(v1.y, v2.y); \ 82 r.z = fnc(v1.z, v2.z); \ 83 return r; \ 84} \ 85extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \ 86 float4 r; \ 87 r.x = fnc(v1.x, v2.x); \ 88 r.y = fnc(v1.y, v2.y); \ 89 r.z = fnc(v1.z, v2.z); \ 90 r.w = fnc(v1.w, v2.w); \ 91 return r; \ 92} 93 94#define FN_FUNC_FN_F(fnc) \ 95extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) { \ 96 float2 r; \ 97 r.x = fnc(v1.x, v2); \ 98 r.y = fnc(v1.y, v2); \ 99 return r; \ 100} \ 101extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) { \ 102 float3 r; \ 103 r.x = fnc(v1.x, v2); \ 104 r.y = fnc(v1.y, v2); \ 105 r.z = fnc(v1.z, v2); \ 106 return r; \ 107} \ 108extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) { \ 109 float4 r; \ 110 r.x = fnc(v1.x, v2); \ 111 r.y = fnc(v1.y, v2); \ 112 r.z = fnc(v1.z, v2); \ 113 r.w = fnc(v1.w, v2); \ 114 return r; \ 115} 116 117#define FN_FUNC_FN_IN(fnc) \ 118extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) { \ 119 float2 r; \ 120 r.x = fnc(v1.x, v2.x); \ 121 r.y = fnc(v1.y, v2.y); \ 122 return r; \ 123} \ 124extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) { \ 125 float3 r; \ 126 r.x = fnc(v1.x, v2.x); \ 127 r.y = fnc(v1.y, v2.y); \ 128 r.z = fnc(v1.z, v2.z); \ 129 return r; \ 130} \ 131extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) { \ 132 float4 r; \ 133 r.x = fnc(v1.x, v2.x); \ 134 r.y = fnc(v1.y, v2.y); \ 135 r.z = fnc(v1.z, v2.z); \ 136 r.w = fnc(v1.w, v2.w); \ 137 return r; \ 138} 139 140#define FN_FUNC_FN_I(fnc) \ 141extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) { \ 142 float2 r; \ 143 r.x = fnc(v1.x, v2); \ 144 r.y = fnc(v1.y, v2); \ 145 return r; \ 146} \ 147extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) { \ 148 float3 r; \ 149 r.x = fnc(v1.x, v2); \ 150 r.y = fnc(v1.y, v2); \ 151 r.z = fnc(v1.z, v2); \ 152 return r; \ 153} \ 154extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) { \ 155 float4 r; \ 156 r.x = fnc(v1.x, v2); \ 157 r.y = fnc(v1.y, v2); \ 158 r.z = fnc(v1.z, v2); \ 159 r.w = fnc(v1.w, v2); \ 160 return r; \ 161} 162 163#define FN_FUNC_FN_PFN(fnc) \ 164extern float2 __attribute__((overloadable)) \ 165 fnc(float2 v1, float2 *v2) { \ 166 float2 r; \ 167 float t[2]; \ 168 r.x = fnc(v1.x, &t[0]); \ 169 r.y = fnc(v1.y, &t[1]); \ 170 v2->x = t[0]; \ 171 v2->y = t[1]; \ 172 return r; \ 173} \ 174extern float3 __attribute__((overloadable)) \ 175 fnc(float3 v1, float3 *v2) { \ 176 float3 r; \ 177 float t[3]; \ 178 r.x = fnc(v1.x, &t[0]); \ 179 r.y = fnc(v1.y, &t[1]); \ 180 r.z = fnc(v1.z, &t[2]); \ 181 v2->x = t[0]; \ 182 v2->y = t[1]; \ 183 v2->z = t[2]; \ 184 return r; \ 185} \ 186extern float4 __attribute__((overloadable)) \ 187 fnc(float4 v1, float4 *v2) { \ 188 float4 r; \ 189 float t[4]; \ 190 r.x = fnc(v1.x, &t[0]); \ 191 r.y = fnc(v1.y, &t[1]); \ 192 r.z = fnc(v1.z, &t[2]); \ 193 r.w = fnc(v1.w, &t[3]); \ 194 v2->x = t[0]; \ 195 v2->y = t[1]; \ 196 v2->z = t[2]; \ 197 v2->w = t[3]; \ 198 return r; \ 199} 200 201#define FN_FUNC_FN_PIN(fnc) \ 202extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) { \ 203 float2 r; \ 204 int t[2]; \ 205 r.x = fnc(v1.x, &t[0]); \ 206 r.y = fnc(v1.y, &t[1]); \ 207 v2->x = t[0]; \ 208 v2->y = t[1]; \ 209 return r; \ 210} \ 211extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) { \ 212 float3 r; \ 213 int t[3]; \ 214 r.x = fnc(v1.x, &t[0]); \ 215 r.y = fnc(v1.y, &t[1]); \ 216 r.z = fnc(v1.z, &t[2]); \ 217 v2->x = t[0]; \ 218 v2->y = t[1]; \ 219 v2->z = t[2]; \ 220 return r; \ 221} \ 222extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) { \ 223 float4 r; \ 224 int t[4]; \ 225 r.x = fnc(v1.x, &t[0]); \ 226 r.y = fnc(v1.y, &t[1]); \ 227 r.z = fnc(v1.z, &t[2]); \ 228 r.w = fnc(v1.w, &t[3]); \ 229 v2->x = t[0]; \ 230 v2->y = t[1]; \ 231 v2->z = t[2]; \ 232 v2->w = t[3]; \ 233 return r; \ 234} 235 236#define FN_FUNC_FN_FN_FN(fnc) \ 237extern float2 __attribute__((overloadable)) \ 238 fnc(float2 v1, float2 v2, float2 v3) { \ 239 float2 r; \ 240 r.x = fnc(v1.x, v2.x, v3.x); \ 241 r.y = fnc(v1.y, v2.y, v3.y); \ 242 return r; \ 243} \ 244extern float3 __attribute__((overloadable)) \ 245 fnc(float3 v1, float3 v2, float3 v3) { \ 246 float3 r; \ 247 r.x = fnc(v1.x, v2.x, v3.x); \ 248 r.y = fnc(v1.y, v2.y, v3.y); \ 249 r.z = fnc(v1.z, v2.z, v3.z); \ 250 return r; \ 251} \ 252extern float4 __attribute__((overloadable)) \ 253 fnc(float4 v1, float4 v2, float4 v3) { \ 254 float4 r; \ 255 r.x = fnc(v1.x, v2.x, v3.x); \ 256 r.y = fnc(v1.y, v2.y, v3.y); \ 257 r.z = fnc(v1.z, v2.z, v3.z); \ 258 r.w = fnc(v1.w, v2.w, v3.w); \ 259 return r; \ 260} 261 262#define FN_FUNC_FN_FN_PIN(fnc) \ 263extern float2 __attribute__((overloadable)) \ 264 fnc(float2 v1, float2 v2, int2 *v3) { \ 265 float2 r; \ 266 int t[2]; \ 267 r.x = fnc(v1.x, v2.x, &t[0]); \ 268 r.y = fnc(v1.y, v2.y, &t[1]); \ 269 v3->x = t[0]; \ 270 v3->y = t[1]; \ 271 return r; \ 272} \ 273extern float3 __attribute__((overloadable)) \ 274 fnc(float3 v1, float3 v2, int3 *v3) { \ 275 float3 r; \ 276 int t[3]; \ 277 r.x = fnc(v1.x, v2.x, &t[0]); \ 278 r.y = fnc(v1.y, v2.y, &t[1]); \ 279 r.z = fnc(v1.z, v2.z, &t[2]); \ 280 v3->x = t[0]; \ 281 v3->y = t[1]; \ 282 v3->z = t[2]; \ 283 return r; \ 284} \ 285extern float4 __attribute__((overloadable)) \ 286 fnc(float4 v1, float4 v2, int4 *v3) { \ 287 float4 r; \ 288 int t[4]; \ 289 r.x = fnc(v1.x, v2.x, &t[0]); \ 290 r.y = fnc(v1.y, v2.y, &t[1]); \ 291 r.z = fnc(v1.z, v2.z, &t[2]); \ 292 r.w = fnc(v1.w, v2.w, &t[3]); \ 293 v3->x = t[0]; \ 294 v3->y = t[1]; \ 295 v3->z = t[2]; \ 296 v3->w = t[3]; \ 297 return r; \ 298} 299 300static const unsigned int iposinf = 0x7f800000; 301static const unsigned int ineginf = 0xff800000; 302 303static float posinf() { 304 float f = *((float*)&iposinf); 305 return f; 306} 307 308static unsigned int float_bits(float f) { 309 /* TODO(jeanluc) Use this better approach once the Mac(SDK) build issues are fixed. 310 // Get the bits while following the strict aliasing rules. 311 unsigned int result; 312 memcpy(&result, &f, sizeof(f)); 313 return result; 314 */ 315 return *(unsigned int*)(char*)(&f); 316} 317 318static bool isinf(float f) { 319 unsigned int i = float_bits(f); 320 return (i == iposinf) || (i == ineginf); 321} 322 323static bool isnan(float f) { 324 unsigned int i = float_bits(f); 325 return (((i & 0x7f800000) == 0x7f800000) && (i & 0x007fffff)); 326} 327 328static bool isposzero(float f) { 329 return (float_bits(f) == 0x00000000); 330} 331 332static bool isnegzero(float f) { 333 return (float_bits(f) == 0x80000000); 334} 335 336static bool iszero(float f) { 337 return isposzero(f) || isnegzero(f); 338} 339 340 341extern float __attribute__((overloadable)) SC_acosf(float); 342float __attribute__((overloadable)) acos(float v) { 343 return SC_acosf(v); 344} 345FN_FUNC_FN(acos) 346 347extern float __attribute__((overloadable)) SC_acoshf(float); 348float __attribute__((overloadable)) acosh(float v) { 349 return SC_acoshf(v); 350} 351FN_FUNC_FN(acosh) 352 353 354extern float __attribute__((overloadable)) acospi(float v) { 355 return acos(v) / M_PI; 356} 357FN_FUNC_FN(acospi) 358 359extern float __attribute__((overloadable)) SC_asinf(float); 360float __attribute__((overloadable)) asin(float v) { 361 return SC_asinf(v); 362} 363FN_FUNC_FN(asin) 364 365extern float __attribute__((overloadable)) SC_asinhf(float); 366float __attribute__((overloadable)) asinh(float v) { 367 return SC_asinhf(v); 368} 369FN_FUNC_FN(asinh) 370 371extern float __attribute__((overloadable)) asinpi(float v) { 372 return asin(v) / M_PI; 373} 374FN_FUNC_FN(asinpi) 375 376extern float __attribute__((overloadable)) SC_atanf(float); 377float __attribute__((overloadable)) atan(float v) { 378 return SC_atanf(v); 379} 380FN_FUNC_FN(atan) 381 382extern float __attribute__((overloadable)) SC_atan2f(float, float); 383float __attribute__((overloadable)) atan2(float v1, float v2) { 384 return SC_atan2f(v1, v2); 385} 386FN_FUNC_FN_FN(atan2) 387 388extern float __attribute__((overloadable)) SC_atanhf(float); 389float __attribute__((overloadable)) atanh(float v) { 390 return SC_atanhf(v); 391} 392FN_FUNC_FN(atanh) 393 394extern float __attribute__((overloadable)) atanpi(float v) { 395 return atan(v) / M_PI; 396} 397FN_FUNC_FN(atanpi) 398 399 400extern float __attribute__((overloadable)) atan2pi(float y, float x) { 401 return atan2(y, x) / M_PI; 402} 403FN_FUNC_FN_FN(atan2pi) 404 405extern float __attribute__((overloadable)) SC_cbrtf(float); 406float __attribute__((overloadable)) cbrt(float v) { 407 return SC_cbrtf(v); 408} 409FN_FUNC_FN(cbrt) 410 411extern float __attribute__((overloadable)) SC_ceilf(float); 412float __attribute__((overloadable)) ceil(float v) { 413 return SC_ceilf(v); 414} 415FN_FUNC_FN(ceil) 416 417extern float __attribute__((overloadable)) SC_copysignf(float, float); 418float __attribute__((overloadable)) copysign(float v1, float v2) { 419 return SC_copysignf(v1, v2); 420} 421FN_FUNC_FN_FN(copysign) 422 423extern float __attribute__((overloadable)) SC_cosf(float); 424float __attribute__((overloadable)) cos(float v) { 425 return SC_cosf(v); 426} 427FN_FUNC_FN(cos) 428 429extern float __attribute__((overloadable)) SC_coshf(float); 430float __attribute__((overloadable)) cosh(float v) { 431 return SC_coshf(v); 432} 433FN_FUNC_FN(cosh) 434 435extern float __attribute__((overloadable)) cospi(float v) { 436 return cos(v * M_PI); 437} 438FN_FUNC_FN(cospi) 439 440extern float __attribute__((overloadable)) SC_erfcf(float); 441float __attribute__((overloadable)) erfc(float v) { 442 return SC_erfcf(v); 443} 444FN_FUNC_FN(erfc) 445 446extern float __attribute__((overloadable)) SC_erff(float); 447float __attribute__((overloadable)) erf(float v) { 448 return SC_erff(v); 449} 450FN_FUNC_FN(erf) 451 452extern float __attribute__((overloadable)) SC_expf(float); 453float __attribute__((overloadable)) exp(float v) { 454 return SC_expf(v); 455} 456FN_FUNC_FN(exp) 457 458extern float __attribute__((overloadable)) SC_exp2f(float); 459float __attribute__((overloadable)) exp2(float v) { 460 return SC_exp2f(v); 461} 462FN_FUNC_FN(exp2) 463 464extern float __attribute__((overloadable)) pow(float, float); 465 466extern float __attribute__((overloadable)) exp10(float v) { 467 return exp2(v * 3.321928095f); 468} 469FN_FUNC_FN(exp10) 470 471extern float __attribute__((overloadable)) SC_expm1f(float); 472float __attribute__((overloadable)) expm1(float v) { 473 return SC_expm1f(v); 474} 475FN_FUNC_FN(expm1) 476 477extern float __attribute__((overloadable)) fabs(float v) { 478 int i = *((int*)(void*)&v) & 0x7fffffff; 479 return *((float*)(void*)&i); 480} 481FN_FUNC_FN(fabs) 482 483extern float __attribute__((overloadable)) SC_fdimf(float, float); 484float __attribute__((overloadable)) fdim(float v1, float v2) { 485 return SC_fdimf(v1, v2); 486} 487FN_FUNC_FN_FN(fdim) 488 489extern float __attribute__((overloadable)) SC_floorf(float); 490float __attribute__((overloadable)) floor(float v) { 491 return SC_floorf(v); 492} 493FN_FUNC_FN(floor) 494 495extern float __attribute__((overloadable)) SC_fmaf(float, float, float); 496float __attribute__((overloadable)) fma(float v1, float v2, float v3) { 497 return SC_fmaf(v1, v2, v3); 498} 499FN_FUNC_FN_FN_FN(fma) 500 501extern float __attribute__((overloadable)) SC_fminf(float, float); 502 503extern float __attribute__((overloadable)) SC_fmodf(float, float); 504float __attribute__((overloadable)) fmod(float v1, float v2) { 505 return SC_fmodf(v1, v2); 506} 507FN_FUNC_FN_FN(fmod) 508 509extern float __attribute__((overloadable)) fract(float v, float *iptr) { 510 int i = (int)floor(v); 511 if (iptr) { 512 iptr[0] = i; 513 } 514 return fmin(v - i, 0x1.fffffep-1f); 515} 516FN_FUNC_FN_PFN(fract) 517 518extern float __attribute__((const, overloadable)) fract(float v) { 519 float unused; 520 return fract(v, &unused); 521} 522FN_FUNC_FN(fract) 523 524extern float __attribute__((overloadable)) SC_frexpf(float, int *); 525float __attribute__((overloadable)) frexp(float v1, int* v2) { 526 return SC_frexpf(v1, v2); 527} 528FN_FUNC_FN_PIN(frexp) 529 530extern float __attribute__((overloadable)) SC_hypotf(float, float); 531float __attribute__((overloadable)) hypot(float v1, float v2) { 532 return SC_hypotf(v1, v2); 533} 534FN_FUNC_FN_FN(hypot) 535 536extern int __attribute__((overloadable)) SC_ilogbf(float); 537int __attribute__((overloadable)) ilogb(float v) { 538 return SC_ilogbf(v); 539} 540IN_FUNC_FN(ilogb) 541 542extern float __attribute__((overloadable)) SC_ldexpf(float, int); 543float __attribute__((overloadable)) ldexp(float v1, int v2) { 544 return SC_ldexpf(v1, v2); 545} 546FN_FUNC_FN_IN(ldexp) 547FN_FUNC_FN_I(ldexp) 548 549extern float __attribute__((overloadable)) SC_lgammaf(float); 550float __attribute__((overloadable)) lgamma(float v) { 551 return SC_lgammaf(v); 552} 553FN_FUNC_FN(lgamma) 554extern float __attribute__((overloadable)) SC_lgammaf_r(float, int*); 555float __attribute__((overloadable)) lgamma(float v, int* ptr) { 556 return SC_lgammaf_r(v, ptr); 557} 558FN_FUNC_FN_PIN(lgamma) 559 560extern float __attribute__((overloadable)) SC_logf(float); 561float __attribute__((overloadable)) log(float v) { 562 return SC_logf(v); 563} 564FN_FUNC_FN(log) 565 566extern float __attribute__((overloadable)) SC_log10f(float); 567float __attribute__((overloadable)) log10(float v) { 568 return SC_log10f(v); 569} 570FN_FUNC_FN(log10) 571 572 573extern float __attribute__((overloadable)) log2(float v) { 574 return log10(v) * 3.321928095f; 575} 576FN_FUNC_FN(log2) 577 578extern float __attribute__((overloadable)) SC_log1pf(float); 579float __attribute__((overloadable)) log1p(float v) { 580 return SC_log1pf(v); 581} 582FN_FUNC_FN(log1p) 583 584extern float __attribute__((overloadable)) SC_logbf(float); 585float __attribute__((overloadable)) logb(float v) { 586 return SC_logbf(v); 587} 588FN_FUNC_FN(logb) 589 590extern float __attribute__((overloadable)) mad(float a, float b, float c) { 591 return a * b + c; 592} 593extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) { 594 return a * b + c; 595} 596extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) { 597 return a * b + c; 598} 599extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) { 600 return a * b + c; 601} 602 603extern float __attribute__((overloadable)) SC_modff(float, float *); 604float __attribute__((overloadable)) modf(float v1, float *v2) { 605 return SC_modff(v1, v2); 606} 607FN_FUNC_FN_PFN(modf); 608 609extern float __attribute__((overloadable)) nan(uint v) { 610 float f[1]; 611 uint32_t *ip = (uint32_t *)f; 612 *ip = v | 0x7fc00000; 613 return f[0]; 614} 615 616extern float __attribute__((overloadable)) SC_nextafterf(float, float); 617float __attribute__((overloadable)) nextafter(float v1, float v2) { 618 return SC_nextafterf(v1, v2); 619} 620FN_FUNC_FN_FN(nextafter) 621 622// This function must be defined here if we're compiling with debug info 623// (libclcore_g.bc), because we need a C source to get debug information. 624// Otherwise the implementation can be found in IR. 625#if defined(RS_G_RUNTIME) 626extern float __attribute__((overloadable)) SC_powf(float, float); 627float __attribute__((overloadable)) pow(float v1, float v2) { 628 return SC_powf(v1, v2); 629} 630#endif // defined(RS_G_RUNTIME) 631FN_FUNC_FN_FN(pow) 632 633extern float __attribute__((overloadable)) pown(float v, int p) { 634 /* The mantissa of a float has fewer bits than an int (24 effective vs. 31). 635 * For very large ints, we'll lose whether the exponent is even or odd, making 636 * the selection of a correct sign incorrect. We correct this. Use copysign 637 * to handle the negative zero case. 638 */ 639 float sign = (p & 0x1) ? copysign(1.f, v) : 1.f; 640 float f = pow(v, (float)p); 641 return copysign(f, sign); 642} 643FN_FUNC_FN_IN(pown) 644 645extern float __attribute__((overloadable)) powr(float v, float p) { 646 return pow(v, p); 647} 648extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) { 649 return pow(v, p); 650} 651extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) { 652 return pow(v, p); 653} 654extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) { 655 return pow(v, p); 656} 657 658extern float __attribute__((overloadable)) SC_remainderf(float, float); 659float __attribute__((overloadable)) remainder(float v1, float v2) { 660 return SC_remainderf(v1, v2); 661} 662FN_FUNC_FN_FN(remainder) 663 664extern float __attribute__((overloadable)) SC_remquof(float, float, int *); 665float __attribute__((overloadable)) remquo(float v1, float v2, int *v3) { 666 return SC_remquof(v1, v2, v3); 667} 668FN_FUNC_FN_FN_PIN(remquo) 669 670extern float __attribute__((overloadable)) SC_rintf(float); 671float __attribute__((overloadable)) rint(float v) { 672 return SC_rintf(v); 673} 674FN_FUNC_FN(rint) 675 676extern float __attribute__((overloadable)) rootn(float v, int r) { 677 if (r == 0) { 678 return posinf(); 679 } 680 681 if (iszero(v)) { 682 if (r < 0) { 683 if (r & 1) { 684 return copysign(posinf(), v); 685 } else { 686 return posinf(); 687 } 688 } else { 689 if (r & 1) { 690 return copysign(0.f, v); 691 } else { 692 return 0.f; 693 } 694 } 695 } 696 697 if (!isinf(v) && !isnan(v) && (v < 0.f)) { 698 if (r & 1) { 699 return (-1.f * pow(-1.f * v, 1.f / r)); 700 } else { 701 return nan(0); 702 } 703 } 704 705 return pow(v, 1.f / r); 706} 707FN_FUNC_FN_IN(rootn); 708 709extern float __attribute__((overloadable)) SC_roundf(float); 710float __attribute__((overloadable)) round(float v) { 711 return SC_roundf(v); 712} 713FN_FUNC_FN(round) 714 715extern float __attribute__((overloadable)) SC_randf2(float, float); 716float __attribute__((overloadable)) rsRand(float min, float max) { 717 return SC_randf2(min, max); 718} 719 720 721extern float __attribute__((overloadable)) rsqrt(float v) { 722 return 1.f / sqrt(v); 723} 724 725#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME) 726// These functions must be defined here if we are not using the SSE 727// implementation, which includes when we are built as part of the 728// debug runtime (libclcore_debug.bc) or compiling with debug info. 729#if defined(RS_G_RUNTIME) 730extern float __attribute__((overloadable)) SC_sqrtf(float); 731float __attribute__((overloadable)) sqrt(float v) { 732 return SC_sqrtf(v); 733} 734#endif // defined(RS_G_RUNTIME) 735 736FN_FUNC_FN(sqrt) 737#else 738extern float2 __attribute__((overloadable)) sqrt(float2); 739extern float3 __attribute__((overloadable)) sqrt(float3); 740extern float4 __attribute__((overloadable)) sqrt(float4); 741#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME) 742 743FN_FUNC_FN(rsqrt) 744 745extern float __attribute__((overloadable)) SC_sinf(float); 746float __attribute__((overloadable)) sin(float v) { 747 return SC_sinf(v); 748} 749FN_FUNC_FN(sin) 750 751extern float __attribute__((overloadable)) sincos(float v, float *cosptr) { 752 *cosptr = cos(v); 753 return sin(v); 754} 755extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) { 756 *cosptr = cos(v); 757 return sin(v); 758} 759extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) { 760 *cosptr = cos(v); 761 return sin(v); 762} 763extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) { 764 *cosptr = cos(v); 765 return sin(v); 766} 767 768extern float __attribute__((overloadable)) SC_sinhf(float); 769float __attribute__((overloadable)) sinh(float v) { 770 return SC_sinhf(v); 771} 772FN_FUNC_FN(sinh) 773 774extern float __attribute__((overloadable)) sinpi(float v) { 775 return sin(v * M_PI); 776} 777FN_FUNC_FN(sinpi) 778 779extern float __attribute__((overloadable)) SC_tanf(float); 780float __attribute__((overloadable)) tan(float v) { 781 return SC_tanf(v); 782} 783FN_FUNC_FN(tan) 784 785extern float __attribute__((overloadable)) SC_tanhf(float); 786float __attribute__((overloadable)) tanh(float v) { 787 return SC_tanhf(v); 788} 789FN_FUNC_FN(tanh) 790 791extern float __attribute__((overloadable)) tanpi(float v) { 792 return tan(v * M_PI); 793} 794FN_FUNC_FN(tanpi) 795 796 797extern float __attribute__((overloadable)) SC_tgammaf(float); 798float __attribute__((overloadable)) tgamma(float v) { 799 return SC_tgammaf(v); 800} 801FN_FUNC_FN(tgamma) 802 803extern float __attribute__((overloadable)) SC_truncf(float); 804float __attribute__((overloadable)) trunc(float v) { 805 return SC_truncf(v); 806} 807FN_FUNC_FN(trunc) 808 809// Int ops (partial), 6.11.3 810 811#define XN_FUNC_YN(typeout, fnc, typein) \ 812extern typeout __attribute__((overloadable)) fnc(typein); \ 813extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) { \ 814 typeout##2 r; \ 815 r.x = fnc(v.x); \ 816 r.y = fnc(v.y); \ 817 return r; \ 818} \ 819extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) { \ 820 typeout##3 r; \ 821 r.x = fnc(v.x); \ 822 r.y = fnc(v.y); \ 823 r.z = fnc(v.z); \ 824 return r; \ 825} \ 826extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) { \ 827 typeout##4 r; \ 828 r.x = fnc(v.x); \ 829 r.y = fnc(v.y); \ 830 r.z = fnc(v.z); \ 831 r.w = fnc(v.w); \ 832 return r; \ 833} 834 835 836#define UIN_FUNC_IN(fnc) \ 837XN_FUNC_YN(uchar, fnc, char) \ 838XN_FUNC_YN(ushort, fnc, short) \ 839XN_FUNC_YN(uint, fnc, int) 840 841#define IN_FUNC_IN(fnc) \ 842XN_FUNC_YN(uchar, fnc, uchar) \ 843XN_FUNC_YN(char, fnc, char) \ 844XN_FUNC_YN(ushort, fnc, ushort) \ 845XN_FUNC_YN(short, fnc, short) \ 846XN_FUNC_YN(uint, fnc, uint) \ 847XN_FUNC_YN(int, fnc, int) 848 849 850#define XN_FUNC_XN_XN_BODY(type, fnc, body) \ 851extern type __attribute__((overloadable)) \ 852 fnc(type v1, type v2) { \ 853 return body; \ 854} \ 855extern type##2 __attribute__((overloadable)) \ 856 fnc(type##2 v1, type##2 v2) { \ 857 type##2 r; \ 858 r.x = fnc(v1.x, v2.x); \ 859 r.y = fnc(v1.y, v2.y); \ 860 return r; \ 861} \ 862extern type##3 __attribute__((overloadable)) \ 863 fnc(type##3 v1, type##3 v2) { \ 864 type##3 r; \ 865 r.x = fnc(v1.x, v2.x); \ 866 r.y = fnc(v1.y, v2.y); \ 867 r.z = fnc(v1.z, v2.z); \ 868 return r; \ 869} \ 870extern type##4 __attribute__((overloadable)) \ 871 fnc(type##4 v1, type##4 v2) { \ 872 type##4 r; \ 873 r.x = fnc(v1.x, v2.x); \ 874 r.y = fnc(v1.y, v2.y); \ 875 r.z = fnc(v1.z, v2.z); \ 876 r.w = fnc(v1.w, v2.w); \ 877 return r; \ 878} 879 880#define IN_FUNC_IN_IN_BODY(fnc, body) \ 881XN_FUNC_XN_XN_BODY(uchar, fnc, body) \ 882XN_FUNC_XN_XN_BODY(char, fnc, body) \ 883XN_FUNC_XN_XN_BODY(ushort, fnc, body) \ 884XN_FUNC_XN_XN_BODY(short, fnc, body) \ 885XN_FUNC_XN_XN_BODY(uint, fnc, body) \ 886XN_FUNC_XN_XN_BODY(int, fnc, body) \ 887XN_FUNC_XN_XN_BODY(float, fnc, body) 888 889 890/** 891 * abs 892 */ 893extern uint32_t __attribute__((overloadable)) abs(int32_t v) { 894 if (v < 0) 895 return -v; 896 return v; 897} 898extern uint16_t __attribute__((overloadable)) abs(int16_t v) { 899 if (v < 0) 900 return -v; 901 return v; 902} 903extern uint8_t __attribute__((overloadable)) abs(int8_t v) { 904 if (v < 0) 905 return -v; 906 return v; 907} 908 909/** 910 * clz 911 * __builtin_clz only accepts a 32-bit unsigned int, so every input will be 912 * expanded to 32 bits. For our smaller data types, we need to subtract off 913 * these unused top bits (that will be always be composed of zeros). 914 */ 915extern uint32_t __attribute__((overloadable)) clz(uint32_t v) { 916 return __builtin_clz(v); 917} 918extern uint16_t __attribute__((overloadable)) clz(uint16_t v) { 919 return __builtin_clz(v) - 16; 920} 921extern uint8_t __attribute__((overloadable)) clz(uint8_t v) { 922 return __builtin_clz(v) - 24; 923} 924extern int32_t __attribute__((overloadable)) clz(int32_t v) { 925 return __builtin_clz(v); 926} 927extern int16_t __attribute__((overloadable)) clz(int16_t v) { 928 return __builtin_clz(((uint32_t)v) & 0x0000ffff) - 16; 929} 930extern int8_t __attribute__((overloadable)) clz(int8_t v) { 931 return __builtin_clz(((uint32_t)v) & 0x000000ff) - 24; 932} 933 934 935UIN_FUNC_IN(abs) 936IN_FUNC_IN(clz) 937 938 939// 6.11.4 940 941 942extern float __attribute__((overloadable)) degrees(float radians) { 943 return radians * (180.f / M_PI); 944} 945extern float2 __attribute__((overloadable)) degrees(float2 radians) { 946 return radians * (180.f / M_PI); 947} 948extern float3 __attribute__((overloadable)) degrees(float3 radians) { 949 return radians * (180.f / M_PI); 950} 951extern float4 __attribute__((overloadable)) degrees(float4 radians) { 952 return radians * (180.f / M_PI); 953} 954 955extern float __attribute__((overloadable)) mix(float start, float stop, float amount) { 956 return start + (stop - start) * amount; 957} 958extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) { 959 return start + (stop - start) * amount; 960} 961extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) { 962 return start + (stop - start) * amount; 963} 964extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) { 965 return start + (stop - start) * amount; 966} 967extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) { 968 return start + (stop - start) * amount; 969} 970extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) { 971 return start + (stop - start) * amount; 972} 973extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) { 974 return start + (stop - start) * amount; 975} 976 977extern float __attribute__((overloadable)) radians(float degrees) { 978 return degrees * (M_PI / 180.f); 979} 980extern float2 __attribute__((overloadable)) radians(float2 degrees) { 981 return degrees * (M_PI / 180.f); 982} 983extern float3 __attribute__((overloadable)) radians(float3 degrees) { 984 return degrees * (M_PI / 180.f); 985} 986extern float4 __attribute__((overloadable)) radians(float4 degrees) { 987 return degrees * (M_PI / 180.f); 988} 989 990extern float __attribute__((overloadable)) step(float edge, float v) { 991 return (v < edge) ? 0.f : 1.f; 992} 993extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) { 994 float2 r; 995 r.x = (v.x < edge.x) ? 0.f : 1.f; 996 r.y = (v.y < edge.y) ? 0.f : 1.f; 997 return r; 998} 999extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) { 1000 float3 r; 1001 r.x = (v.x < edge.x) ? 0.f : 1.f; 1002 r.y = (v.y < edge.y) ? 0.f : 1.f; 1003 r.z = (v.z < edge.z) ? 0.f : 1.f; 1004 return r; 1005} 1006extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) { 1007 float4 r; 1008 r.x = (v.x < edge.x) ? 0.f : 1.f; 1009 r.y = (v.y < edge.y) ? 0.f : 1.f; 1010 r.z = (v.z < edge.z) ? 0.f : 1.f; 1011 r.w = (v.w < edge.w) ? 0.f : 1.f; 1012 return r; 1013} 1014extern float2 __attribute__((overloadable)) step(float2 edge, float v) { 1015 float2 r; 1016 r.x = (v < edge.x) ? 0.f : 1.f; 1017 r.y = (v < edge.y) ? 0.f : 1.f; 1018 return r; 1019} 1020extern float3 __attribute__((overloadable)) step(float3 edge, float v) { 1021 float3 r; 1022 r.x = (v < edge.x) ? 0.f : 1.f; 1023 r.y = (v < edge.y) ? 0.f : 1.f; 1024 r.z = (v < edge.z) ? 0.f : 1.f; 1025 return r; 1026} 1027extern float4 __attribute__((overloadable)) step(float4 edge, float v) { 1028 float4 r; 1029 r.x = (v < edge.x) ? 0.f : 1.f; 1030 r.y = (v < edge.y) ? 0.f : 1.f; 1031 r.z = (v < edge.z) ? 0.f : 1.f; 1032 r.w = (v < edge.w) ? 0.f : 1.f; 1033 return r; 1034} 1035extern float2 __attribute__((overloadable)) step(float edge, float2 v) { 1036 float2 r; 1037 r.x = (v.x < edge) ? 0.f : 1.f; 1038 r.y = (v.y < edge) ? 0.f : 1.f; 1039 return r; 1040} 1041extern float3 __attribute__((overloadable)) step(float edge, float3 v) { 1042 float3 r; 1043 r.x = (v.x < edge) ? 0.f : 1.f; 1044 r.y = (v.y < edge) ? 0.f : 1.f; 1045 r.z = (v.z < edge) ? 0.f : 1.f; 1046 return r; 1047} 1048extern float4 __attribute__((overloadable)) step(float edge, float4 v) { 1049 float4 r; 1050 r.x = (v.x < edge) ? 0.f : 1.f; 1051 r.y = (v.y < edge) ? 0.f : 1.f; 1052 r.z = (v.z < edge) ? 0.f : 1.f; 1053 r.w = (v.w < edge) ? 0.f : 1.f; 1054 return r; 1055} 1056 1057extern float __attribute__((overloadable)) sign(float v) { 1058 if (v > 0) return 1.f; 1059 if (v < 0) return -1.f; 1060 return v; 1061} 1062FN_FUNC_FN(sign) 1063 1064 1065// 6.11.5 1066extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) { 1067 float3 r; 1068 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1069 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1070 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1071 return r; 1072} 1073 1074extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) { 1075 float4 r; 1076 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1077 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1078 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1079 r.w = 0.f; 1080 return r; 1081} 1082 1083#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME) 1084// These functions must be defined here if we are not using the SSE 1085// implementation, which includes when we are built as part of the 1086// debug runtime (libclcore_debug.bc) or compiling with debug info. 1087 1088extern float __attribute__((overloadable)) dot(float lhs, float rhs) { 1089 return lhs * rhs; 1090} 1091extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) { 1092 return lhs.x*rhs.x + lhs.y*rhs.y; 1093} 1094extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) { 1095 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z; 1096} 1097extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) { 1098 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w; 1099} 1100 1101extern float __attribute__((overloadable)) length(float v) { 1102 return fabs(v); 1103} 1104extern float __attribute__((overloadable)) length(float2 v) { 1105 return sqrt(v.x*v.x + v.y*v.y); 1106} 1107extern float __attribute__((overloadable)) length(float3 v) { 1108 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1109} 1110extern float __attribute__((overloadable)) length(float4 v) { 1111 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1112} 1113 1114#else 1115 1116extern float __attribute__((overloadable)) length(float v); 1117extern float __attribute__((overloadable)) length(float2 v); 1118extern float __attribute__((overloadable)) length(float3 v); 1119extern float __attribute__((overloadable)) length(float4 v); 1120 1121#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME) 1122 1123extern float __attribute__((overloadable)) distance(float lhs, float rhs) { 1124 return length(lhs - rhs); 1125} 1126extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) { 1127 return length(lhs - rhs); 1128} 1129extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) { 1130 return length(lhs - rhs); 1131} 1132extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) { 1133 return length(lhs - rhs); 1134} 1135 1136/* For the normalization functions, vectors of length 0 should simply be 1137 * returned (i.e. all the components of that vector are 0). 1138 */ 1139extern float __attribute__((overloadable)) normalize(float v) { 1140 if (v == 0.0f) { 1141 return 0.0f; 1142 } else if (v < 0.0f) { 1143 return -1.0f; 1144 } else { 1145 return 1.0f; 1146 } 1147} 1148extern float2 __attribute__((overloadable)) normalize(float2 v) { 1149 float l = length(v); 1150 return l == 0.0f ? v : v / l; 1151} 1152extern float3 __attribute__((overloadable)) normalize(float3 v) { 1153 float l = length(v); 1154 return l == 0.0f ? v : v / l; 1155} 1156extern float4 __attribute__((overloadable)) normalize(float4 v) { 1157 float l = length(v); 1158 return l == 0.0f ? v : v / l; 1159} 1160 1161extern float __attribute__((overloadable)) half_sqrt(float v) { 1162 return sqrt(v); 1163} 1164FN_FUNC_FN(half_sqrt) 1165 1166extern float __attribute__((overloadable)) fast_length(float v) { 1167 return fabs(v); 1168} 1169extern float __attribute__((overloadable)) fast_length(float2 v) { 1170 return half_sqrt(v.x*v.x + v.y*v.y); 1171} 1172extern float __attribute__((overloadable)) fast_length(float3 v) { 1173 return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1174} 1175extern float __attribute__((overloadable)) fast_length(float4 v) { 1176 return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1177} 1178 1179extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) { 1180 return fast_length(lhs - rhs); 1181} 1182extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) { 1183 return fast_length(lhs - rhs); 1184} 1185extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) { 1186 return fast_length(lhs - rhs); 1187} 1188extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) { 1189 return fast_length(lhs - rhs); 1190} 1191 1192extern float __attribute__((overloadable)) half_rsqrt(float); 1193 1194/* For the normalization functions, vectors of length 0 should simply be 1195 * returned (i.e. all the components of that vector are 0). 1196 */ 1197extern float __attribute__((overloadable)) fast_normalize(float v) { 1198 if (v == 0.0f) { 1199 return 0.0f; 1200 } else if (v < 0.0f) { 1201 return -1.0f; 1202 } else { 1203 return 1.0f; 1204 } 1205} 1206// If the length is 0, then rlength should be NaN. 1207extern float2 __attribute__((overloadable)) fast_normalize(float2 v) { 1208 float rlength = half_rsqrt(v.x*v.x + v.y*v.y); 1209 return (rlength == rlength) ? v * rlength : v; 1210} 1211extern float3 __attribute__((overloadable)) fast_normalize(float3 v) { 1212 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1213 return (rlength == rlength) ? v * rlength : v; 1214} 1215extern float4 __attribute__((overloadable)) fast_normalize(float4 v) { 1216 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1217 return (rlength == rlength) ? v * rlength : v; 1218} 1219 1220extern float __attribute__((overloadable)) half_recip(float v) { 1221 return 1.f / v; 1222} 1223 1224/* 1225extern float __attribute__((overloadable)) approx_atan(float x) { 1226 if (x == 0.f) 1227 return 0.f; 1228 if (x < 0.f) 1229 return -1.f * approx_atan(-1.f * x); 1230 if (x > 1.f) 1231 return M_PI_2 - approx_atan(approx_recip(x)); 1232 return x * approx_recip(1.f + 0.28f * x*x); 1233} 1234FN_FUNC_FN(approx_atan) 1235*/ 1236 1237typedef union 1238{ 1239 float fv; 1240 int32_t iv; 1241} ieee_float_shape_type; 1242 1243/* Get a 32 bit int from a float. */ 1244 1245#define GET_FLOAT_WORD(i,d) \ 1246do { \ 1247 ieee_float_shape_type gf_u; \ 1248 gf_u.fv = (d); \ 1249 (i) = gf_u.iv; \ 1250} while (0) 1251 1252/* Set a float from a 32 bit int. */ 1253 1254#define SET_FLOAT_WORD(d,i) \ 1255do { \ 1256 ieee_float_shape_type sf_u; \ 1257 sf_u.iv = (i); \ 1258 (d) = sf_u.fv; \ 1259} while (0) 1260 1261 1262 1263// Valid -125 to 125 1264extern float __attribute__((overloadable)) native_exp2(float v) { 1265 int32_t iv = (int)v; 1266 int32_t x = iv + (iv >> 31); // ~floor(v) 1267 float r = (v - x); 1268 1269 float fo; 1270 SET_FLOAT_WORD(fo, (x + 127) << 23); 1271 1272 r *= 0.694f; // ~ log(e) / log(2) 1273 float r2 = r*r; 1274 float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1275 return fo * adj; 1276} 1277 1278extern float2 __attribute__((overloadable)) native_exp2(float2 v) { 1279 int2 iv = convert_int2(v); 1280 int2 x = iv + (iv >> (int2)31);//floor(v); 1281 float2 r = (v - convert_float2(x)); 1282 1283 x += 127; 1284 1285 float2 fo = (float2)(x << (int2)23); 1286 1287 r *= 0.694f; // ~ log(e) / log(2) 1288 float2 r2 = r*r; 1289 float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1290 return fo * adj; 1291} 1292 1293extern float4 __attribute__((overloadable)) native_exp2(float4 v) { 1294 int4 iv = convert_int4(v); 1295 int4 x = iv + (iv >> (int4)31);//floor(v); 1296 float4 r = (v - convert_float4(x)); 1297 1298 x += 127; 1299 1300 float4 fo = (float4)(x << (int4)23); 1301 1302 r *= 0.694f; // ~ log(e) / log(2) 1303 float4 r2 = r*r; 1304 float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1305 return fo * adj; 1306} 1307 1308extern float3 __attribute__((overloadable)) native_exp2(float3 v) { 1309 float4 t = 1.f; 1310 t.xyz = v; 1311 return native_exp2(t).xyz; 1312} 1313 1314 1315extern float __attribute__((overloadable)) native_exp(float v) { 1316 return native_exp2(v * 1.442695041f); 1317} 1318extern float2 __attribute__((overloadable)) native_exp(float2 v) { 1319 return native_exp2(v * 1.442695041f); 1320} 1321extern float3 __attribute__((overloadable)) native_exp(float3 v) { 1322 return native_exp2(v * 1.442695041f); 1323} 1324extern float4 __attribute__((overloadable)) native_exp(float4 v) { 1325 return native_exp2(v * 1.442695041f); 1326} 1327 1328extern float __attribute__((overloadable)) native_exp10(float v) { 1329 return native_exp2(v * 3.321928095f); 1330} 1331extern float2 __attribute__((overloadable)) native_exp10(float2 v) { 1332 return native_exp2(v * 3.321928095f); 1333} 1334extern float3 __attribute__((overloadable)) native_exp10(float3 v) { 1335 return native_exp2(v * 3.321928095f); 1336} 1337extern float4 __attribute__((overloadable)) native_exp10(float4 v) { 1338 return native_exp2(v * 3.321928095f); 1339} 1340 1341extern float __attribute__((overloadable)) native_log2(float v) { 1342 int32_t ibits; 1343 GET_FLOAT_WORD(ibits, v); 1344 1345 int32_t e = (ibits >> 23) & 0xff; 1346 1347 ibits &= 0x7fffff; 1348 ibits |= 127 << 23; 1349 1350 float ir; 1351 SET_FLOAT_WORD(ir, ibits); 1352 ir -= 1.5f; 1353 float ir2 = ir*ir; 1354 float adj2 = (0.405465108f / 0.693147181f) + 1355 ((0.666666667f / 0.693147181f) * ir) - 1356 ((0.222222222f / 0.693147181f) * ir2) + 1357 ((0.098765432f / 0.693147181f) * ir*ir2) - 1358 ((0.049382716f / 0.693147181f) * ir2*ir2) + 1359 ((0.026337449f / 0.693147181f) * ir*ir2*ir2) - 1360 ((0.014631916f / 0.693147181f) * ir2*ir2*ir2); 1361 return (float)(e - 127) + adj2; 1362} 1363extern float2 __attribute__((overloadable)) native_log2(float2 v) { 1364 float2 v2 = {native_log2(v.x), native_log2(v.y)}; 1365 return v2; 1366} 1367extern float3 __attribute__((overloadable)) native_log2(float3 v) { 1368 float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)}; 1369 return v2; 1370} 1371extern float4 __attribute__((overloadable)) native_log2(float4 v) { 1372 float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)}; 1373 return v2; 1374} 1375 1376extern float __attribute__((overloadable)) native_log(float v) { 1377 return native_log2(v) * (1.f / 1.442695041f); 1378} 1379extern float2 __attribute__((overloadable)) native_log(float2 v) { 1380 return native_log2(v) * (1.f / 1.442695041f); 1381} 1382extern float3 __attribute__((overloadable)) native_log(float3 v) { 1383 return native_log2(v) * (1.f / 1.442695041f); 1384} 1385extern float4 __attribute__((overloadable)) native_log(float4 v) { 1386 return native_log2(v) * (1.f / 1.442695041f); 1387} 1388 1389extern float __attribute__((overloadable)) native_log10(float v) { 1390 return native_log2(v) * (1.f / 3.321928095f); 1391} 1392extern float2 __attribute__((overloadable)) native_log10(float2 v) { 1393 return native_log2(v) * (1.f / 3.321928095f); 1394} 1395extern float3 __attribute__((overloadable)) native_log10(float3 v) { 1396 return native_log2(v) * (1.f / 3.321928095f); 1397} 1398extern float4 __attribute__((overloadable)) native_log10(float4 v) { 1399 return native_log2(v) * (1.f / 3.321928095f); 1400} 1401 1402 1403extern float __attribute__((overloadable)) native_powr(float v, float y) { 1404 float v2 = native_log2(v); 1405 v2 = fmax(v2 * y, -125.f); 1406 return native_exp2(v2); 1407} 1408extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) { 1409 float2 v2 = native_log2(v); 1410 v2 = fmax(v2 * y, -125.f); 1411 return native_exp2(v2); 1412} 1413extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) { 1414 float3 v2 = native_log2(v); 1415 v2 = fmax(v2 * y, -125.f); 1416 return native_exp2(v2); 1417} 1418extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) { 1419 float4 v2 = native_log2(v); 1420 v2 = fmax(v2 * y, -125.f); 1421 return native_exp2(v2); 1422} 1423 1424extern double __attribute__((overloadable)) min(double v1, double v2) { 1425 return v1 < v2 ? v1 : v2; 1426} 1427 1428extern double2 __attribute__((overloadable)) min(double2 v1, double2 v2) { 1429 double2 r; 1430 r.x = v1.x < v2.x ? v1.x : v2.x; 1431 r.y = v1.y < v2.y ? v1.y : v2.y; 1432 return r; 1433} 1434 1435extern double3 __attribute__((overloadable)) min(double3 v1, double3 v2) { 1436 double3 r; 1437 r.x = v1.x < v2.x ? v1.x : v2.x; 1438 r.y = v1.y < v2.y ? v1.y : v2.y; 1439 r.z = v1.z < v2.z ? v1.z : v2.z; 1440 return r; 1441} 1442 1443extern double4 __attribute__((overloadable)) min(double4 v1, double4 v2) { 1444 double4 r; 1445 r.x = v1.x < v2.x ? v1.x : v2.x; 1446 r.y = v1.y < v2.y ? v1.y : v2.y; 1447 r.z = v1.z < v2.z ? v1.z : v2.z; 1448 r.w = v1.w < v2.w ? v1.w : v2.w; 1449 return r; 1450} 1451 1452extern long __attribute__((overloadable)) min(long v1, long v2) { 1453 return v1 < v2 ? v1 : v2; 1454} 1455extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) { 1456 long2 r; 1457 r.x = v1.x < v2.x ? v1.x : v2.x; 1458 r.y = v1.y < v2.y ? v1.y : v2.y; 1459 return r; 1460} 1461extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) { 1462 long3 r; 1463 r.x = v1.x < v2.x ? v1.x : v2.x; 1464 r.y = v1.y < v2.y ? v1.y : v2.y; 1465 r.z = v1.z < v2.z ? v1.z : v2.z; 1466 return r; 1467} 1468extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) { 1469 long4 r; 1470 r.x = v1.x < v2.x ? v1.x : v2.x; 1471 r.y = v1.y < v2.y ? v1.y : v2.y; 1472 r.z = v1.z < v2.z ? v1.z : v2.z; 1473 r.w = v1.w < v2.w ? v1.w : v2.w; 1474 return r; 1475} 1476 1477extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) { 1478 return v1 < v2 ? v1 : v2; 1479} 1480extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) { 1481 ulong2 r; 1482 r.x = v1.x < v2.x ? v1.x : v2.x; 1483 r.y = v1.y < v2.y ? v1.y : v2.y; 1484 return r; 1485} 1486extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) { 1487 ulong3 r; 1488 r.x = v1.x < v2.x ? v1.x : v2.x; 1489 r.y = v1.y < v2.y ? v1.y : v2.y; 1490 r.z = v1.z < v2.z ? v1.z : v2.z; 1491 return r; 1492} 1493extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) { 1494 ulong4 r; 1495 r.x = v1.x < v2.x ? v1.x : v2.x; 1496 r.y = v1.y < v2.y ? v1.y : v2.y; 1497 r.z = v1.z < v2.z ? v1.z : v2.z; 1498 r.w = v1.w < v2.w ? v1.w : v2.w; 1499 return r; 1500} 1501 1502extern double __attribute__((overloadable)) max(double v1, double v2) { 1503 return v1 > v2 ? v1 : v2; 1504} 1505 1506extern double2 __attribute__((overloadable)) max(double2 v1, double2 v2) { 1507 double2 r; 1508 r.x = v1.x > v2.x ? v1.x : v2.x; 1509 r.y = v1.y > v2.y ? v1.y : v2.y; 1510 return r; 1511} 1512 1513extern double3 __attribute__((overloadable)) max(double3 v1, double3 v2) { 1514 double3 r; 1515 r.x = v1.x > v2.x ? v1.x : v2.x; 1516 r.y = v1.y > v2.y ? v1.y : v2.y; 1517 r.z = v1.z > v2.z ? v1.z : v2.z; 1518 return r; 1519} 1520 1521extern double4 __attribute__((overloadable)) max(double4 v1, double4 v2) { 1522 double4 r; 1523 r.x = v1.x > v2.x ? v1.x : v2.x; 1524 r.y = v1.y > v2.y ? v1.y : v2.y; 1525 r.z = v1.z > v2.z ? v1.z : v2.z; 1526 r.w = v1.w > v2.w ? v1.w : v2.w; 1527 return r; 1528} 1529 1530extern long __attribute__((overloadable)) max(long v1, long v2) { 1531 return v1 > v2 ? v1 : v2; 1532} 1533extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) { 1534 long2 r; 1535 r.x = v1.x > v2.x ? v1.x : v2.x; 1536 r.y = v1.y > v2.y ? v1.y : v2.y; 1537 return r; 1538} 1539extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) { 1540 long3 r; 1541 r.x = v1.x > v2.x ? v1.x : v2.x; 1542 r.y = v1.y > v2.y ? v1.y : v2.y; 1543 r.z = v1.z > v2.z ? v1.z : v2.z; 1544 return r; 1545} 1546extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) { 1547 long4 r; 1548 r.x = v1.x > v2.x ? v1.x : v2.x; 1549 r.y = v1.y > v2.y ? v1.y : v2.y; 1550 r.z = v1.z > v2.z ? v1.z : v2.z; 1551 r.w = v1.w > v2.w ? v1.w : v2.w; 1552 return r; 1553} 1554 1555extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) { 1556 return v1 > v2 ? v1 : v2; 1557} 1558extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) { 1559 ulong2 r; 1560 r.x = v1.x > v2.x ? v1.x : v2.x; 1561 r.y = v1.y > v2.y ? v1.y : v2.y; 1562 return r; 1563} 1564extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) { 1565 ulong3 r; 1566 r.x = v1.x > v2.x ? v1.x : v2.x; 1567 r.y = v1.y > v2.y ? v1.y : v2.y; 1568 r.z = v1.z > v2.z ? v1.z : v2.z; 1569 return r; 1570} 1571extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) { 1572 ulong4 r; 1573 r.x = v1.x > v2.x ? v1.x : v2.x; 1574 r.y = v1.y > v2.y ? v1.y : v2.y; 1575 r.z = v1.z > v2.z ? v1.z : v2.z; 1576 r.w = v1.w > v2.w ? v1.w : v2.w; 1577 return r; 1578} 1579 1580#define THUNK_NATIVE_F(fn) \ 1581 float __attribute__((overloadable)) native_##fn(float v) { return fn(v);} \ 1582 float2 __attribute__((overloadable)) native_##fn(float2 v) { return fn(v);} \ 1583 float3 __attribute__((overloadable)) native_##fn(float3 v) { return fn(v);} \ 1584 float4 __attribute__((overloadable)) native_##fn(float4 v) { return fn(v);} 1585 1586#define THUNK_NATIVE_F_F(fn) \ 1587 float __attribute__((overloadable)) native_##fn(float v1, float v2) { return fn(v1, v2);} \ 1588 float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 v2) { return fn(v1, v2);} \ 1589 float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 v2) { return fn(v1, v2);} \ 1590 float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 v2) { return fn(v1, v2);} 1591 1592#define THUNK_NATIVE_F_FP(fn) \ 1593 float __attribute__((overloadable)) native_##fn(float v1, float *v2) { return fn(v1, v2);} \ 1594 float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 *v2) { return fn(v1, v2);} \ 1595 float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 *v2) { return fn(v1, v2);} \ 1596 float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 *v2) { return fn(v1, v2);} 1597 1598#define THUNK_NATIVE_F_I(fn) \ 1599 float __attribute__((overloadable)) native_##fn(float v1, int v2) { return fn(v1, v2);} \ 1600 float2 __attribute__((overloadable)) native_##fn(float2 v1, int2 v2) { return fn(v1, v2);} \ 1601 float3 __attribute__((overloadable)) native_##fn(float3 v1, int3 v2) { return fn(v1, v2);} \ 1602 float4 __attribute__((overloadable)) native_##fn(float4 v1, int4 v2) { return fn(v1, v2);} 1603 1604THUNK_NATIVE_F(acos) 1605THUNK_NATIVE_F(acosh) 1606THUNK_NATIVE_F(acospi) 1607THUNK_NATIVE_F(asin) 1608THUNK_NATIVE_F(asinh) 1609THUNK_NATIVE_F(asinpi) 1610THUNK_NATIVE_F(atan) 1611THUNK_NATIVE_F_F(atan2) 1612THUNK_NATIVE_F(atanh) 1613THUNK_NATIVE_F(atanpi) 1614THUNK_NATIVE_F_F(atan2pi) 1615THUNK_NATIVE_F(cbrt) 1616THUNK_NATIVE_F(cos) 1617THUNK_NATIVE_F(cosh) 1618THUNK_NATIVE_F(cospi) 1619THUNK_NATIVE_F(expm1) 1620THUNK_NATIVE_F_F(hypot) 1621THUNK_NATIVE_F(log1p) 1622THUNK_NATIVE_F_I(rootn) 1623THUNK_NATIVE_F(rsqrt) 1624THUNK_NATIVE_F(sqrt) 1625THUNK_NATIVE_F(sin) 1626THUNK_NATIVE_F_FP(sincos) 1627THUNK_NATIVE_F(sinh) 1628THUNK_NATIVE_F(sinpi) 1629THUNK_NATIVE_F(tan) 1630THUNK_NATIVE_F(tanh) 1631THUNK_NATIVE_F(tanpi) 1632 1633#undef THUNK_NATIVE_F 1634#undef THUNK_NATIVE_F_F 1635#undef THUNK_NATIVE_F_I 1636#undef THUNK_NATIVE_F_FP 1637 1638float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);} 1639float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);} 1640float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);} 1641float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);} 1642 1643float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);} 1644float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);} 1645float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);} 1646float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);} 1647 1648float __attribute__((overloadable)) native_length(float v) { return fast_length(v);} 1649float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);} 1650float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);} 1651float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);} 1652 1653float __attribute__((overloadable)) native_divide(float v1, float v2) { return v1 / v2;} 1654float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;} 1655float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;} 1656float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;} 1657 1658float __attribute__((overloadable)) native_recip(float v) { return 1.f / v;} 1659float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;} 1660float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;} 1661float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;} 1662 1663 1664 1665 1666 1667#undef FN_FUNC_FN 1668#undef IN_FUNC_FN 1669#undef FN_FUNC_FN_FN 1670#undef FN_FUNC_FN_F 1671#undef FN_FUNC_FN_IN 1672#undef FN_FUNC_FN_I 1673#undef FN_FUNC_FN_PFN 1674#undef FN_FUNC_FN_PIN 1675#undef FN_FUNC_FN_FN_FN 1676#undef FN_FUNC_FN_FN_PIN 1677#undef XN_FUNC_YN 1678#undef UIN_FUNC_IN 1679#undef IN_FUNC_IN 1680#undef XN_FUNC_XN_XN_BODY 1681#undef IN_FUNC_IN_IN_BODY 1682 1683static const unsigned short kHalfPositiveInfinity = 0x7c00; 1684 1685/* Define f16 functions of the form 1686 * HN output = fn(HN input) 1687 * where HN is scalar or vector half type 1688 */ 1689#define HN_FUNC_HN(fn) \ 1690extern half __attribute__((overloadable)) fn(half h) { \ 1691 return (half) fn((float) h); \ 1692} \ 1693extern half2 __attribute__((overloadable)) fn(half2 v) { \ 1694 return convert_half2(fn(convert_float2(v))); \ 1695} \ 1696extern half3 __attribute__((overloadable)) fn(half3 v) { \ 1697 return convert_half3(fn(convert_float3(v))); \ 1698} \ 1699extern half4 __attribute__((overloadable)) fn(half4 v) { \ 1700 return convert_half4(fn(convert_float4(v))); \ 1701} 1702 1703/* Define f16 functions of the form 1704 * HN output = fn(HN input1, HN input2) 1705 * where HN is scalar or vector half type 1706 */ 1707#define HN_FUNC_HN_HN(fn) \ 1708extern half __attribute__((overloadable)) fn(half h1, half h2) { \ 1709 return (half) fn((float) h1, (float) h2); \ 1710} \ 1711extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1712 return convert_half2(fn(convert_float2(v1), \ 1713 convert_float2(v2))); \ 1714} \ 1715extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1716 return convert_half3(fn(convert_float3(v1), \ 1717 convert_float3(v2))); \ 1718} \ 1719extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1720 return convert_half4(fn(convert_float4(v1), \ 1721 convert_float4(v2))); \ 1722} 1723 1724/* Define f16 functions of the form 1725 * HN output = fn(HN input1, half input2) 1726 * where HN is scalar or vector half type 1727 */ 1728#define HN_FUNC_HN_H(fn) \ 1729extern half2 __attribute__((overloadable)) fn(half2 v1, half v2) { \ 1730 return convert_half2(fn(convert_float2(v1), (float) v2)); \ 1731} \ 1732extern half3 __attribute__((overloadable)) fn(half3 v1, half v2) { \ 1733 return convert_half3(fn(convert_float3(v1), (float) v2)); \ 1734} \ 1735extern half4 __attribute__((overloadable)) fn(half4 v1, half v2) { \ 1736 return convert_half4(fn(convert_float4(v1), (float) v2)); \ 1737} 1738 1739/* Define f16 functions of the form 1740 * HN output = fn(HN input1, HN input2, HN input3) 1741 * where HN is scalar or vector half type 1742 */ 1743#define HN_FUNC_HN_HN_HN(fn) \ 1744extern half __attribute__((overloadable)) fn(half h1, half h2, half h3) { \ 1745 return (half) fn((float) h1, (float) h2, (float) h3); \ 1746} \ 1747extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2, half2 v3) { \ 1748 return convert_half2(fn(convert_float2(v1), \ 1749 convert_float2(v2), \ 1750 convert_float2(v3))); \ 1751} \ 1752extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2, half3 v3) { \ 1753 return convert_half3(fn(convert_float3(v1), \ 1754 convert_float3(v2), \ 1755 convert_float3(v3))); \ 1756} \ 1757extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2, half4 v3) { \ 1758 return convert_half4(fn(convert_float4(v1), \ 1759 convert_float4(v2), \ 1760 convert_float4(v3))); \ 1761} 1762 1763/* Define f16 functions of the form 1764 * HN output = fn(HN input1, IN input2) 1765 * where HN is scalar or vector half type and IN the equivalent integer type 1766 * of same vector length. 1767 */ 1768#define HN_FUNC_HN_IN(fn) \ 1769extern half __attribute__((overloadable)) fn(half h1, int v) { \ 1770 return (half) fn((float) h1, v); \ 1771} \ 1772extern half2 __attribute__((overloadable)) fn(half2 v1, int2 v2) { \ 1773 return convert_half2(fn(convert_float2(v1), v2)); \ 1774} \ 1775extern half3 __attribute__((overloadable)) fn(half3 v1, int3 v2) { \ 1776 return convert_half3(fn(convert_float3(v1), v2)); \ 1777} \ 1778extern half4 __attribute__((overloadable)) fn(half4 v1, int4 v2) { \ 1779 return convert_half4(fn(convert_float4(v1), v2)); \ 1780} 1781 1782/* Define f16 functions of the form 1783 * half output = fn(HN input1) 1784 * where HN is a scalar or vector half type. 1785 */ 1786#define H_FUNC_HN(fn) \ 1787extern half __attribute__((overloadable)) fn(half h) { \ 1788 return (half) fn((float) h); \ 1789} \ 1790extern half __attribute__((overloadable)) fn(half2 v) { \ 1791 return fn(convert_float2(v)); \ 1792} \ 1793extern half __attribute__((overloadable)) fn(half3 v) { \ 1794 return fn(convert_float3(v)); \ 1795} \ 1796extern half __attribute__((overloadable)) fn(half4 v) { \ 1797 return fn(convert_float4(v)); \ 1798} 1799 1800/* Define f16 functions of the form 1801 * half output = fn(HN input1, HN input2) 1802 * where HN is a scalar or vector half type. 1803 */ 1804#define H_FUNC_HN_HN(fn) \ 1805extern half __attribute__((overloadable)) fn(half h1, half h2) { \ 1806 return (half) fn((float) h1, (float) h2); \ 1807} \ 1808extern half __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1809 return fn(convert_float2(v1), convert_float2(v2)); \ 1810} \ 1811extern half __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1812 return fn(convert_float3(v1), convert_float3(v2)); \ 1813} \ 1814extern half __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1815 return fn(convert_float4(v1), convert_float4(v2)); \ 1816} 1817 1818#define SCALARIZE_HN_FUNC_HN_PHN(fnc) \ 1819extern half2 __attribute__((overloadable)) fnc(half2 v1, half2 *v2) { \ 1820 half2 ret; \ 1821 half t[2]; \ 1822 ret.x = fnc(v1.x, &t[0]); \ 1823 ret.y = fnc(v1.y, &t[1]); \ 1824 v2->x = t[0]; \ 1825 v2->y = t[1]; \ 1826 return ret; \ 1827} \ 1828extern half3 __attribute__((overloadable)) fnc(half3 v1, half3 *v2) { \ 1829 half3 ret; \ 1830 half t[3]; \ 1831 ret.x = fnc(v1.x, &t[0]); \ 1832 ret.y = fnc(v1.y, &t[1]); \ 1833 ret.z = fnc(v1.z, &t[2]); \ 1834 v2->x = t[0]; \ 1835 v2->y = t[1]; \ 1836 v2->z = t[2]; \ 1837 return ret; \ 1838} \ 1839extern half4 __attribute__((overloadable)) fnc(half4 v1, half4 *v2) { \ 1840 half4 ret; \ 1841 half t[4]; \ 1842 ret.x = fnc(v1.x, &t[0]); \ 1843 ret.y = fnc(v1.y, &t[1]); \ 1844 ret.z = fnc(v1.z, &t[2]); \ 1845 ret.w = fnc(v1.w, &t[3]); \ 1846 v2->x = t[0]; \ 1847 v2->y = t[1]; \ 1848 v2->z = t[2]; \ 1849 v2->w = t[3]; \ 1850 return ret; \ 1851} 1852 1853/* Define f16 functions of the form 1854 * HN output = fn(HN input1, HN input2) 1855 * where HN is a vector half type. The functions are defined to call the 1856 * scalar function of the same name. 1857 */ 1858#define SCALARIZE_HN_FUNC_HN_HN(fn) \ 1859extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1860 half2 ret; \ 1861 ret.x = fn(v1.x, v2.x); \ 1862 ret.y = fn(v1.y, v2.y); \ 1863 return ret; \ 1864} \ 1865extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1866 half3 ret; \ 1867 ret.x = fn(v1.x, v2.x); \ 1868 ret.y = fn(v1.y, v2.y); \ 1869 ret.z = fn(v1.z, v2.z); \ 1870 return ret; \ 1871} \ 1872extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1873 half4 ret; \ 1874 ret.x = fn(v1.x, v2.x); \ 1875 ret.y = fn(v1.y, v2.y); \ 1876 ret.z = fn(v1.z, v2.z); \ 1877 ret.w = fn(v1.w, v2.w); \ 1878 return ret; \ 1879} \ 1880 1881HN_FUNC_HN(acos); 1882HN_FUNC_HN(acosh); 1883HN_FUNC_HN(acospi); 1884HN_FUNC_HN(asin); 1885HN_FUNC_HN(asinh); 1886HN_FUNC_HN(asinpi); 1887HN_FUNC_HN(atan); 1888HN_FUNC_HN(atanh); 1889HN_FUNC_HN(atanpi); 1890HN_FUNC_HN_HN(atan2); 1891HN_FUNC_HN_HN(atan2pi); 1892 1893HN_FUNC_HN(cbrt); 1894HN_FUNC_HN(ceil); 1895 1896extern half __attribute__((overloadable)) copysign(half x, half y); 1897SCALARIZE_HN_FUNC_HN_HN(copysign); 1898 1899HN_FUNC_HN(cos); 1900HN_FUNC_HN(cosh); 1901HN_FUNC_HN(cospi); 1902 1903extern half3 __attribute__((overloadable)) cross(half3 lhs, half3 rhs) { 1904 half3 r; 1905 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1906 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1907 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1908 return r; 1909} 1910 1911extern half4 __attribute__((overloadable)) cross(half4 lhs, half4 rhs) { 1912 half4 r; 1913 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1914 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1915 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1916 r.w = 0.f; 1917 return r; 1918} 1919 1920HN_FUNC_HN(degrees); 1921H_FUNC_HN_HN(distance); 1922H_FUNC_HN_HN(dot); 1923 1924HN_FUNC_HN(erf); 1925HN_FUNC_HN(erfc); 1926HN_FUNC_HN(exp); 1927HN_FUNC_HN(exp10); 1928HN_FUNC_HN(exp2); 1929HN_FUNC_HN(expm1); 1930 1931HN_FUNC_HN(fabs); 1932HN_FUNC_HN_HN(fdim); 1933HN_FUNC_HN(floor); 1934HN_FUNC_HN_HN_HN(fma); 1935HN_FUNC_HN_HN(fmax); 1936HN_FUNC_HN_H(fmax); 1937HN_FUNC_HN_HN(fmin); 1938HN_FUNC_HN_H(fmin); 1939HN_FUNC_HN_HN(fmod); 1940 1941extern half __attribute__((overloadable)) fract(half v, half *iptr) { 1942 // maxLessThanOne = 0.99951171875, the largest value < 1.0 1943 half maxLessThanOne; 1944 SET_HALF_WORD(maxLessThanOne, 0x3bff); 1945 1946 int i = (int) floor(v); 1947 if (iptr) { 1948 *iptr = i; 1949 } 1950 // return v - floor(v), if strictly less than one 1951 return fmin(v - i, maxLessThanOne); 1952} 1953 1954SCALARIZE_HN_FUNC_HN_PHN(fract); 1955 1956extern half __attribute__((const, overloadable)) fract(half v) { 1957 half unused; 1958 return fract(v, &unused); 1959} 1960 1961extern half2 __attribute__((const, overloadable)) fract(half2 v) { 1962 half2 unused; 1963 return fract(v, &unused); 1964} 1965 1966extern half3 __attribute__((const, overloadable)) fract(half3 v) { 1967 half3 unused; 1968 return fract(v, &unused); 1969} 1970 1971extern half4 __attribute__((const, overloadable)) fract(half4 v) { 1972 half4 unused; 1973 return fract(v, &unused); 1974} 1975 1976extern half __attribute__((overloadable)) frexp(half x, int *eptr); 1977 1978extern half2 __attribute__((overloadable)) frexp(half2 v1, int2 *eptr) { 1979 half2 ret; 1980 int e[2]; 1981 ret.x = frexp(v1.x, &e[0]); 1982 ret.y = frexp(v1.y, &e[1]); 1983 eptr->x = e[0]; 1984 eptr->y = e[1]; 1985 return ret; 1986} 1987 1988extern half3 __attribute__((overloadable)) frexp(half3 v1, int3 *eptr) { 1989 half3 ret; 1990 int e[3]; 1991 ret.x = frexp(v1.x, &e[0]); 1992 ret.y = frexp(v1.y, &e[1]); 1993 ret.z = frexp(v1.z, &e[2]); 1994 eptr->x = e[0]; 1995 eptr->y = e[1]; 1996 eptr->z = e[2]; 1997 return ret; 1998} 1999 2000extern half4 __attribute__((overloadable)) frexp(half4 v1, int4 *eptr) { 2001 half4 ret; 2002 int e[4]; 2003 ret.x = frexp(v1.x, &e[0]); 2004 ret.y = frexp(v1.y, &e[1]); 2005 ret.z = frexp(v1.z, &e[2]); 2006 ret.w = frexp(v1.w, &e[3]); 2007 eptr->x = e[0]; 2008 eptr->y = e[1]; 2009 eptr->z = e[2]; 2010 eptr->w = e[3]; 2011 return ret; 2012} 2013 2014HN_FUNC_HN_HN(hypot); 2015 2016extern int __attribute__((overloadable)) ilogb(half x); 2017 2018extern int2 __attribute__((overloadable)) ilogb(half2 v) { 2019 int2 ret; 2020 ret.x = ilogb(v.x); 2021 ret.y = ilogb(v.y); 2022 return ret; 2023} 2024extern int3 __attribute__((overloadable)) ilogb(half3 v) { 2025 int3 ret; 2026 ret.x = ilogb(v.x); 2027 ret.y = ilogb(v.y); 2028 ret.z = ilogb(v.z); 2029 return ret; 2030} 2031extern int4 __attribute__((overloadable)) ilogb(half4 v) { 2032 int4 ret; 2033 ret.x = ilogb(v.x); 2034 ret.y = ilogb(v.y); 2035 ret.z = ilogb(v.z); 2036 ret.w = ilogb(v.w); 2037 return ret; 2038} 2039 2040HN_FUNC_HN_IN(ldexp); 2041extern half2 __attribute__((overloadable)) ldexp(half2 v, int exponent) { 2042 return convert_half2(ldexp(convert_float2(v), exponent)); 2043} 2044extern half3 __attribute__((overloadable)) ldexp(half3 v, int exponent) { 2045 return convert_half3(ldexp(convert_float3(v), exponent)); 2046} 2047extern half4 __attribute__((overloadable)) ldexp(half4 v, int exponent) { 2048 return convert_half4(ldexp(convert_float4(v), exponent)); 2049} 2050 2051H_FUNC_HN(length); 2052HN_FUNC_HN(lgamma); 2053 2054extern half __attribute__((overloadable)) lgamma(half h, int *signp) { 2055 return (half) lgamma((float) h, signp); 2056} 2057extern half2 __attribute__((overloadable)) lgamma(half2 v, int2 *signp) { 2058 return convert_half2(lgamma(convert_float2(v), signp)); 2059} 2060extern half3 __attribute__((overloadable)) lgamma(half3 v, int3 *signp) { 2061 return convert_half3(lgamma(convert_float3(v), signp)); 2062} 2063extern half4 __attribute__((overloadable)) lgamma(half4 v, int4 *signp) { 2064 return convert_half4(lgamma(convert_float4(v), signp)); 2065} 2066 2067HN_FUNC_HN(log); 2068HN_FUNC_HN(log10); 2069HN_FUNC_HN(log1p); 2070HN_FUNC_HN(log2); 2071HN_FUNC_HN(logb); 2072 2073HN_FUNC_HN_HN_HN(mad); 2074HN_FUNC_HN_HN(max); 2075HN_FUNC_HN_H(max); // TODO can this be arch-specific similar to _Z3maxDv2_ff? 2076HN_FUNC_HN_HN(min); 2077HN_FUNC_HN_H(min); // TODO can this be arch-specific similar to _Z3minDv2_ff? 2078 2079extern half __attribute__((overloadable)) mix(half start, half stop, half amount) { 2080 return start + (stop - start) * amount; 2081} 2082extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half2 amount) { 2083 return start + (stop - start) * amount; 2084} 2085extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half3 amount) { 2086 return start + (stop - start) * amount; 2087} 2088extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half4 amount) { 2089 return start + (stop - start) * amount; 2090} 2091extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half amount) { 2092 return start + (stop - start) * amount; 2093} 2094extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half amount) { 2095 return start + (stop - start) * amount; 2096} 2097extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half amount) { 2098 return start + (stop - start) * amount; 2099} 2100 2101extern half __attribute__((overloadable)) modf(half x, half *iptr); 2102SCALARIZE_HN_FUNC_HN_PHN(modf); 2103 2104half __attribute__((overloadable)) nan_half() { 2105 unsigned short nan_short = kHalfPositiveInfinity | 0x0200; 2106 half nan; 2107 SET_HALF_WORD(nan, nan_short); 2108 return nan; 2109} 2110 2111HN_FUNC_HN(normalize); 2112 2113extern half __attribute__((overloadable)) nextafter(half x, half y); 2114SCALARIZE_HN_FUNC_HN_HN(nextafter); 2115 2116HN_FUNC_HN_HN(pow); 2117HN_FUNC_HN_IN(pown); 2118HN_FUNC_HN_HN(powr); 2119HN_FUNC_HN(radians); 2120HN_FUNC_HN_HN(remainder); 2121 2122extern half __attribute__((overloadable)) remquo(half n, half d, int *quo) { 2123 return (float) remquo((float) n, (float) d, quo); 2124} 2125extern half2 __attribute__((overloadable)) remquo(half2 n, half2 d, int2 *quo) { 2126 return convert_half2(remquo(convert_float2(d), convert_float2(n), quo)); 2127} 2128extern half3 __attribute__((overloadable)) remquo(half3 n, half3 d, int3 *quo) { 2129 return convert_half3(remquo(convert_float3(d), convert_float3(n), quo)); 2130} 2131extern half4 __attribute__((overloadable)) remquo(half4 n, half4 d, int4 *quo) { 2132 return convert_half4(remquo(convert_float4(d), convert_float4(n), quo)); 2133} 2134 2135HN_FUNC_HN(rint); 2136HN_FUNC_HN_IN(rootn); 2137HN_FUNC_HN(round); 2138HN_FUNC_HN(rsqrt); 2139 2140extern half __attribute__((overloadable)) sign(half h) { 2141 if (h > 0) return (half) 1.f; 2142 if (h < 0) return (half) -1.f; 2143 return h; 2144} 2145extern half2 __attribute__((overloadable)) sign(half2 v) { 2146 half2 ret; 2147 ret.x = sign(v.x); 2148 ret.y = sign(v.y); 2149 return ret; 2150} 2151extern half3 __attribute__((overloadable)) sign(half3 v) { 2152 half3 ret; 2153 ret.x = sign(v.x); 2154 ret.y = sign(v.y); 2155 ret.z = sign(v.z); 2156 return ret; 2157} 2158extern half4 __attribute__((overloadable)) sign(half4 v) { 2159 half4 ret; 2160 ret.x = sign(v.x); 2161 ret.y = sign(v.y); 2162 ret.z = sign(v.z); 2163 ret.w = sign(v.w); 2164 return ret; 2165} 2166 2167HN_FUNC_HN(sin); 2168 2169extern half __attribute__((overloadable)) sincos(half v, half *cosptr) { 2170 *cosptr = cos(v); 2171 return sin(v); 2172} 2173// TODO verify if LLVM eliminates the duplicate convert_float2 2174extern half2 __attribute__((overloadable)) sincos(half2 v, half2 *cosptr) { 2175 *cosptr = cos(v); 2176 return sin(v); 2177} 2178extern half3 __attribute__((overloadable)) sincos(half3 v, half3 *cosptr) { 2179 *cosptr = cos(v); 2180 return sin(v); 2181} 2182extern half4 __attribute__((overloadable)) sincos(half4 v, half4 *cosptr) { 2183 *cosptr = cos(v); 2184 return sin(v); 2185} 2186 2187HN_FUNC_HN(sinh); 2188HN_FUNC_HN(sinpi); 2189HN_FUNC_HN(sqrt); 2190 2191extern half __attribute__((overloadable)) step(half edge, half v) { 2192 return (v < edge) ? 0.f : 1.f; 2193} 2194extern half2 __attribute__((overloadable)) step(half2 edge, half2 v) { 2195 half2 r; 2196 r.x = (v.x < edge.x) ? 0.f : 1.f; 2197 r.y = (v.y < edge.y) ? 0.f : 1.f; 2198 return r; 2199} 2200extern half3 __attribute__((overloadable)) step(half3 edge, half3 v) { 2201 half3 r; 2202 r.x = (v.x < edge.x) ? 0.f : 1.f; 2203 r.y = (v.y < edge.y) ? 0.f : 1.f; 2204 r.z = (v.z < edge.z) ? 0.f : 1.f; 2205 return r; 2206} 2207extern half4 __attribute__((overloadable)) step(half4 edge, half4 v) { 2208 half4 r; 2209 r.x = (v.x < edge.x) ? 0.f : 1.f; 2210 r.y = (v.y < edge.y) ? 0.f : 1.f; 2211 r.z = (v.z < edge.z) ? 0.f : 1.f; 2212 r.w = (v.w < edge.w) ? 0.f : 1.f; 2213 return r; 2214} 2215extern half2 __attribute__((overloadable)) step(half2 edge, half v) { 2216 half2 r; 2217 r.x = (v < edge.x) ? 0.f : 1.f; 2218 r.y = (v < edge.y) ? 0.f : 1.f; 2219 return r; 2220} 2221extern half3 __attribute__((overloadable)) step(half3 edge, half v) { 2222 half3 r; 2223 r.x = (v < edge.x) ? 0.f : 1.f; 2224 r.y = (v < edge.y) ? 0.f : 1.f; 2225 r.z = (v < edge.z) ? 0.f : 1.f; 2226 return r; 2227} 2228extern half4 __attribute__((overloadable)) step(half4 edge, half v) { 2229 half4 r; 2230 r.x = (v < edge.x) ? 0.f : 1.f; 2231 r.y = (v < edge.y) ? 0.f : 1.f; 2232 r.z = (v < edge.z) ? 0.f : 1.f; 2233 r.w = (v < edge.w) ? 0.f : 1.f; 2234 return r; 2235} 2236extern half2 __attribute__((overloadable)) step(half edge, half2 v) { 2237 half2 r; 2238 r.x = (v.x < edge) ? 0.f : 1.f; 2239 r.y = (v.y < edge) ? 0.f : 1.f; 2240 return r; 2241} 2242extern half3 __attribute__((overloadable)) step(half edge, half3 v) { 2243 half3 r; 2244 r.x = (v.x < edge) ? 0.f : 1.f; 2245 r.y = (v.y < edge) ? 0.f : 1.f; 2246 r.z = (v.z < edge) ? 0.f : 1.f; 2247 return r; 2248} 2249extern half4 __attribute__((overloadable)) step(half edge, half4 v) { 2250 half4 r; 2251 r.x = (v.x < edge) ? 0.f : 1.f; 2252 r.y = (v.y < edge) ? 0.f : 1.f; 2253 r.z = (v.z < edge) ? 0.f : 1.f; 2254 r.w = (v.w < edge) ? 0.f : 1.f; 2255 return r; 2256} 2257 2258HN_FUNC_HN(tan); 2259HN_FUNC_HN(tanh); 2260HN_FUNC_HN(tanpi); 2261HN_FUNC_HN(tgamma); 2262HN_FUNC_HN(trunc); // TODO: rethink: needs half-specific implementation? 2263 2264HN_FUNC_HN(native_acos); 2265HN_FUNC_HN(native_acosh); 2266HN_FUNC_HN(native_acospi); 2267HN_FUNC_HN(native_asin); 2268HN_FUNC_HN(native_asinh); 2269HN_FUNC_HN(native_asinpi); 2270HN_FUNC_HN(native_atan); 2271HN_FUNC_HN(native_atanh); 2272HN_FUNC_HN(native_atanpi); 2273HN_FUNC_HN_HN(native_atan2); 2274HN_FUNC_HN_HN(native_atan2pi); 2275 2276HN_FUNC_HN(native_cbrt); 2277HN_FUNC_HN(native_cos); 2278HN_FUNC_HN(native_cosh); 2279HN_FUNC_HN(native_cospi); 2280 2281H_FUNC_HN_HN(native_distance); 2282HN_FUNC_HN_HN(native_divide); 2283 2284HN_FUNC_HN(native_exp); 2285HN_FUNC_HN(native_exp10); 2286HN_FUNC_HN(native_exp2); 2287HN_FUNC_HN(native_expm1); 2288 2289HN_FUNC_HN_HN(native_hypot); 2290H_FUNC_HN(native_length); 2291 2292HN_FUNC_HN(native_log); 2293HN_FUNC_HN(native_log10); 2294HN_FUNC_HN(native_log1p); 2295HN_FUNC_HN(native_log2); 2296 2297HN_FUNC_HN(native_normalize); 2298 2299HN_FUNC_HN_HN(native_powr); // TODO are parameter limits different for half? 2300 2301HN_FUNC_HN(native_recip); 2302HN_FUNC_HN_IN(native_rootn); 2303HN_FUNC_HN(native_rsqrt); 2304 2305HN_FUNC_HN(native_sin); 2306 2307extern half __attribute__((overloadable)) native_sincos(half v, half *cosptr) { 2308 return sincos(v, cosptr); 2309} 2310extern half2 __attribute__((overloadable)) native_sincos(half2 v, half2 *cosptr) { 2311 return sincos(v, cosptr); 2312} 2313extern half3 __attribute__((overloadable)) native_sincos(half3 v, half3 *cosptr) { 2314 return sincos(v, cosptr); 2315} 2316extern half4 __attribute__((overloadable)) native_sincos(half4 v, half4 *cosptr) { 2317 return sincos(v, cosptr); 2318} 2319 2320HN_FUNC_HN(native_sinh); 2321HN_FUNC_HN(native_sinpi); 2322HN_FUNC_HN(native_sqrt); 2323 2324HN_FUNC_HN(native_tan); 2325HN_FUNC_HN(native_tanh); 2326HN_FUNC_HN(native_tanpi); 2327 2328#undef HN_FUNC_HN 2329#undef HN_FUNC_HN_HN 2330#undef HN_FUNC_HN_H 2331#undef HN_FUNC_HN_HN_HN 2332#undef HN_FUNC_HN_IN 2333#undef H_FUNC_HN 2334#undef H_FUNC_HN_HN 2335#undef SCALARIZE_HN_FUNC_HN_HN 2336 2337// exports unavailable mathlib functions to compat lib 2338 2339#ifdef RS_COMPATIBILITY_LIB 2340 2341// !!! DANGER !!! 2342// These functions are potentially missing on older Android versions. 2343// Work around the issue by supplying our own variants. 2344// !!! DANGER !!! 2345 2346// The logbl() implementation is taken from the latest bionic/, since 2347// double == long double on Android. 2348extern "C" long double logbl(long double x) { return logb(x); } 2349 2350// __aeabi_idiv0 is a missing function in libcompiler_rt.so, so we just 2351// pick the simplest implementation based on the ARM EABI doc. 2352extern "C" int __aeabi_idiv0(int v) { return v; } 2353 2354#endif // compatibility lib 2355