rs_cl.c revision e0f52d61fe6790093ef19c734dda2e2c9c0c0fbf
1#include "rs_core.rsh" 2 3extern float2 __attribute__((overloadable)) convert_float2(int2 c); 4extern float3 __attribute__((overloadable)) convert_float3(int3 c); 5extern float4 __attribute__((overloadable)) convert_float4(int4 c); 6 7extern int2 __attribute__((overloadable)) convert_int2(float2 c); 8extern int3 __attribute__((overloadable)) convert_int3(float3 c); 9extern int4 __attribute__((overloadable)) convert_int4(float4 c); 10 11 12extern float __attribute__((overloadable)) fmin(float v, float v2); 13extern float2 __attribute__((overloadable)) fmin(float2 v, float v2); 14extern float3 __attribute__((overloadable)) fmin(float3 v, float v2); 15extern float4 __attribute__((overloadable)) fmin(float4 v, float v2); 16 17extern float __attribute__((overloadable)) fmax(float v, float v2); 18extern float2 __attribute__((overloadable)) fmax(float2 v, float v2); 19extern float3 __attribute__((overloadable)) fmax(float3 v, float v2); 20extern float4 __attribute__((overloadable)) fmax(float4 v, float v2); 21 22// Float ops, 6.11.2 23 24#define FN_FUNC_FN(fnc) \ 25extern float2 __attribute__((overloadable)) fnc(float2 v) { \ 26 float2 r; \ 27 r.x = fnc(v.x); \ 28 r.y = fnc(v.y); \ 29 return r; \ 30} \ 31extern float3 __attribute__((overloadable)) fnc(float3 v) { \ 32 float3 r; \ 33 r.x = fnc(v.x); \ 34 r.y = fnc(v.y); \ 35 r.z = fnc(v.z); \ 36 return r; \ 37} \ 38extern float4 __attribute__((overloadable)) fnc(float4 v) { \ 39 float4 r; \ 40 r.x = fnc(v.x); \ 41 r.y = fnc(v.y); \ 42 r.z = fnc(v.z); \ 43 r.w = fnc(v.w); \ 44 return r; \ 45} 46 47#define IN_FUNC_FN(fnc) \ 48extern int2 __attribute__((overloadable)) fnc(float2 v) { \ 49 int2 r; \ 50 r.x = fnc(v.x); \ 51 r.y = fnc(v.y); \ 52 return r; \ 53} \ 54extern int3 __attribute__((overloadable)) fnc(float3 v) { \ 55 int3 r; \ 56 r.x = fnc(v.x); \ 57 r.y = fnc(v.y); \ 58 r.z = fnc(v.z); \ 59 return r; \ 60} \ 61extern int4 __attribute__((overloadable)) fnc(float4 v) { \ 62 int4 r; \ 63 r.x = fnc(v.x); \ 64 r.y = fnc(v.y); \ 65 r.z = fnc(v.z); \ 66 r.w = fnc(v.w); \ 67 return r; \ 68} 69 70#define FN_FUNC_FN_FN(fnc) \ 71extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \ 72 float2 r; \ 73 r.x = fnc(v1.x, v2.x); \ 74 r.y = fnc(v1.y, v2.y); \ 75 return r; \ 76} \ 77extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \ 78 float3 r; \ 79 r.x = fnc(v1.x, v2.x); \ 80 r.y = fnc(v1.y, v2.y); \ 81 r.z = fnc(v1.z, v2.z); \ 82 return r; \ 83} \ 84extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \ 85 float4 r; \ 86 r.x = fnc(v1.x, v2.x); \ 87 r.y = fnc(v1.y, v2.y); \ 88 r.z = fnc(v1.z, v2.z); \ 89 r.w = fnc(v1.w, v2.w); \ 90 return r; \ 91} 92 93#define FN_FUNC_FN_F(fnc) \ 94extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) { \ 95 float2 r; \ 96 r.x = fnc(v1.x, v2); \ 97 r.y = fnc(v1.y, v2); \ 98 return r; \ 99} \ 100extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) { \ 101 float3 r; \ 102 r.x = fnc(v1.x, v2); \ 103 r.y = fnc(v1.y, v2); \ 104 r.z = fnc(v1.z, v2); \ 105 return r; \ 106} \ 107extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) { \ 108 float4 r; \ 109 r.x = fnc(v1.x, v2); \ 110 r.y = fnc(v1.y, v2); \ 111 r.z = fnc(v1.z, v2); \ 112 r.w = fnc(v1.w, v2); \ 113 return r; \ 114} 115 116#define FN_FUNC_FN_IN(fnc) \ 117extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) { \ 118 float2 r; \ 119 r.x = fnc(v1.x, v2.x); \ 120 r.y = fnc(v1.y, v2.y); \ 121 return r; \ 122} \ 123extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) { \ 124 float3 r; \ 125 r.x = fnc(v1.x, v2.x); \ 126 r.y = fnc(v1.y, v2.y); \ 127 r.z = fnc(v1.z, v2.z); \ 128 return r; \ 129} \ 130extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) { \ 131 float4 r; \ 132 r.x = fnc(v1.x, v2.x); \ 133 r.y = fnc(v1.y, v2.y); \ 134 r.z = fnc(v1.z, v2.z); \ 135 r.w = fnc(v1.w, v2.w); \ 136 return r; \ 137} 138 139#define FN_FUNC_FN_I(fnc) \ 140extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) { \ 141 float2 r; \ 142 r.x = fnc(v1.x, v2); \ 143 r.y = fnc(v1.y, v2); \ 144 return r; \ 145} \ 146extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) { \ 147 float3 r; \ 148 r.x = fnc(v1.x, v2); \ 149 r.y = fnc(v1.y, v2); \ 150 r.z = fnc(v1.z, v2); \ 151 return r; \ 152} \ 153extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) { \ 154 float4 r; \ 155 r.x = fnc(v1.x, v2); \ 156 r.y = fnc(v1.y, v2); \ 157 r.z = fnc(v1.z, v2); \ 158 r.w = fnc(v1.w, v2); \ 159 return r; \ 160} 161 162#define FN_FUNC_FN_PFN(fnc) \ 163extern float2 __attribute__((overloadable)) \ 164 fnc(float2 v1, float2 *v2) { \ 165 float2 r; \ 166 float t[2]; \ 167 r.x = fnc(v1.x, &t[0]); \ 168 r.y = fnc(v1.y, &t[1]); \ 169 v2->x = t[0]; \ 170 v2->y = t[1]; \ 171 return r; \ 172} \ 173extern float3 __attribute__((overloadable)) \ 174 fnc(float3 v1, float3 *v2) { \ 175 float3 r; \ 176 float t[3]; \ 177 r.x = fnc(v1.x, &t[0]); \ 178 r.y = fnc(v1.y, &t[1]); \ 179 r.z = fnc(v1.z, &t[2]); \ 180 v2->x = t[0]; \ 181 v2->y = t[1]; \ 182 v2->z = t[2]; \ 183 return r; \ 184} \ 185extern float4 __attribute__((overloadable)) \ 186 fnc(float4 v1, float4 *v2) { \ 187 float4 r; \ 188 float t[4]; \ 189 r.x = fnc(v1.x, &t[0]); \ 190 r.y = fnc(v1.y, &t[1]); \ 191 r.z = fnc(v1.z, &t[2]); \ 192 r.w = fnc(v1.w, &t[3]); \ 193 v2->x = t[0]; \ 194 v2->y = t[1]; \ 195 v2->z = t[2]; \ 196 v2->w = t[3]; \ 197 return r; \ 198} 199 200#define FN_FUNC_FN_PIN(fnc) \ 201extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) { \ 202 float2 r; \ 203 int t[2]; \ 204 r.x = fnc(v1.x, &t[0]); \ 205 r.y = fnc(v1.y, &t[1]); \ 206 v2->x = t[0]; \ 207 v2->y = t[1]; \ 208 return r; \ 209} \ 210extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) { \ 211 float3 r; \ 212 int t[3]; \ 213 r.x = fnc(v1.x, &t[0]); \ 214 r.y = fnc(v1.y, &t[1]); \ 215 r.z = fnc(v1.z, &t[2]); \ 216 v2->x = t[0]; \ 217 v2->y = t[1]; \ 218 v2->z = t[2]; \ 219 return r; \ 220} \ 221extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) { \ 222 float4 r; \ 223 int t[4]; \ 224 r.x = fnc(v1.x, &t[0]); \ 225 r.y = fnc(v1.y, &t[1]); \ 226 r.z = fnc(v1.z, &t[2]); \ 227 r.w = fnc(v1.w, &t[3]); \ 228 v2->x = t[0]; \ 229 v2->y = t[1]; \ 230 v2->z = t[2]; \ 231 v2->w = t[3]; \ 232 return r; \ 233} 234 235#define FN_FUNC_FN_FN_FN(fnc) \ 236extern float2 __attribute__((overloadable)) \ 237 fnc(float2 v1, float2 v2, float2 v3) { \ 238 float2 r; \ 239 r.x = fnc(v1.x, v2.x, v3.x); \ 240 r.y = fnc(v1.y, v2.y, v3.y); \ 241 return r; \ 242} \ 243extern float3 __attribute__((overloadable)) \ 244 fnc(float3 v1, float3 v2, float3 v3) { \ 245 float3 r; \ 246 r.x = fnc(v1.x, v2.x, v3.x); \ 247 r.y = fnc(v1.y, v2.y, v3.y); \ 248 r.z = fnc(v1.z, v2.z, v3.z); \ 249 return r; \ 250} \ 251extern float4 __attribute__((overloadable)) \ 252 fnc(float4 v1, float4 v2, float4 v3) { \ 253 float4 r; \ 254 r.x = fnc(v1.x, v2.x, v3.x); \ 255 r.y = fnc(v1.y, v2.y, v3.y); \ 256 r.z = fnc(v1.z, v2.z, v3.z); \ 257 r.w = fnc(v1.w, v2.w, v3.w); \ 258 return r; \ 259} 260 261#define FN_FUNC_FN_FN_PIN(fnc) \ 262extern float2 __attribute__((overloadable)) \ 263 fnc(float2 v1, float2 v2, int2 *v3) { \ 264 float2 r; \ 265 int t[2]; \ 266 r.x = fnc(v1.x, v2.x, &t[0]); \ 267 r.y = fnc(v1.y, v2.y, &t[1]); \ 268 v3->x = t[0]; \ 269 v3->y = t[1]; \ 270 return r; \ 271} \ 272extern float3 __attribute__((overloadable)) \ 273 fnc(float3 v1, float3 v2, int3 *v3) { \ 274 float3 r; \ 275 int t[3]; \ 276 r.x = fnc(v1.x, v2.x, &t[0]); \ 277 r.y = fnc(v1.y, v2.y, &t[1]); \ 278 r.z = fnc(v1.z, v2.z, &t[2]); \ 279 v3->x = t[0]; \ 280 v3->y = t[1]; \ 281 v3->z = t[2]; \ 282 return r; \ 283} \ 284extern float4 __attribute__((overloadable)) \ 285 fnc(float4 v1, float4 v2, int4 *v3) { \ 286 float4 r; \ 287 int t[4]; \ 288 r.x = fnc(v1.x, v2.x, &t[0]); \ 289 r.y = fnc(v1.y, v2.y, &t[1]); \ 290 r.z = fnc(v1.z, v2.z, &t[2]); \ 291 r.w = fnc(v1.w, v2.w, &t[3]); \ 292 v3->x = t[0]; \ 293 v3->y = t[1]; \ 294 v3->z = t[2]; \ 295 v3->w = t[3]; \ 296 return r; \ 297} 298 299static const int iposinf = 0x7f800000; 300static const int ineginf = 0xff800000; 301 302static const float posinf() { 303 float f = *((float*)&iposinf); 304 return f; 305} 306 307static const float neginf() { 308 float f = *((float*)&ineginf); 309 return f; 310} 311 312static bool isinf(float f) { 313 int i = *((int*)(void*)&f); 314 return (i == iposinf) || (i == ineginf); 315} 316 317static bool isnan(float f) { 318 int i = *((int*)(void*)&f); 319 return (((i & 0x7f800000) == 0x7f800000) && (i & 0x007fffff)); 320} 321 322static bool isposzero(float f) { 323 int i = *((int*)(void*)&f); 324 return (i == 0x00000000); 325} 326 327static bool isnegzero(float f) { 328 int i = *((int*)(void*)&f); 329 return (i == 0x80000000); 330} 331 332static bool iszero(float f) { 333 return isposzero(f) || isnegzero(f); 334} 335 336 337extern float __attribute__((overloadable)) SC_acosf(float); 338float __attribute__((overloadable)) acos(float v) { 339 return SC_acosf(v); 340} 341FN_FUNC_FN(acos) 342 343extern float __attribute__((overloadable)) SC_acoshf(float); 344float __attribute__((overloadable)) acosh(float v) { 345 return SC_acoshf(v); 346} 347FN_FUNC_FN(acosh) 348 349 350extern float __attribute__((overloadable)) acospi(float v) { 351 return acos(v) / M_PI; 352} 353FN_FUNC_FN(acospi) 354 355extern float __attribute__((overloadable)) SC_asinf(float); 356float __attribute__((overloadable)) asin(float v) { 357 return SC_asinf(v); 358} 359FN_FUNC_FN(asin) 360 361extern float __attribute__((overloadable)) SC_asinhf(float); 362float __attribute__((overloadable)) asinh(float v) { 363 return SC_asinhf(v); 364} 365FN_FUNC_FN(asinh) 366 367extern float __attribute__((overloadable)) asinpi(float v) { 368 return asin(v) / M_PI; 369} 370FN_FUNC_FN(asinpi) 371 372extern float __attribute__((overloadable)) SC_atanf(float); 373float __attribute__((overloadable)) atan(float v) { 374 return SC_atanf(v); 375} 376FN_FUNC_FN(atan) 377 378extern float __attribute__((overloadable)) SC_atan2f(float, float); 379float __attribute__((overloadable)) atan2(float v1, float v2) { 380 return SC_atan2f(v1, v2); 381} 382FN_FUNC_FN_FN(atan2) 383 384extern float __attribute__((overloadable)) SC_atanhf(float); 385float __attribute__((overloadable)) atanh(float v) { 386 return SC_atanhf(v); 387} 388FN_FUNC_FN(atanh) 389 390extern float __attribute__((overloadable)) atanpi(float v) { 391 return atan(v) / M_PI; 392} 393FN_FUNC_FN(atanpi) 394 395 396extern float __attribute__((overloadable)) atan2pi(float y, float x) { 397 return atan2(y, x) / M_PI; 398} 399FN_FUNC_FN_FN(atan2pi) 400 401extern float __attribute__((overloadable)) SC_cbrtf(float); 402float __attribute__((overloadable)) cbrt(float v) { 403 return SC_cbrtf(v); 404} 405FN_FUNC_FN(cbrt) 406 407extern float __attribute__((overloadable)) SC_ceilf(float); 408float __attribute__((overloadable)) ceil(float v) { 409 return SC_ceilf(v); 410} 411FN_FUNC_FN(ceil) 412 413extern float __attribute__((overloadable)) SC_copysignf(float, float); 414float __attribute__((overloadable)) copysign(float v1, float v2) { 415 return SC_copysignf(v1, v2); 416} 417FN_FUNC_FN_FN(copysign) 418 419extern float __attribute__((overloadable)) SC_cosf(float); 420float __attribute__((overloadable)) cos(float v) { 421 return SC_cosf(v); 422} 423FN_FUNC_FN(cos) 424 425extern float __attribute__((overloadable)) SC_coshf(float); 426float __attribute__((overloadable)) cosh(float v) { 427 return SC_coshf(v); 428} 429FN_FUNC_FN(cosh) 430 431extern float __attribute__((overloadable)) cospi(float v) { 432 return cos(v * M_PI); 433} 434FN_FUNC_FN(cospi) 435 436extern float __attribute__((overloadable)) SC_erfcf(float); 437float __attribute__((overloadable)) erfc(float v) { 438 return SC_erfcf(v); 439} 440FN_FUNC_FN(erfc) 441 442extern float __attribute__((overloadable)) SC_erff(float); 443float __attribute__((overloadable)) erf(float v) { 444 return SC_erff(v); 445} 446FN_FUNC_FN(erf) 447 448extern float __attribute__((overloadable)) SC_expf(float); 449float __attribute__((overloadable)) exp(float v) { 450 return SC_expf(v); 451} 452FN_FUNC_FN(exp) 453 454extern float __attribute__((overloadable)) SC_exp2f(float); 455float __attribute__((overloadable)) exp2(float v) { 456 return SC_exp2f(v); 457} 458FN_FUNC_FN(exp2) 459 460extern float __attribute__((overloadable)) pow(float, float); 461 462extern float __attribute__((overloadable)) exp10(float v) { 463 return exp2(v * 3.321928095f); 464} 465FN_FUNC_FN(exp10) 466 467extern float __attribute__((overloadable)) SC_expm1f(float); 468float __attribute__((overloadable)) expm1(float v) { 469 return SC_expm1f(v); 470} 471FN_FUNC_FN(expm1) 472 473extern float __attribute__((overloadable)) fabs(float v) { 474 int i = *((int*)(void*)&v) & 0x7fffffff; 475 return *((float*)(void*)&i); 476} 477FN_FUNC_FN(fabs) 478 479extern float __attribute__((overloadable)) SC_fdimf(float, float); 480float __attribute__((overloadable)) fdim(float v1, float v2) { 481 return SC_fdimf(v1, v2); 482} 483FN_FUNC_FN_FN(fdim) 484 485extern float __attribute__((overloadable)) SC_floorf(float); 486float __attribute__((overloadable)) floor(float v) { 487 return SC_floorf(v); 488} 489FN_FUNC_FN(floor) 490 491extern float __attribute__((overloadable)) SC_fmaf(float, float, float); 492float __attribute__((overloadable)) fma(float v1, float v2, float v3) { 493 return SC_fmaf(v1, v2, v3); 494} 495FN_FUNC_FN_FN_FN(fma) 496 497extern float __attribute__((overloadable)) SC_fminf(float, float); 498 499extern float __attribute__((overloadable)) SC_fmodf(float, float); 500float __attribute__((overloadable)) fmod(float v1, float v2) { 501 return SC_fmodf(v1, v2); 502} 503FN_FUNC_FN_FN(fmod) 504 505extern float __attribute__((overloadable)) fract(float v, float *iptr) { 506 int i = (int)floor(v); 507 if (iptr) { 508 iptr[0] = i; 509 } 510 return fmin(v - i, 0x1.fffffep-1f); 511} 512FN_FUNC_FN_PFN(fract) 513 514extern float __attribute__((const, overloadable)) fract(float v) { 515 float unused; 516 return fract(v, &unused); 517} 518FN_FUNC_FN(fract) 519 520extern float __attribute__((overloadable)) SC_frexpf(float, int *); 521float __attribute__((overloadable)) frexp(float v1, int* v2) { 522 return SC_frexpf(v1, v2); 523} 524FN_FUNC_FN_PIN(frexp) 525 526extern float __attribute__((overloadable)) SC_hypotf(float, float); 527float __attribute__((overloadable)) hypot(float v1, float v2) { 528 return SC_hypotf(v1, v2); 529} 530FN_FUNC_FN_FN(hypot) 531 532extern int __attribute__((overloadable)) SC_ilogbf(float); 533int __attribute__((overloadable)) ilogb(float v) { 534 return SC_ilogbf(v); 535} 536IN_FUNC_FN(ilogb) 537 538extern float __attribute__((overloadable)) SC_ldexpf(float, int); 539float __attribute__((overloadable)) ldexp(float v1, int v2) { 540 return SC_ldexpf(v1, v2); 541} 542FN_FUNC_FN_IN(ldexp) 543FN_FUNC_FN_I(ldexp) 544 545extern float __attribute__((overloadable)) SC_lgammaf(float); 546float __attribute__((overloadable)) lgamma(float v) { 547 return SC_lgammaf(v); 548} 549FN_FUNC_FN(lgamma) 550extern float __attribute__((overloadable)) SC_lgammaf_r(float, int*); 551float __attribute__((overloadable)) lgamma(float v, int* ptr) { 552 return SC_lgammaf_r(v, ptr); 553} 554FN_FUNC_FN_PIN(lgamma) 555 556extern float __attribute__((overloadable)) SC_logf(float); 557float __attribute__((overloadable)) log(float v) { 558 return SC_logf(v); 559} 560FN_FUNC_FN(log) 561 562extern float __attribute__((overloadable)) SC_log10f(float); 563float __attribute__((overloadable)) log10(float v) { 564 return SC_log10f(v); 565} 566FN_FUNC_FN(log10) 567 568 569extern float __attribute__((overloadable)) log2(float v) { 570 return log10(v) * 3.321928095f; 571} 572FN_FUNC_FN(log2) 573 574extern float __attribute__((overloadable)) SC_log1pf(float); 575float __attribute__((overloadable)) log1p(float v) { 576 return SC_log1pf(v); 577} 578FN_FUNC_FN(log1p) 579 580extern float __attribute__((overloadable)) SC_logbf(float); 581float __attribute__((overloadable)) logb(float v) { 582 return SC_logbf(v); 583} 584FN_FUNC_FN(logb) 585 586extern float __attribute__((overloadable)) mad(float a, float b, float c) { 587 return a * b + c; 588} 589extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) { 590 return a * b + c; 591} 592extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) { 593 return a * b + c; 594} 595extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) { 596 return a * b + c; 597} 598 599extern float __attribute__((overloadable)) SC_modff(float, float *); 600float __attribute__((overloadable)) modf(float v1, float *v2) { 601 return SC_modff(v1, v2); 602} 603FN_FUNC_FN_PFN(modf); 604 605extern float __attribute__((overloadable)) nan(uint v) { 606 float f[1]; 607 uint32_t *ip = (uint32_t *)f; 608 *ip = v | 0x7fc00000; 609 return f[0]; 610} 611 612extern float __attribute__((overloadable)) SC_nextafterf(float, float); 613float __attribute__((overloadable)) nextafter(float v1, float v2) { 614 return SC_nextafterf(v1, v2); 615} 616FN_FUNC_FN_FN(nextafter) 617 618FN_FUNC_FN_FN(pow) 619 620extern float __attribute__((overloadable)) pown(float v, int p) { 621 /* The mantissa of a float has fewer bits than an int (24 effective vs. 31). 622 * For very large ints, we'll lose whether the exponent is even or odd, making 623 * the selection of a correct sign incorrect. We correct this. Use copysign 624 * to handle the negative zero case. 625 */ 626 float sign = (p & 0x1) ? copysign(1.f, v) : 1.f; 627 float f = pow(v, (float)p); 628 return copysign(f, sign); 629} 630FN_FUNC_FN_IN(pown) 631 632extern float __attribute__((overloadable)) powr(float v, float p) { 633 return pow(v, p); 634} 635extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) { 636 return pow(v, p); 637} 638extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) { 639 return pow(v, p); 640} 641extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) { 642 return pow(v, p); 643} 644 645extern float __attribute__((overloadable)) SC_remainderf(float, float); 646float __attribute__((overloadable)) remainder(float v1, float v2) { 647 return SC_remainderf(v1, v2); 648} 649FN_FUNC_FN_FN(remainder) 650 651extern float __attribute__((overloadable)) SC_remquof(float, float, int *); 652float __attribute__((overloadable)) remquo(float v1, float v2, int *v3) { 653 return SC_remquof(v1, v2, v3); 654} 655FN_FUNC_FN_FN_PIN(remquo) 656 657extern float __attribute__((overloadable)) SC_rintf(float); 658float __attribute__((overloadable)) rint(float v) { 659 return SC_rintf(v); 660} 661FN_FUNC_FN(rint) 662 663extern float __attribute__((overloadable)) rootn(float v, int r) { 664 if (r == 0) { 665 return posinf(); 666 } 667 668 if (iszero(v)) { 669 if (r < 0) { 670 if (r & 1) { 671 return copysign(posinf(), v); 672 } else { 673 return posinf(); 674 } 675 } else { 676 if (r & 1) { 677 return copysign(0.f, v); 678 } else { 679 return 0.f; 680 } 681 } 682 } 683 684 if (!isinf(v) && !isnan(v) && (v < 0.f)) { 685 if (r & 1) { 686 return (-1.f * pow(-1.f * v, 1.f / r)); 687 } else { 688 return nan(0); 689 } 690 } 691 692 return pow(v, 1.f / r); 693} 694FN_FUNC_FN_IN(rootn); 695 696extern float __attribute__((overloadable)) SC_roundf(float); 697float __attribute__((overloadable)) round(float v) { 698 return SC_roundf(v); 699} 700FN_FUNC_FN(round) 701 702extern float __attribute__((overloadable)) SC_randf2(float, float); 703float __attribute__((overloadable)) rsRand(float min, float max) { 704 return SC_randf2(min, max); 705} 706 707 708extern float __attribute__((overloadable)) rsqrt(float v) { 709 return 1.f / sqrt(v); 710} 711 712#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 713// These functions must be defined here if we are not using the SSE 714// implementation, which includes when we are built as part of the 715// debug runtime (libclcore_debug.bc). 716FN_FUNC_FN(sqrt) 717#else 718extern float2 __attribute__((overloadable)) sqrt(float2); 719extern float3 __attribute__((overloadable)) sqrt(float3); 720extern float4 __attribute__((overloadable)) sqrt(float4); 721#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 722 723FN_FUNC_FN(rsqrt) 724 725extern float __attribute__((overloadable)) SC_sinf(float); 726float __attribute__((overloadable)) sin(float v) { 727 return SC_sinf(v); 728} 729FN_FUNC_FN(sin) 730 731extern float __attribute__((overloadable)) sincos(float v, float *cosptr) { 732 *cosptr = cos(v); 733 return sin(v); 734} 735extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) { 736 *cosptr = cos(v); 737 return sin(v); 738} 739extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) { 740 *cosptr = cos(v); 741 return sin(v); 742} 743extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) { 744 *cosptr = cos(v); 745 return sin(v); 746} 747 748extern float __attribute__((overloadable)) SC_sinhf(float); 749float __attribute__((overloadable)) sinh(float v) { 750 return SC_sinhf(v); 751} 752FN_FUNC_FN(sinh) 753 754extern float __attribute__((overloadable)) sinpi(float v) { 755 return sin(v * M_PI); 756} 757FN_FUNC_FN(sinpi) 758 759extern float __attribute__((overloadable)) SC_tanf(float); 760float __attribute__((overloadable)) tan(float v) { 761 return SC_tanf(v); 762} 763FN_FUNC_FN(tan) 764 765extern float __attribute__((overloadable)) SC_tanhf(float); 766float __attribute__((overloadable)) tanh(float v) { 767 return SC_tanhf(v); 768} 769FN_FUNC_FN(tanh) 770 771extern float __attribute__((overloadable)) tanpi(float v) { 772 return tan(v * M_PI); 773} 774FN_FUNC_FN(tanpi) 775 776 777extern float __attribute__((overloadable)) SC_tgammaf(float); 778float __attribute__((overloadable)) tgamma(float v) { 779 return SC_tgammaf(v); 780} 781FN_FUNC_FN(tgamma) 782 783extern float __attribute__((overloadable)) SC_truncf(float); 784float __attribute__((overloadable)) trunc(float v) { 785 return SC_truncf(v); 786} 787FN_FUNC_FN(trunc) 788 789// Int ops (partial), 6.11.3 790 791#define XN_FUNC_YN(typeout, fnc, typein) \ 792extern typeout __attribute__((overloadable)) fnc(typein); \ 793extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) { \ 794 typeout##2 r; \ 795 r.x = fnc(v.x); \ 796 r.y = fnc(v.y); \ 797 return r; \ 798} \ 799extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) { \ 800 typeout##3 r; \ 801 r.x = fnc(v.x); \ 802 r.y = fnc(v.y); \ 803 r.z = fnc(v.z); \ 804 return r; \ 805} \ 806extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) { \ 807 typeout##4 r; \ 808 r.x = fnc(v.x); \ 809 r.y = fnc(v.y); \ 810 r.z = fnc(v.z); \ 811 r.w = fnc(v.w); \ 812 return r; \ 813} 814 815 816#define UIN_FUNC_IN(fnc) \ 817XN_FUNC_YN(uchar, fnc, char) \ 818XN_FUNC_YN(ushort, fnc, short) \ 819XN_FUNC_YN(uint, fnc, int) 820 821#define IN_FUNC_IN(fnc) \ 822XN_FUNC_YN(uchar, fnc, uchar) \ 823XN_FUNC_YN(char, fnc, char) \ 824XN_FUNC_YN(ushort, fnc, ushort) \ 825XN_FUNC_YN(short, fnc, short) \ 826XN_FUNC_YN(uint, fnc, uint) \ 827XN_FUNC_YN(int, fnc, int) 828 829 830#define XN_FUNC_XN_XN_BODY(type, fnc, body) \ 831extern type __attribute__((overloadable)) \ 832 fnc(type v1, type v2) { \ 833 return body; \ 834} \ 835extern type##2 __attribute__((overloadable)) \ 836 fnc(type##2 v1, type##2 v2) { \ 837 type##2 r; \ 838 r.x = fnc(v1.x, v2.x); \ 839 r.y = fnc(v1.y, v2.y); \ 840 return r; \ 841} \ 842extern type##3 __attribute__((overloadable)) \ 843 fnc(type##3 v1, type##3 v2) { \ 844 type##3 r; \ 845 r.x = fnc(v1.x, v2.x); \ 846 r.y = fnc(v1.y, v2.y); \ 847 r.z = fnc(v1.z, v2.z); \ 848 return r; \ 849} \ 850extern type##4 __attribute__((overloadable)) \ 851 fnc(type##4 v1, type##4 v2) { \ 852 type##4 r; \ 853 r.x = fnc(v1.x, v2.x); \ 854 r.y = fnc(v1.y, v2.y); \ 855 r.z = fnc(v1.z, v2.z); \ 856 r.w = fnc(v1.w, v2.w); \ 857 return r; \ 858} 859 860#define IN_FUNC_IN_IN_BODY(fnc, body) \ 861XN_FUNC_XN_XN_BODY(uchar, fnc, body) \ 862XN_FUNC_XN_XN_BODY(char, fnc, body) \ 863XN_FUNC_XN_XN_BODY(ushort, fnc, body) \ 864XN_FUNC_XN_XN_BODY(short, fnc, body) \ 865XN_FUNC_XN_XN_BODY(uint, fnc, body) \ 866XN_FUNC_XN_XN_BODY(int, fnc, body) \ 867XN_FUNC_XN_XN_BODY(float, fnc, body) 868 869 870/** 871 * abs 872 */ 873extern uint32_t __attribute__((overloadable)) abs(int32_t v) { 874 if (v < 0) 875 return -v; 876 return v; 877} 878extern uint16_t __attribute__((overloadable)) abs(int16_t v) { 879 if (v < 0) 880 return -v; 881 return v; 882} 883extern uint8_t __attribute__((overloadable)) abs(int8_t v) { 884 if (v < 0) 885 return -v; 886 return v; 887} 888 889/** 890 * clz 891 * __builtin_clz only accepts a 32-bit unsigned int, so every input will be 892 * expanded to 32 bits. For our smaller data types, we need to subtract off 893 * these unused top bits (that will be always be composed of zeros). 894 */ 895extern uint32_t __attribute__((overloadable)) clz(uint32_t v) { 896 return __builtin_clz(v); 897} 898extern uint16_t __attribute__((overloadable)) clz(uint16_t v) { 899 return __builtin_clz(v) - 16; 900} 901extern uint8_t __attribute__((overloadable)) clz(uint8_t v) { 902 return __builtin_clz(v) - 24; 903} 904extern int32_t __attribute__((overloadable)) clz(int32_t v) { 905 return __builtin_clz(v); 906} 907extern int16_t __attribute__((overloadable)) clz(int16_t v) { 908 return __builtin_clz(((uint32_t)v) & 0x0000ffff) - 16; 909} 910extern int8_t __attribute__((overloadable)) clz(int8_t v) { 911 return __builtin_clz(((uint32_t)v) & 0x000000ff) - 24; 912} 913 914 915UIN_FUNC_IN(abs) 916IN_FUNC_IN(clz) 917 918 919// 6.11.4 920 921 922extern float __attribute__((overloadable)) degrees(float radians) { 923 return radians * (180.f / M_PI); 924} 925extern float2 __attribute__((overloadable)) degrees(float2 radians) { 926 return radians * (180.f / M_PI); 927} 928extern float3 __attribute__((overloadable)) degrees(float3 radians) { 929 return radians * (180.f / M_PI); 930} 931extern float4 __attribute__((overloadable)) degrees(float4 radians) { 932 return radians * (180.f / M_PI); 933} 934 935extern float __attribute__((overloadable)) mix(float start, float stop, float amount) { 936 return start + (stop - start) * amount; 937} 938extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) { 939 return start + (stop - start) * amount; 940} 941extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) { 942 return start + (stop - start) * amount; 943} 944extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) { 945 return start + (stop - start) * amount; 946} 947extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) { 948 return start + (stop - start) * amount; 949} 950extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) { 951 return start + (stop - start) * amount; 952} 953extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) { 954 return start + (stop - start) * amount; 955} 956 957extern float __attribute__((overloadable)) radians(float degrees) { 958 return degrees * (M_PI / 180.f); 959} 960extern float2 __attribute__((overloadable)) radians(float2 degrees) { 961 return degrees * (M_PI / 180.f); 962} 963extern float3 __attribute__((overloadable)) radians(float3 degrees) { 964 return degrees * (M_PI / 180.f); 965} 966extern float4 __attribute__((overloadable)) radians(float4 degrees) { 967 return degrees * (M_PI / 180.f); 968} 969 970extern float __attribute__((overloadable)) step(float edge, float v) { 971 return (v < edge) ? 0.f : 1.f; 972} 973extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) { 974 float2 r; 975 r.x = (v.x < edge.x) ? 0.f : 1.f; 976 r.y = (v.y < edge.y) ? 0.f : 1.f; 977 return r; 978} 979extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) { 980 float3 r; 981 r.x = (v.x < edge.x) ? 0.f : 1.f; 982 r.y = (v.y < edge.y) ? 0.f : 1.f; 983 r.z = (v.z < edge.z) ? 0.f : 1.f; 984 return r; 985} 986extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) { 987 float4 r; 988 r.x = (v.x < edge.x) ? 0.f : 1.f; 989 r.y = (v.y < edge.y) ? 0.f : 1.f; 990 r.z = (v.z < edge.z) ? 0.f : 1.f; 991 r.w = (v.w < edge.w) ? 0.f : 1.f; 992 return r; 993} 994extern float2 __attribute__((overloadable)) step(float2 edge, float v) { 995 float2 r; 996 r.x = (v < edge.x) ? 0.f : 1.f; 997 r.y = (v < edge.y) ? 0.f : 1.f; 998 return r; 999} 1000extern float3 __attribute__((overloadable)) step(float3 edge, float v) { 1001 float3 r; 1002 r.x = (v < edge.x) ? 0.f : 1.f; 1003 r.y = (v < edge.y) ? 0.f : 1.f; 1004 r.z = (v < edge.z) ? 0.f : 1.f; 1005 return r; 1006} 1007extern float4 __attribute__((overloadable)) step(float4 edge, float v) { 1008 float4 r; 1009 r.x = (v < edge.x) ? 0.f : 1.f; 1010 r.y = (v < edge.y) ? 0.f : 1.f; 1011 r.z = (v < edge.z) ? 0.f : 1.f; 1012 r.w = (v < edge.w) ? 0.f : 1.f; 1013 return r; 1014} 1015extern float2 __attribute__((overloadable)) step(float edge, float2 v) { 1016 float2 r; 1017 r.x = (v.x < edge) ? 0.f : 1.f; 1018 r.y = (v.y < edge) ? 0.f : 1.f; 1019 return r; 1020} 1021extern float3 __attribute__((overloadable)) step(float edge, float3 v) { 1022 float3 r; 1023 r.x = (v.x < edge) ? 0.f : 1.f; 1024 r.y = (v.y < edge) ? 0.f : 1.f; 1025 r.z = (v.z < edge) ? 0.f : 1.f; 1026 return r; 1027} 1028extern float4 __attribute__((overloadable)) step(float edge, float4 v) { 1029 float4 r; 1030 r.x = (v.x < edge) ? 0.f : 1.f; 1031 r.y = (v.y < edge) ? 0.f : 1.f; 1032 r.z = (v.z < edge) ? 0.f : 1.f; 1033 r.w = (v.w < edge) ? 0.f : 1.f; 1034 return r; 1035} 1036 1037extern float __attribute__((overloadable)) sign(float v) { 1038 if (v > 0) return 1.f; 1039 if (v < 0) return -1.f; 1040 return v; 1041} 1042FN_FUNC_FN(sign) 1043 1044 1045// 6.11.5 1046extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) { 1047 float3 r; 1048 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1049 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1050 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1051 return r; 1052} 1053 1054extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) { 1055 float4 r; 1056 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1057 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1058 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1059 r.w = 0.f; 1060 return r; 1061} 1062 1063#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 1064// These functions must be defined here if we are not using the SSE 1065// implementation, which includes when we are built as part of the 1066// debug runtime (libclcore_debug.bc). 1067 1068extern float __attribute__((overloadable)) dot(float lhs, float rhs) { 1069 return lhs * rhs; 1070} 1071extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) { 1072 return lhs.x*rhs.x + lhs.y*rhs.y; 1073} 1074extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) { 1075 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z; 1076} 1077extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) { 1078 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w; 1079} 1080 1081extern float __attribute__((overloadable)) length(float v) { 1082 return fabs(v); 1083} 1084extern float __attribute__((overloadable)) length(float2 v) { 1085 return sqrt(v.x*v.x + v.y*v.y); 1086} 1087extern float __attribute__((overloadable)) length(float3 v) { 1088 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1089} 1090extern float __attribute__((overloadable)) length(float4 v) { 1091 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1092} 1093 1094#else 1095 1096extern float __attribute__((overloadable)) length(float v); 1097extern float __attribute__((overloadable)) length(float2 v); 1098extern float __attribute__((overloadable)) length(float3 v); 1099extern float __attribute__((overloadable)) length(float4 v); 1100 1101#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 1102 1103extern float __attribute__((overloadable)) distance(float lhs, float rhs) { 1104 return length(lhs - rhs); 1105} 1106extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) { 1107 return length(lhs - rhs); 1108} 1109extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) { 1110 return length(lhs - rhs); 1111} 1112extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) { 1113 return length(lhs - rhs); 1114} 1115 1116/* For the normalization functions, vectors of length 0 should simply be 1117 * returned (i.e. all the components of that vector are 0). 1118 */ 1119extern float __attribute__((overloadable)) normalize(float v) { 1120 if (v == 0.0f) { 1121 return 0.0f; 1122 } else if (v < 0.0f) { 1123 return -1.0f; 1124 } else { 1125 return 1.0f; 1126 } 1127} 1128extern float2 __attribute__((overloadable)) normalize(float2 v) { 1129 float l = length(v); 1130 return l == 0.0f ? v : v / l; 1131} 1132extern float3 __attribute__((overloadable)) normalize(float3 v) { 1133 float l = length(v); 1134 return l == 0.0f ? v : v / l; 1135} 1136extern float4 __attribute__((overloadable)) normalize(float4 v) { 1137 float l = length(v); 1138 return l == 0.0f ? v : v / l; 1139} 1140 1141extern float __attribute__((overloadable)) half_sqrt(float v) { 1142 return sqrt(v); 1143} 1144FN_FUNC_FN(half_sqrt) 1145 1146extern float __attribute__((overloadable)) fast_length(float v) { 1147 return fabs(v); 1148} 1149extern float __attribute__((overloadable)) fast_length(float2 v) { 1150 return half_sqrt(v.x*v.x + v.y*v.y); 1151} 1152extern float __attribute__((overloadable)) fast_length(float3 v) { 1153 return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1154} 1155extern float __attribute__((overloadable)) fast_length(float4 v) { 1156 return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1157} 1158 1159extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) { 1160 return fast_length(lhs - rhs); 1161} 1162extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) { 1163 return fast_length(lhs - rhs); 1164} 1165extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) { 1166 return fast_length(lhs - rhs); 1167} 1168extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) { 1169 return fast_length(lhs - rhs); 1170} 1171 1172extern float __attribute__((overloadable)) half_rsqrt(float); 1173 1174/* For the normalization functions, vectors of length 0 should simply be 1175 * returned (i.e. all the components of that vector are 0). 1176 */ 1177extern float __attribute__((overloadable)) fast_normalize(float v) { 1178 if (v == 0.0f) { 1179 return 0.0f; 1180 } else if (v < 0.0f) { 1181 return -1.0f; 1182 } else { 1183 return 1.0f; 1184 } 1185} 1186// If the length is 0, then rlength should be NaN. 1187extern float2 __attribute__((overloadable)) fast_normalize(float2 v) { 1188 float rlength = half_rsqrt(v.x*v.x + v.y*v.y); 1189 return (rlength == rlength) ? v * rlength : v; 1190} 1191extern float3 __attribute__((overloadable)) fast_normalize(float3 v) { 1192 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1193 return (rlength == rlength) ? v * rlength : v; 1194} 1195extern float4 __attribute__((overloadable)) fast_normalize(float4 v) { 1196 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1197 return (rlength == rlength) ? v * rlength : v; 1198} 1199 1200extern float __attribute__((overloadable)) half_recip(float v) { 1201 return 1.f / v; 1202} 1203 1204/* 1205extern float __attribute__((overloadable)) approx_atan(float x) { 1206 if (x == 0.f) 1207 return 0.f; 1208 if (x < 0.f) 1209 return -1.f * approx_atan(-1.f * x); 1210 if (x > 1.f) 1211 return M_PI_2 - approx_atan(approx_recip(x)); 1212 return x * approx_recip(1.f + 0.28f * x*x); 1213} 1214FN_FUNC_FN(approx_atan) 1215*/ 1216 1217typedef union 1218{ 1219 float fv; 1220 int32_t iv; 1221} ieee_float_shape_type; 1222 1223/* Get a 32 bit int from a float. */ 1224 1225#define GET_FLOAT_WORD(i,d) \ 1226do { \ 1227 ieee_float_shape_type gf_u; \ 1228 gf_u.fv = (d); \ 1229 (i) = gf_u.iv; \ 1230} while (0) 1231 1232/* Set a float from a 32 bit int. */ 1233 1234#define SET_FLOAT_WORD(d,i) \ 1235do { \ 1236 ieee_float_shape_type sf_u; \ 1237 sf_u.iv = (i); \ 1238 (d) = sf_u.fv; \ 1239} while (0) 1240 1241 1242 1243// Valid -125 to 125 1244extern float __attribute__((overloadable)) native_exp2(float v) { 1245 int32_t iv = (int)v; 1246 int32_t x = iv + (iv >> 31); // ~floor(v) 1247 float r = (v - x); 1248 1249 float fo; 1250 SET_FLOAT_WORD(fo, (x + 127) << 23); 1251 1252 r *= 0.694f; // ~ log(e) / log(2) 1253 float r2 = r*r; 1254 float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1255 return fo * adj; 1256} 1257 1258extern float2 __attribute__((overloadable)) native_exp2(float2 v) { 1259 int2 iv = convert_int2(v); 1260 int2 x = iv + (iv >> (int2)31);//floor(v); 1261 float2 r = (v - convert_float2(x)); 1262 1263 x += 127; 1264 1265 float2 fo = (float2)(x << (int2)23); 1266 1267 r *= 0.694f; // ~ log(e) / log(2) 1268 float2 r2 = r*r; 1269 float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1270 return fo * adj; 1271} 1272 1273extern float4 __attribute__((overloadable)) native_exp2(float4 v) { 1274 int4 iv = convert_int4(v); 1275 int4 x = iv + (iv >> (int4)31);//floor(v); 1276 float4 r = (v - convert_float4(x)); 1277 1278 x += 127; 1279 1280 float4 fo = (float4)(x << (int4)23); 1281 1282 r *= 0.694f; // ~ log(e) / log(2) 1283 float4 r2 = r*r; 1284 float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1285 return fo * adj; 1286} 1287 1288extern float3 __attribute__((overloadable)) native_exp2(float3 v) { 1289 float4 t = 1.f; 1290 t.xyz = v; 1291 return native_exp2(t).xyz; 1292} 1293 1294 1295extern float __attribute__((overloadable)) native_exp(float v) { 1296 return native_exp2(v * 1.442695041f); 1297} 1298extern float2 __attribute__((overloadable)) native_exp(float2 v) { 1299 return native_exp2(v * 1.442695041f); 1300} 1301extern float3 __attribute__((overloadable)) native_exp(float3 v) { 1302 return native_exp2(v * 1.442695041f); 1303} 1304extern float4 __attribute__((overloadable)) native_exp(float4 v) { 1305 return native_exp2(v * 1.442695041f); 1306} 1307 1308extern float __attribute__((overloadable)) native_exp10(float v) { 1309 return native_exp2(v * 3.321928095f); 1310} 1311extern float2 __attribute__((overloadable)) native_exp10(float2 v) { 1312 return native_exp2(v * 3.321928095f); 1313} 1314extern float3 __attribute__((overloadable)) native_exp10(float3 v) { 1315 return native_exp2(v * 3.321928095f); 1316} 1317extern float4 __attribute__((overloadable)) native_exp10(float4 v) { 1318 return native_exp2(v * 3.321928095f); 1319} 1320 1321extern float __attribute__((overloadable)) native_log2(float v) { 1322 int32_t ibits; 1323 GET_FLOAT_WORD(ibits, v); 1324 1325 int32_t e = (ibits >> 23) & 0xff; 1326 1327 ibits &= 0x7fffff; 1328 ibits |= 127 << 23; 1329 1330 float ir; 1331 SET_FLOAT_WORD(ir, ibits); 1332 ir -= 1.5f; 1333 float ir2 = ir*ir; 1334 float adj2 = (0.405465108f / 0.693147181f) + 1335 ((0.666666667f / 0.693147181f) * ir) - 1336 ((0.222222222f / 0.693147181f) * ir2) + 1337 ((0.098765432f / 0.693147181f) * ir*ir2) - 1338 ((0.049382716f / 0.693147181f) * ir2*ir2) + 1339 ((0.026337449f / 0.693147181f) * ir*ir2*ir2) - 1340 ((0.014631916f / 0.693147181f) * ir2*ir2*ir2); 1341 return (float)(e - 127) + adj2; 1342} 1343extern float2 __attribute__((overloadable)) native_log2(float2 v) { 1344 float2 v2 = {native_log2(v.x), native_log2(v.y)}; 1345 return v2; 1346} 1347extern float3 __attribute__((overloadable)) native_log2(float3 v) { 1348 float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)}; 1349 return v2; 1350} 1351extern float4 __attribute__((overloadable)) native_log2(float4 v) { 1352 float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)}; 1353 return v2; 1354} 1355 1356extern float __attribute__((overloadable)) native_log(float v) { 1357 return native_log2(v) * (1.f / 1.442695041f); 1358} 1359extern float2 __attribute__((overloadable)) native_log(float2 v) { 1360 return native_log2(v) * (1.f / 1.442695041f); 1361} 1362extern float3 __attribute__((overloadable)) native_log(float3 v) { 1363 return native_log2(v) * (1.f / 1.442695041f); 1364} 1365extern float4 __attribute__((overloadable)) native_log(float4 v) { 1366 return native_log2(v) * (1.f / 1.442695041f); 1367} 1368 1369extern float __attribute__((overloadable)) native_log10(float v) { 1370 return native_log2(v) * (1.f / 3.321928095f); 1371} 1372extern float2 __attribute__((overloadable)) native_log10(float2 v) { 1373 return native_log2(v) * (1.f / 3.321928095f); 1374} 1375extern float3 __attribute__((overloadable)) native_log10(float3 v) { 1376 return native_log2(v) * (1.f / 3.321928095f); 1377} 1378extern float4 __attribute__((overloadable)) native_log10(float4 v) { 1379 return native_log2(v) * (1.f / 3.321928095f); 1380} 1381 1382 1383extern float __attribute__((overloadable)) native_powr(float v, float y) { 1384 float v2 = native_log2(v); 1385 v2 = fmax(v2 * y, -125.f); 1386 return native_exp2(v2); 1387} 1388extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) { 1389 float2 v2 = native_log2(v); 1390 v2 = fmax(v2 * y, -125.f); 1391 return native_exp2(v2); 1392} 1393extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) { 1394 float3 v2 = native_log2(v); 1395 v2 = fmax(v2 * y, -125.f); 1396 return native_exp2(v2); 1397} 1398extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) { 1399 float4 v2 = native_log2(v); 1400 v2 = fmax(v2 * y, -125.f); 1401 return native_exp2(v2); 1402} 1403 1404extern double __attribute__((overloadable)) min(double v1, double v2) { 1405 return v1 < v2 ? v1 : v2; 1406} 1407 1408extern double2 __attribute__((overloadable)) min(double2 v1, double2 v2) { 1409 double2 r; 1410 r.x = v1.x < v2.x ? v1.x : v2.x; 1411 r.y = v1.y < v2.y ? v1.y : v2.y; 1412 return r; 1413} 1414 1415extern double3 __attribute__((overloadable)) min(double3 v1, double3 v2) { 1416 double3 r; 1417 r.x = v1.x < v2.x ? v1.x : v2.x; 1418 r.y = v1.y < v2.y ? v1.y : v2.y; 1419 r.z = v1.z < v2.z ? v1.z : v2.z; 1420 return r; 1421} 1422 1423extern double4 __attribute__((overloadable)) min(double4 v1, double4 v2) { 1424 double4 r; 1425 r.x = v1.x < v2.x ? v1.x : v2.x; 1426 r.y = v1.y < v2.y ? v1.y : v2.y; 1427 r.z = v1.z < v2.z ? v1.z : v2.z; 1428 r.w = v1.w < v2.w ? v1.w : v2.w; 1429 return r; 1430} 1431 1432extern long __attribute__((overloadable)) min(long v1, long v2) { 1433 return v1 < v2 ? v1 : v2; 1434} 1435extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) { 1436 long2 r; 1437 r.x = v1.x < v2.x ? v1.x : v2.x; 1438 r.y = v1.y < v2.y ? v1.y : v2.y; 1439 return r; 1440} 1441extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) { 1442 long3 r; 1443 r.x = v1.x < v2.x ? v1.x : v2.x; 1444 r.y = v1.y < v2.y ? v1.y : v2.y; 1445 r.z = v1.z < v2.z ? v1.z : v2.z; 1446 return r; 1447} 1448extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) { 1449 long4 r; 1450 r.x = v1.x < v2.x ? v1.x : v2.x; 1451 r.y = v1.y < v2.y ? v1.y : v2.y; 1452 r.z = v1.z < v2.z ? v1.z : v2.z; 1453 r.w = v1.w < v2.w ? v1.w : v2.w; 1454 return r; 1455} 1456 1457extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) { 1458 return v1 < v2 ? v1 : v2; 1459} 1460extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) { 1461 ulong2 r; 1462 r.x = v1.x < v2.x ? v1.x : v2.x; 1463 r.y = v1.y < v2.y ? v1.y : v2.y; 1464 return r; 1465} 1466extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) { 1467 ulong3 r; 1468 r.x = v1.x < v2.x ? v1.x : v2.x; 1469 r.y = v1.y < v2.y ? v1.y : v2.y; 1470 r.z = v1.z < v2.z ? v1.z : v2.z; 1471 return r; 1472} 1473extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) { 1474 ulong4 r; 1475 r.x = v1.x < v2.x ? v1.x : v2.x; 1476 r.y = v1.y < v2.y ? v1.y : v2.y; 1477 r.z = v1.z < v2.z ? v1.z : v2.z; 1478 r.w = v1.w < v2.w ? v1.w : v2.w; 1479 return r; 1480} 1481 1482extern double __attribute__((overloadable)) max(double v1, double v2) { 1483 return v1 > v2 ? v1 : v2; 1484} 1485 1486extern double2 __attribute__((overloadable)) max(double2 v1, double2 v2) { 1487 double2 r; 1488 r.x = v1.x > v2.x ? v1.x : v2.x; 1489 r.y = v1.y > v2.y ? v1.y : v2.y; 1490 return r; 1491} 1492 1493extern double3 __attribute__((overloadable)) max(double3 v1, double3 v2) { 1494 double3 r; 1495 r.x = v1.x > v2.x ? v1.x : v2.x; 1496 r.y = v1.y > v2.y ? v1.y : v2.y; 1497 r.z = v1.z > v2.z ? v1.z : v2.z; 1498 return r; 1499} 1500 1501extern double4 __attribute__((overloadable)) max(double4 v1, double4 v2) { 1502 double4 r; 1503 r.x = v1.x > v2.x ? v1.x : v2.x; 1504 r.y = v1.y > v2.y ? v1.y : v2.y; 1505 r.z = v1.z > v2.z ? v1.z : v2.z; 1506 r.w = v1.w > v2.w ? v1.w : v2.w; 1507 return r; 1508} 1509 1510extern long __attribute__((overloadable)) max(long v1, long v2) { 1511 return v1 > v2 ? v1 : v2; 1512} 1513extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) { 1514 long2 r; 1515 r.x = v1.x > v2.x ? v1.x : v2.x; 1516 r.y = v1.y > v2.y ? v1.y : v2.y; 1517 return r; 1518} 1519extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) { 1520 long3 r; 1521 r.x = v1.x > v2.x ? v1.x : v2.x; 1522 r.y = v1.y > v2.y ? v1.y : v2.y; 1523 r.z = v1.z > v2.z ? v1.z : v2.z; 1524 return r; 1525} 1526extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) { 1527 long4 r; 1528 r.x = v1.x > v2.x ? v1.x : v2.x; 1529 r.y = v1.y > v2.y ? v1.y : v2.y; 1530 r.z = v1.z > v2.z ? v1.z : v2.z; 1531 r.w = v1.w > v2.w ? v1.w : v2.w; 1532 return r; 1533} 1534 1535extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) { 1536 return v1 > v2 ? v1 : v2; 1537} 1538extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) { 1539 ulong2 r; 1540 r.x = v1.x > v2.x ? v1.x : v2.x; 1541 r.y = v1.y > v2.y ? v1.y : v2.y; 1542 return r; 1543} 1544extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) { 1545 ulong3 r; 1546 r.x = v1.x > v2.x ? v1.x : v2.x; 1547 r.y = v1.y > v2.y ? v1.y : v2.y; 1548 r.z = v1.z > v2.z ? v1.z : v2.z; 1549 return r; 1550} 1551extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) { 1552 ulong4 r; 1553 r.x = v1.x > v2.x ? v1.x : v2.x; 1554 r.y = v1.y > v2.y ? v1.y : v2.y; 1555 r.z = v1.z > v2.z ? v1.z : v2.z; 1556 r.w = v1.w > v2.w ? v1.w : v2.w; 1557 return r; 1558} 1559 1560#define THUNK_NATIVE_F(fn) \ 1561 float __attribute__((overloadable)) native_##fn(float v) { return fn(v);} \ 1562 float2 __attribute__((overloadable)) native_##fn(float2 v) { return fn(v);} \ 1563 float3 __attribute__((overloadable)) native_##fn(float3 v) { return fn(v);} \ 1564 float4 __attribute__((overloadable)) native_##fn(float4 v) { return fn(v);} 1565 1566#define THUNK_NATIVE_F_F(fn) \ 1567 float __attribute__((overloadable)) native_##fn(float v1, float v2) { return fn(v1, v2);} \ 1568 float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 v2) { return fn(v1, v2);} \ 1569 float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 v2) { return fn(v1, v2);} \ 1570 float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 v2) { return fn(v1, v2);} 1571 1572#define THUNK_NATIVE_F_FP(fn) \ 1573 float __attribute__((overloadable)) native_##fn(float v1, float *v2) { return fn(v1, v2);} \ 1574 float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 *v2) { return fn(v1, v2);} \ 1575 float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 *v2) { return fn(v1, v2);} \ 1576 float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 *v2) { return fn(v1, v2);} 1577 1578#define THUNK_NATIVE_F_I(fn) \ 1579 float __attribute__((overloadable)) native_##fn(float v1, int v2) { return fn(v1, v2);} \ 1580 float2 __attribute__((overloadable)) native_##fn(float2 v1, int2 v2) { return fn(v1, v2);} \ 1581 float3 __attribute__((overloadable)) native_##fn(float3 v1, int3 v2) { return fn(v1, v2);} \ 1582 float4 __attribute__((overloadable)) native_##fn(float4 v1, int4 v2) { return fn(v1, v2);} 1583 1584THUNK_NATIVE_F(acos) 1585THUNK_NATIVE_F(acosh) 1586THUNK_NATIVE_F(acospi) 1587THUNK_NATIVE_F(asin) 1588THUNK_NATIVE_F(asinh) 1589THUNK_NATIVE_F(asinpi) 1590THUNK_NATIVE_F(atan) 1591THUNK_NATIVE_F_F(atan2) 1592THUNK_NATIVE_F(atanh) 1593THUNK_NATIVE_F(atanpi) 1594THUNK_NATIVE_F_F(atan2pi) 1595THUNK_NATIVE_F(cbrt) 1596THUNK_NATIVE_F(cos) 1597THUNK_NATIVE_F(cosh) 1598THUNK_NATIVE_F(cospi) 1599THUNK_NATIVE_F(expm1) 1600THUNK_NATIVE_F_F(hypot) 1601THUNK_NATIVE_F(log1p) 1602THUNK_NATIVE_F_I(rootn) 1603THUNK_NATIVE_F(rsqrt) 1604THUNK_NATIVE_F(sqrt) 1605THUNK_NATIVE_F(sin) 1606THUNK_NATIVE_F_FP(sincos) 1607THUNK_NATIVE_F(sinh) 1608THUNK_NATIVE_F(sinpi) 1609THUNK_NATIVE_F(tan) 1610THUNK_NATIVE_F(tanh) 1611THUNK_NATIVE_F(tanpi) 1612 1613#undef THUNK_NATIVE_F 1614#undef THUNK_NATIVE_F_F 1615#undef THUNK_NATIVE_F_I 1616#undef THUNK_NATIVE_F_FP 1617 1618float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);} 1619float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);} 1620float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);} 1621float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);} 1622 1623float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);} 1624float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);} 1625float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);} 1626float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);} 1627 1628float __attribute__((overloadable)) native_length(float v) { return fast_length(v);} 1629float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);} 1630float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);} 1631float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);} 1632 1633float __attribute__((overloadable)) native_divide(float v1, float v2) { return v1 / v2;} 1634float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;} 1635float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;} 1636float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;} 1637 1638float __attribute__((overloadable)) native_recip(float v) { return 1.f / v;} 1639float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;} 1640float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;} 1641float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;} 1642 1643 1644 1645 1646 1647#undef FN_FUNC_FN 1648#undef IN_FUNC_FN 1649#undef FN_FUNC_FN_FN 1650#undef FN_FUNC_FN_F 1651#undef FN_FUNC_FN_IN 1652#undef FN_FUNC_FN_I 1653#undef FN_FUNC_FN_PFN 1654#undef FN_FUNC_FN_PIN 1655#undef FN_FUNC_FN_FN_FN 1656#undef FN_FUNC_FN_FN_PIN 1657#undef XN_FUNC_YN 1658#undef UIN_FUNC_IN 1659#undef IN_FUNC_IN 1660#undef XN_FUNC_XN_XN_BODY 1661#undef IN_FUNC_IN_IN_BODY 1662 1663typedef union { 1664 half hval; 1665 short sval; 1666} fp16_shape_type; 1667 1668/* half h = unsigned short s; */ 1669#define SET_HALF_WORD(h, s) \ 1670do { \ 1671 fp16_shape_type fp16_u; \ 1672 fp16_u.sval = (s); \ 1673 (h) = fp16_u.hval; \ 1674} while (0) 1675 1676static const unsigned short kHalfPositiveInfinity = 0x7c00; 1677 1678/* Define f16 functions of the form 1679 * HN output = fn(HN input) 1680 * where HN is scalar or vector half type 1681 */ 1682#define HN_FUNC_HN(fn) \ 1683extern half __attribute__((overloadable)) fn(half h) { \ 1684 return (half) fn((float) h); \ 1685} \ 1686extern half2 __attribute__((overloadable)) fn(half2 v) { \ 1687 return convert_half2(fn(convert_float2(v))); \ 1688} \ 1689extern half3 __attribute__((overloadable)) fn(half3 v) { \ 1690 return convert_half3(fn(convert_float3(v))); \ 1691} \ 1692extern half4 __attribute__((overloadable)) fn(half4 v) { \ 1693 return convert_half4(fn(convert_float4(v))); \ 1694} 1695 1696/* Define f16 functions of the form 1697 * HN output = fn(HN input1, HN input2) 1698 * where HN is scalar or vector half type 1699 */ 1700#define HN_FUNC_HN_HN(fn) \ 1701extern half __attribute__((overloadable)) fn(half h1, half h2) { \ 1702 return (half) fn((float) h1, (float) h2); \ 1703} \ 1704extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1705 return convert_half2(fn(convert_float2(v1), \ 1706 convert_float2(v2))); \ 1707} \ 1708extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1709 return convert_half3(fn(convert_float3(v1), \ 1710 convert_float3(v2))); \ 1711} \ 1712extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1713 return convert_half4(fn(convert_float4(v1), \ 1714 convert_float4(v2))); \ 1715} 1716 1717/* Define f16 functions of the form 1718 * HN output = fn(HN input1, half input2) 1719 * where HN is scalar or vector half type 1720 */ 1721#define HN_FUNC_HN_H(fn) \ 1722extern half2 __attribute__((overloadable)) fn(half2 v1, half v2) { \ 1723 return convert_half2(fn(convert_float2(v1), (float) v2)); \ 1724} \ 1725extern half3 __attribute__((overloadable)) fn(half3 v1, half v2) { \ 1726 return convert_half3(fn(convert_float3(v1), (float) v2)); \ 1727} \ 1728extern half4 __attribute__((overloadable)) fn(half4 v1, half v2) { \ 1729 return convert_half4(fn(convert_float4(v1), (float) v2)); \ 1730} 1731 1732/* Define f16 functions of the form 1733 * HN output = fn(HN input1, HN input2, HN input3) 1734 * where HN is scalar or vector half type 1735 */ 1736#define HN_FUNC_HN_HN_HN(fn) \ 1737extern half __attribute__((overloadable)) fn(half h1, half h2, half h3) { \ 1738 return (half) fn((float) h1, (float) h2, (float) h3); \ 1739} \ 1740extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2, half2 v3) { \ 1741 return convert_half2(fn(convert_float2(v1), \ 1742 convert_float2(v2), \ 1743 convert_float2(v3))); \ 1744} \ 1745extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2, half3 v3) { \ 1746 return convert_half3(fn(convert_float3(v1), \ 1747 convert_float3(v2), \ 1748 convert_float3(v3))); \ 1749} \ 1750extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2, half4 v3) { \ 1751 return convert_half4(fn(convert_float4(v1), \ 1752 convert_float4(v2), \ 1753 convert_float4(v3))); \ 1754} 1755 1756/* Define f16 functions of the form 1757 * HN output = fn(HN input1, IN input2) 1758 * where HN is scalar or vector half type and IN the equivalent integer type 1759 * of same vector length. 1760 */ 1761#define HN_FUNC_HN_IN(fn) \ 1762extern half __attribute__((overloadable)) fn(half h1, int v) { \ 1763 return (half) fn((float) h1, v); \ 1764} \ 1765extern half2 __attribute__((overloadable)) fn(half2 v1, int2 v2) { \ 1766 return convert_half2(fn(convert_float2(v1), v2)); \ 1767} \ 1768extern half3 __attribute__((overloadable)) fn(half3 v1, int3 v2) { \ 1769 return convert_half3(fn(convert_float3(v1), v2)); \ 1770} \ 1771extern half4 __attribute__((overloadable)) fn(half4 v1, int4 v2) { \ 1772 return convert_half4(fn(convert_float4(v1), v2)); \ 1773} 1774 1775/* Define f16 functions of the form 1776 * half output = fn(HN input1) 1777 * where HN is a scalar or vector half type. 1778 */ 1779#define H_FUNC_HN(fn) \ 1780extern half __attribute__((overloadable)) fn(half h) { \ 1781 return (half) fn((float) h); \ 1782} \ 1783extern half __attribute__((overloadable)) fn(half2 v) { \ 1784 return fn(convert_float2(v)); \ 1785} \ 1786extern half __attribute__((overloadable)) fn(half3 v) { \ 1787 return fn(convert_float3(v)); \ 1788} \ 1789extern half __attribute__((overloadable)) fn(half4 v) { \ 1790 return fn(convert_float4(v)); \ 1791} 1792 1793/* Define f16 functions of the form 1794 * half output = fn(HN input1, HN input2) 1795 * where HN is a scalar or vector half type. 1796 */ 1797#define H_FUNC_HN_HN(fn) \ 1798extern half __attribute__((overloadable)) fn(half h1, half h2) { \ 1799 return (half) fn((float) h1, (float) h2); \ 1800} \ 1801extern half __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1802 return fn(convert_float2(v1), convert_float2(v2)); \ 1803} \ 1804extern half __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1805 return fn(convert_float3(v1), convert_float3(v2)); \ 1806} \ 1807extern half __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1808 return fn(convert_float4(v1), convert_float4(v2)); \ 1809} 1810 1811/* Define f16 functions of the form 1812 * HN output = fn(HN input1, HN input2) 1813 * where HN is a vector half type. The functions are defined to call the 1814 * scalar function of the same name. 1815 */ 1816#define SCALARIZE_HN_FUNC_HN_HN(fn) \ 1817extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1818 half2 ret; \ 1819 ret.x = fn(v1.x, v2.x); \ 1820 ret.y = fn(v1.y, v2.y); \ 1821 return ret; \ 1822} \ 1823extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1824 half3 ret; \ 1825 ret.x = fn(v1.x, v2.x); \ 1826 ret.y = fn(v1.y, v2.y); \ 1827 ret.z = fn(v1.z, v2.z); \ 1828 return ret; \ 1829} \ 1830extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1831 half4 ret; \ 1832 ret.x = fn(v1.x, v2.x); \ 1833 ret.y = fn(v1.y, v2.y); \ 1834 ret.z = fn(v1.z, v2.z); \ 1835 ret.w = fn(v1.w, v2.w); \ 1836 return ret; \ 1837} \ 1838 1839HN_FUNC_HN(acos); 1840HN_FUNC_HN(acosh); 1841HN_FUNC_HN(acospi); 1842HN_FUNC_HN(asin); 1843HN_FUNC_HN(asinh); 1844HN_FUNC_HN(asinpi); 1845HN_FUNC_HN(atan); 1846HN_FUNC_HN(atanh); 1847HN_FUNC_HN(atanpi); 1848HN_FUNC_HN_HN(atan2); 1849HN_FUNC_HN_HN(atan2pi); 1850 1851HN_FUNC_HN(cbrt); 1852HN_FUNC_HN(ceil); 1853 1854// TODO Add copysign 1855 1856HN_FUNC_HN(cos); 1857HN_FUNC_HN(cosh); 1858HN_FUNC_HN(cospi); 1859 1860extern half3 __attribute__((overloadable)) cross(half3 lhs, half3 rhs) { 1861 half3 r; 1862 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1863 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1864 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1865 return r; 1866} 1867 1868extern half4 __attribute__((overloadable)) cross(half4 lhs, half4 rhs) { 1869 half4 r; 1870 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1871 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1872 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1873 r.w = 0.f; 1874 return r; 1875} 1876 1877HN_FUNC_HN(degrees); 1878H_FUNC_HN_HN(distance); 1879H_FUNC_HN_HN(dot); 1880 1881HN_FUNC_HN(erf); 1882HN_FUNC_HN(erfc); 1883HN_FUNC_HN(exp); 1884HN_FUNC_HN(exp10); 1885HN_FUNC_HN(exp2); 1886HN_FUNC_HN(expm1); 1887 1888HN_FUNC_HN(fabs); 1889HN_FUNC_HN_HN(fdim); 1890HN_FUNC_HN(floor); 1891HN_FUNC_HN_HN_HN(fma); 1892HN_FUNC_HN_HN(fmax); 1893HN_FUNC_HN_H(fmax); 1894HN_FUNC_HN_HN(fmin); 1895HN_FUNC_HN_H(fmin); 1896HN_FUNC_HN_HN(fmod); 1897 1898// TODO Add (both variants) of fract 1899// TODO Add frexp 1900 1901HN_FUNC_HN_HN(hypot); 1902 1903// TODO Add ilogb 1904 1905HN_FUNC_HN_IN(ldexp); 1906extern half2 __attribute__((overloadable)) ldexp(half2 v, int exponent) { 1907 return convert_half2(ldexp(convert_float2(v), exponent)); 1908} 1909extern half3 __attribute__((overloadable)) ldexp(half3 v, int exponent) { 1910 return convert_half3(ldexp(convert_float3(v), exponent)); 1911} 1912extern half4 __attribute__((overloadable)) ldexp(half4 v, int exponent) { 1913 return convert_half4(ldexp(convert_float4(v), exponent)); 1914} 1915 1916H_FUNC_HN(length); 1917HN_FUNC_HN(lgamma); 1918 1919extern half __attribute__((overloadable)) lgamma(half h, int *signp) { 1920 return (half) lgamma((float) h, signp); 1921} 1922extern half2 __attribute__((overloadable)) lgamma(half2 v, int2 *signp) { 1923 return convert_half2(lgamma(convert_float2(v), signp)); 1924} 1925extern half3 __attribute__((overloadable)) lgamma(half3 v, int3 *signp) { 1926 return convert_half3(lgamma(convert_float3(v), signp)); 1927} 1928extern half4 __attribute__((overloadable)) lgamma(half4 v, int4 *signp) { 1929 return convert_half4(lgamma(convert_float4(v), signp)); 1930} 1931 1932HN_FUNC_HN(log); 1933HN_FUNC_HN(log10); 1934HN_FUNC_HN(log1p); 1935HN_FUNC_HN(log2); 1936HN_FUNC_HN(logb); 1937 1938HN_FUNC_HN_HN_HN(mad); 1939HN_FUNC_HN_HN(max); 1940HN_FUNC_HN_H(max); // TODO can this be arch-specific similar to _Z3maxDv2_ff? 1941HN_FUNC_HN_HN(min); 1942HN_FUNC_HN_H(min); // TODO can this be arch-specific similar to _Z3minDv2_ff? 1943 1944extern half __attribute__((overloadable)) mix(half start, half stop, half amount) { 1945 return start + (stop - start) * amount; 1946} 1947extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half2 amount) { 1948 return start + (stop - start) * amount; 1949} 1950extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half3 amount) { 1951 return start + (stop - start) * amount; 1952} 1953extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half4 amount) { 1954 return start + (stop - start) * amount; 1955} 1956extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half amount) { 1957 return start + (stop - start) * amount; 1958} 1959extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half amount) { 1960 return start + (stop - start) * amount; 1961} 1962extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half amount) { 1963 return start + (stop - start) * amount; 1964} 1965 1966// TODO Define modf. Does it make sense to delegate to the float? 1967 1968half __attribute__((overloadable)) nan_half() { 1969 unsigned short nan_short = kHalfPositiveInfinity | 0x0200; 1970 half nan; 1971 SET_HALF_WORD(nan, nan_short); 1972 return nan; 1973} 1974 1975// TODO Add nextafter 1976 1977HN_FUNC_HN(normalize); 1978 1979HN_FUNC_HN_HN(pow); 1980HN_FUNC_HN_IN(pown); 1981HN_FUNC_HN_HN(powr); 1982HN_FUNC_HN(radians); 1983HN_FUNC_HN_HN(remainder); 1984 1985extern half __attribute__((overloadable)) remquo(half n, half d, int *quo) { 1986 return (float) remquo((float) n, (float) d, quo); 1987} 1988extern half2 __attribute__((overloadable)) remquo(half2 n, half2 d, int2 *quo) { 1989 return convert_half2(remquo(convert_float2(d), convert_float2(n), quo)); 1990} 1991extern half3 __attribute__((overloadable)) remquo(half3 n, half3 d, int3 *quo) { 1992 return convert_half3(remquo(convert_float3(d), convert_float3(n), quo)); 1993} 1994extern half4 __attribute__((overloadable)) remquo(half4 n, half4 d, int4 *quo) { 1995 return convert_half4(remquo(convert_float4(d), convert_float4(n), quo)); 1996} 1997 1998HN_FUNC_HN(rint); 1999HN_FUNC_HN_IN(rootn); 2000HN_FUNC_HN(round); 2001HN_FUNC_HN(rsqrt); 2002 2003extern half __attribute__((overloadable)) sign(half h) { 2004 if (h > 0) return (half) 1.f; 2005 if (h < 0) return (half) -1.f; 2006 return h; 2007} 2008extern half2 __attribute__((overloadable)) sign(half2 v) { 2009 half2 ret; 2010 ret.x = sign(v.x); 2011 ret.y = sign(v.y); 2012 return ret; 2013} 2014extern half3 __attribute__((overloadable)) sign(half3 v) { 2015 half3 ret; 2016 ret.x = sign(v.x); 2017 ret.y = sign(v.y); 2018 ret.z = sign(v.z); 2019 return ret; 2020} 2021extern half4 __attribute__((overloadable)) sign(half4 v) { 2022 half4 ret; 2023 ret.x = sign(v.x); 2024 ret.y = sign(v.y); 2025 ret.z = sign(v.z); 2026 ret.w = sign(v.w); 2027 return ret; 2028} 2029 2030HN_FUNC_HN(sin); 2031 2032extern half __attribute__((overloadable)) sincos(half v, half *cosptr) { 2033 *cosptr = cos(v); 2034 return sin(v); 2035} 2036// TODO verify if LLVM eliminates the duplicate convert_float2 2037extern half2 __attribute__((overloadable)) sincos(half2 v, half2 *cosptr) { 2038 *cosptr = cos(v); 2039 return sin(v); 2040} 2041extern half3 __attribute__((overloadable)) sincos(half3 v, half3 *cosptr) { 2042 *cosptr = cos(v); 2043 return sin(v); 2044} 2045extern half4 __attribute__((overloadable)) sincos(half4 v, half4 *cosptr) { 2046 *cosptr = cos(v); 2047 return sin(v); 2048} 2049 2050HN_FUNC_HN(sinh); 2051HN_FUNC_HN(sinpi); 2052HN_FUNC_HN(sqrt); 2053 2054extern half __attribute__((overloadable)) step(half edge, half v) { 2055 return (v < edge) ? 0.f : 1.f; 2056} 2057extern half2 __attribute__((overloadable)) step(half2 edge, half2 v) { 2058 half2 r; 2059 r.x = (v.x < edge.x) ? 0.f : 1.f; 2060 r.y = (v.y < edge.y) ? 0.f : 1.f; 2061 return r; 2062} 2063extern half3 __attribute__((overloadable)) step(half3 edge, half3 v) { 2064 half3 r; 2065 r.x = (v.x < edge.x) ? 0.f : 1.f; 2066 r.y = (v.y < edge.y) ? 0.f : 1.f; 2067 r.z = (v.z < edge.z) ? 0.f : 1.f; 2068 return r; 2069} 2070extern half4 __attribute__((overloadable)) step(half4 edge, half4 v) { 2071 half4 r; 2072 r.x = (v.x < edge.x) ? 0.f : 1.f; 2073 r.y = (v.y < edge.y) ? 0.f : 1.f; 2074 r.z = (v.z < edge.z) ? 0.f : 1.f; 2075 r.w = (v.w < edge.w) ? 0.f : 1.f; 2076 return r; 2077} 2078extern half2 __attribute__((overloadable)) step(half2 edge, half v) { 2079 half2 r; 2080 r.x = (v < edge.x) ? 0.f : 1.f; 2081 r.y = (v < edge.y) ? 0.f : 1.f; 2082 return r; 2083} 2084extern half3 __attribute__((overloadable)) step(half3 edge, half v) { 2085 half3 r; 2086 r.x = (v < edge.x) ? 0.f : 1.f; 2087 r.y = (v < edge.y) ? 0.f : 1.f; 2088 r.z = (v < edge.z) ? 0.f : 1.f; 2089 return r; 2090} 2091extern half4 __attribute__((overloadable)) step(half4 edge, half v) { 2092 half4 r; 2093 r.x = (v < edge.x) ? 0.f : 1.f; 2094 r.y = (v < edge.y) ? 0.f : 1.f; 2095 r.z = (v < edge.z) ? 0.f : 1.f; 2096 r.w = (v < edge.w) ? 0.f : 1.f; 2097 return r; 2098} 2099extern half2 __attribute__((overloadable)) step(half edge, half2 v) { 2100 half2 r; 2101 r.x = (v.x < edge) ? 0.f : 1.f; 2102 r.y = (v.y < edge) ? 0.f : 1.f; 2103 return r; 2104} 2105extern half3 __attribute__((overloadable)) step(half edge, half3 v) { 2106 half3 r; 2107 r.x = (v.x < edge) ? 0.f : 1.f; 2108 r.y = (v.y < edge) ? 0.f : 1.f; 2109 r.z = (v.z < edge) ? 0.f : 1.f; 2110 return r; 2111} 2112extern half4 __attribute__((overloadable)) step(half edge, half4 v) { 2113 half4 r; 2114 r.x = (v.x < edge) ? 0.f : 1.f; 2115 r.y = (v.y < edge) ? 0.f : 1.f; 2116 r.z = (v.z < edge) ? 0.f : 1.f; 2117 r.w = (v.w < edge) ? 0.f : 1.f; 2118 return r; 2119} 2120 2121HN_FUNC_HN(tan); 2122HN_FUNC_HN(tanh); 2123HN_FUNC_HN(tanpi); 2124HN_FUNC_HN(tgamma); 2125HN_FUNC_HN(trunc); // TODO: rethink: needs half-specific implementation? 2126 2127HN_FUNC_HN(native_acos); 2128HN_FUNC_HN(native_acosh); 2129HN_FUNC_HN(native_acospi); 2130HN_FUNC_HN(native_asin); 2131HN_FUNC_HN(native_asinh); 2132HN_FUNC_HN(native_asinpi); 2133HN_FUNC_HN(native_atan); 2134HN_FUNC_HN(native_atanh); 2135HN_FUNC_HN(native_atanpi); 2136HN_FUNC_HN_HN(native_atan2); 2137HN_FUNC_HN_HN(native_atan2pi); 2138 2139HN_FUNC_HN(native_cbrt); 2140HN_FUNC_HN(native_cos); 2141HN_FUNC_HN(native_cosh); 2142HN_FUNC_HN(native_cospi); 2143 2144H_FUNC_HN_HN(native_distance); 2145HN_FUNC_HN_HN(native_divide); 2146 2147HN_FUNC_HN(native_exp); 2148HN_FUNC_HN(native_exp10); 2149HN_FUNC_HN(native_exp2); 2150HN_FUNC_HN(native_expm1); 2151 2152HN_FUNC_HN_HN(native_hypot); 2153H_FUNC_HN(native_length); 2154 2155HN_FUNC_HN(native_log); 2156HN_FUNC_HN(native_log10); 2157HN_FUNC_HN(native_log1p); 2158HN_FUNC_HN(native_log2); 2159 2160HN_FUNC_HN(native_normalize); 2161 2162HN_FUNC_HN_HN(native_powr); // TODO are parameter limits different for half? 2163 2164HN_FUNC_HN(native_recip); 2165HN_FUNC_HN_IN(native_rootn); 2166HN_FUNC_HN(native_rsqrt); 2167 2168HN_FUNC_HN(native_sin); 2169 2170extern half __attribute__((overloadable)) native_sincos(half v, half *cosptr) { 2171 return sincos(v, cosptr); 2172} 2173extern half2 __attribute__((overloadable)) native_sincos(half2 v, half2 *cosptr) { 2174 return sincos(v, cosptr); 2175} 2176extern half3 __attribute__((overloadable)) native_sincos(half3 v, half3 *cosptr) { 2177 return sincos(v, cosptr); 2178} 2179extern half4 __attribute__((overloadable)) native_sincos(half4 v, half4 *cosptr) { 2180 return sincos(v, cosptr); 2181} 2182 2183HN_FUNC_HN(native_sinh); 2184HN_FUNC_HN(native_sinpi); 2185HN_FUNC_HN(native_sqrt); 2186 2187HN_FUNC_HN(native_tan); 2188HN_FUNC_HN(native_tanh); 2189HN_FUNC_HN(native_tanpi); 2190 2191#undef HN_FUNC_HN 2192#undef HN_FUNC_HN_HN 2193#undef HN_FUNC_HN_H 2194#undef HN_FUNC_HN_HN_HN 2195#undef HN_FUNC_HN_IN 2196#undef H_FUNC_HN 2197#undef H_FUNC_HN_HN 2198#undef SCALARIZE_HN_FUNC_HN_HN 2199 2200// exports unavailable mathlib functions to compat lib 2201 2202#ifdef RS_COMPATIBILITY_LIB 2203 2204// !!! DANGER !!! 2205// These functions are potentially missing on older Android versions. 2206// Work around the issue by supplying our own variants. 2207// !!! DANGER !!! 2208 2209// The logbl() implementation is taken from the latest bionic/, since 2210// double == long double on Android. 2211extern "C" long double logbl(long double x) { return logb(x); } 2212 2213// __aeabi_idiv0 is a missing function in libcompiler_rt.so, so we just 2214// pick the simplest implementation based on the ARM EABI doc. 2215extern "C" int __aeabi_idiv0(int v) { return v; } 2216 2217#endif // compatibility lib 2218