rs_cl.c revision f9760483073d9f452e4701fbf367dc518f7e6531
1#include "rs_core.rsh" 2#include "rs_f16_util.h" 3 4extern float2 __attribute__((overloadable)) convert_float2(int2 c); 5extern float3 __attribute__((overloadable)) convert_float3(int3 c); 6extern float4 __attribute__((overloadable)) convert_float4(int4 c); 7 8extern int2 __attribute__((overloadable)) convert_int2(float2 c); 9extern int3 __attribute__((overloadable)) convert_int3(float3 c); 10extern int4 __attribute__((overloadable)) convert_int4(float4 c); 11 12 13extern float __attribute__((overloadable)) fmin(float v, float v2); 14extern float2 __attribute__((overloadable)) fmin(float2 v, float v2); 15extern float3 __attribute__((overloadable)) fmin(float3 v, float v2); 16extern float4 __attribute__((overloadable)) fmin(float4 v, float v2); 17 18extern float __attribute__((overloadable)) fmax(float v, float v2); 19extern float2 __attribute__((overloadable)) fmax(float2 v, float v2); 20extern float3 __attribute__((overloadable)) fmax(float3 v, float v2); 21extern float4 __attribute__((overloadable)) fmax(float4 v, float v2); 22 23// Float ops, 6.11.2 24 25#define FN_FUNC_FN(fnc) \ 26extern float2 __attribute__((overloadable)) fnc(float2 v) { \ 27 float2 r; \ 28 r.x = fnc(v.x); \ 29 r.y = fnc(v.y); \ 30 return r; \ 31} \ 32extern float3 __attribute__((overloadable)) fnc(float3 v) { \ 33 float3 r; \ 34 r.x = fnc(v.x); \ 35 r.y = fnc(v.y); \ 36 r.z = fnc(v.z); \ 37 return r; \ 38} \ 39extern float4 __attribute__((overloadable)) fnc(float4 v) { \ 40 float4 r; \ 41 r.x = fnc(v.x); \ 42 r.y = fnc(v.y); \ 43 r.z = fnc(v.z); \ 44 r.w = fnc(v.w); \ 45 return r; \ 46} 47 48#define IN_FUNC_FN(fnc) \ 49extern int2 __attribute__((overloadable)) fnc(float2 v) { \ 50 int2 r; \ 51 r.x = fnc(v.x); \ 52 r.y = fnc(v.y); \ 53 return r; \ 54} \ 55extern int3 __attribute__((overloadable)) fnc(float3 v) { \ 56 int3 r; \ 57 r.x = fnc(v.x); \ 58 r.y = fnc(v.y); \ 59 r.z = fnc(v.z); \ 60 return r; \ 61} \ 62extern int4 __attribute__((overloadable)) fnc(float4 v) { \ 63 int4 r; \ 64 r.x = fnc(v.x); \ 65 r.y = fnc(v.y); \ 66 r.z = fnc(v.z); \ 67 r.w = fnc(v.w); \ 68 return r; \ 69} 70 71#define FN_FUNC_FN_FN(fnc) \ 72extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \ 73 float2 r; \ 74 r.x = fnc(v1.x, v2.x); \ 75 r.y = fnc(v1.y, v2.y); \ 76 return r; \ 77} \ 78extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \ 79 float3 r; \ 80 r.x = fnc(v1.x, v2.x); \ 81 r.y = fnc(v1.y, v2.y); \ 82 r.z = fnc(v1.z, v2.z); \ 83 return r; \ 84} \ 85extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \ 86 float4 r; \ 87 r.x = fnc(v1.x, v2.x); \ 88 r.y = fnc(v1.y, v2.y); \ 89 r.z = fnc(v1.z, v2.z); \ 90 r.w = fnc(v1.w, v2.w); \ 91 return r; \ 92} 93 94#define FN_FUNC_FN_F(fnc) \ 95extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) { \ 96 float2 r; \ 97 r.x = fnc(v1.x, v2); \ 98 r.y = fnc(v1.y, v2); \ 99 return r; \ 100} \ 101extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) { \ 102 float3 r; \ 103 r.x = fnc(v1.x, v2); \ 104 r.y = fnc(v1.y, v2); \ 105 r.z = fnc(v1.z, v2); \ 106 return r; \ 107} \ 108extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) { \ 109 float4 r; \ 110 r.x = fnc(v1.x, v2); \ 111 r.y = fnc(v1.y, v2); \ 112 r.z = fnc(v1.z, v2); \ 113 r.w = fnc(v1.w, v2); \ 114 return r; \ 115} 116 117#define FN_FUNC_FN_IN(fnc) \ 118extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) { \ 119 float2 r; \ 120 r.x = fnc(v1.x, v2.x); \ 121 r.y = fnc(v1.y, v2.y); \ 122 return r; \ 123} \ 124extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) { \ 125 float3 r; \ 126 r.x = fnc(v1.x, v2.x); \ 127 r.y = fnc(v1.y, v2.y); \ 128 r.z = fnc(v1.z, v2.z); \ 129 return r; \ 130} \ 131extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) { \ 132 float4 r; \ 133 r.x = fnc(v1.x, v2.x); \ 134 r.y = fnc(v1.y, v2.y); \ 135 r.z = fnc(v1.z, v2.z); \ 136 r.w = fnc(v1.w, v2.w); \ 137 return r; \ 138} 139 140#define FN_FUNC_FN_I(fnc) \ 141extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) { \ 142 float2 r; \ 143 r.x = fnc(v1.x, v2); \ 144 r.y = fnc(v1.y, v2); \ 145 return r; \ 146} \ 147extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) { \ 148 float3 r; \ 149 r.x = fnc(v1.x, v2); \ 150 r.y = fnc(v1.y, v2); \ 151 r.z = fnc(v1.z, v2); \ 152 return r; \ 153} \ 154extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) { \ 155 float4 r; \ 156 r.x = fnc(v1.x, v2); \ 157 r.y = fnc(v1.y, v2); \ 158 r.z = fnc(v1.z, v2); \ 159 r.w = fnc(v1.w, v2); \ 160 return r; \ 161} 162 163#define FN_FUNC_FN_PFN(fnc) \ 164extern float2 __attribute__((overloadable)) \ 165 fnc(float2 v1, float2 *v2) { \ 166 float2 r; \ 167 float t[2]; \ 168 r.x = fnc(v1.x, &t[0]); \ 169 r.y = fnc(v1.y, &t[1]); \ 170 v2->x = t[0]; \ 171 v2->y = t[1]; \ 172 return r; \ 173} \ 174extern float3 __attribute__((overloadable)) \ 175 fnc(float3 v1, float3 *v2) { \ 176 float3 r; \ 177 float t[3]; \ 178 r.x = fnc(v1.x, &t[0]); \ 179 r.y = fnc(v1.y, &t[1]); \ 180 r.z = fnc(v1.z, &t[2]); \ 181 v2->x = t[0]; \ 182 v2->y = t[1]; \ 183 v2->z = t[2]; \ 184 return r; \ 185} \ 186extern float4 __attribute__((overloadable)) \ 187 fnc(float4 v1, float4 *v2) { \ 188 float4 r; \ 189 float t[4]; \ 190 r.x = fnc(v1.x, &t[0]); \ 191 r.y = fnc(v1.y, &t[1]); \ 192 r.z = fnc(v1.z, &t[2]); \ 193 r.w = fnc(v1.w, &t[3]); \ 194 v2->x = t[0]; \ 195 v2->y = t[1]; \ 196 v2->z = t[2]; \ 197 v2->w = t[3]; \ 198 return r; \ 199} 200 201#define FN_FUNC_FN_PIN(fnc) \ 202extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) { \ 203 float2 r; \ 204 int t[2]; \ 205 r.x = fnc(v1.x, &t[0]); \ 206 r.y = fnc(v1.y, &t[1]); \ 207 v2->x = t[0]; \ 208 v2->y = t[1]; \ 209 return r; \ 210} \ 211extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) { \ 212 float3 r; \ 213 int t[3]; \ 214 r.x = fnc(v1.x, &t[0]); \ 215 r.y = fnc(v1.y, &t[1]); \ 216 r.z = fnc(v1.z, &t[2]); \ 217 v2->x = t[0]; \ 218 v2->y = t[1]; \ 219 v2->z = t[2]; \ 220 return r; \ 221} \ 222extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) { \ 223 float4 r; \ 224 int t[4]; \ 225 r.x = fnc(v1.x, &t[0]); \ 226 r.y = fnc(v1.y, &t[1]); \ 227 r.z = fnc(v1.z, &t[2]); \ 228 r.w = fnc(v1.w, &t[3]); \ 229 v2->x = t[0]; \ 230 v2->y = t[1]; \ 231 v2->z = t[2]; \ 232 v2->w = t[3]; \ 233 return r; \ 234} 235 236#define FN_FUNC_FN_FN_FN(fnc) \ 237extern float2 __attribute__((overloadable)) \ 238 fnc(float2 v1, float2 v2, float2 v3) { \ 239 float2 r; \ 240 r.x = fnc(v1.x, v2.x, v3.x); \ 241 r.y = fnc(v1.y, v2.y, v3.y); \ 242 return r; \ 243} \ 244extern float3 __attribute__((overloadable)) \ 245 fnc(float3 v1, float3 v2, float3 v3) { \ 246 float3 r; \ 247 r.x = fnc(v1.x, v2.x, v3.x); \ 248 r.y = fnc(v1.y, v2.y, v3.y); \ 249 r.z = fnc(v1.z, v2.z, v3.z); \ 250 return r; \ 251} \ 252extern float4 __attribute__((overloadable)) \ 253 fnc(float4 v1, float4 v2, float4 v3) { \ 254 float4 r; \ 255 r.x = fnc(v1.x, v2.x, v3.x); \ 256 r.y = fnc(v1.y, v2.y, v3.y); \ 257 r.z = fnc(v1.z, v2.z, v3.z); \ 258 r.w = fnc(v1.w, v2.w, v3.w); \ 259 return r; \ 260} 261 262#define FN_FUNC_FN_FN_PIN(fnc) \ 263extern float2 __attribute__((overloadable)) \ 264 fnc(float2 v1, float2 v2, int2 *v3) { \ 265 float2 r; \ 266 int t[2]; \ 267 r.x = fnc(v1.x, v2.x, &t[0]); \ 268 r.y = fnc(v1.y, v2.y, &t[1]); \ 269 v3->x = t[0]; \ 270 v3->y = t[1]; \ 271 return r; \ 272} \ 273extern float3 __attribute__((overloadable)) \ 274 fnc(float3 v1, float3 v2, int3 *v3) { \ 275 float3 r; \ 276 int t[3]; \ 277 r.x = fnc(v1.x, v2.x, &t[0]); \ 278 r.y = fnc(v1.y, v2.y, &t[1]); \ 279 r.z = fnc(v1.z, v2.z, &t[2]); \ 280 v3->x = t[0]; \ 281 v3->y = t[1]; \ 282 v3->z = t[2]; \ 283 return r; \ 284} \ 285extern float4 __attribute__((overloadable)) \ 286 fnc(float4 v1, float4 v2, int4 *v3) { \ 287 float4 r; \ 288 int t[4]; \ 289 r.x = fnc(v1.x, v2.x, &t[0]); \ 290 r.y = fnc(v1.y, v2.y, &t[1]); \ 291 r.z = fnc(v1.z, v2.z, &t[2]); \ 292 r.w = fnc(v1.w, v2.w, &t[3]); \ 293 v3->x = t[0]; \ 294 v3->y = t[1]; \ 295 v3->z = t[2]; \ 296 v3->w = t[3]; \ 297 return r; \ 298} 299 300static const int iposinf = 0x7f800000; 301static const int ineginf = 0xff800000; 302 303static const float posinf() { 304 float f = *((float*)&iposinf); 305 return f; 306} 307 308static const float neginf() { 309 float f = *((float*)&ineginf); 310 return f; 311} 312 313static bool isinf(float f) { 314 int i = *((int*)(void*)&f); 315 return (i == iposinf) || (i == ineginf); 316} 317 318static bool isnan(float f) { 319 int i = *((int*)(void*)&f); 320 return (((i & 0x7f800000) == 0x7f800000) && (i & 0x007fffff)); 321} 322 323static bool isposzero(float f) { 324 int i = *((int*)(void*)&f); 325 return (i == 0x00000000); 326} 327 328static bool isnegzero(float f) { 329 int i = *((int*)(void*)&f); 330 return (i == 0x80000000); 331} 332 333static bool iszero(float f) { 334 return isposzero(f) || isnegzero(f); 335} 336 337 338extern float __attribute__((overloadable)) SC_acosf(float); 339float __attribute__((overloadable)) acos(float v) { 340 return SC_acosf(v); 341} 342FN_FUNC_FN(acos) 343 344extern float __attribute__((overloadable)) SC_acoshf(float); 345float __attribute__((overloadable)) acosh(float v) { 346 return SC_acoshf(v); 347} 348FN_FUNC_FN(acosh) 349 350 351extern float __attribute__((overloadable)) acospi(float v) { 352 return acos(v) / M_PI; 353} 354FN_FUNC_FN(acospi) 355 356extern float __attribute__((overloadable)) SC_asinf(float); 357float __attribute__((overloadable)) asin(float v) { 358 return SC_asinf(v); 359} 360FN_FUNC_FN(asin) 361 362extern float __attribute__((overloadable)) SC_asinhf(float); 363float __attribute__((overloadable)) asinh(float v) { 364 return SC_asinhf(v); 365} 366FN_FUNC_FN(asinh) 367 368extern float __attribute__((overloadable)) asinpi(float v) { 369 return asin(v) / M_PI; 370} 371FN_FUNC_FN(asinpi) 372 373extern float __attribute__((overloadable)) SC_atanf(float); 374float __attribute__((overloadable)) atan(float v) { 375 return SC_atanf(v); 376} 377FN_FUNC_FN(atan) 378 379extern float __attribute__((overloadable)) SC_atan2f(float, float); 380float __attribute__((overloadable)) atan2(float v1, float v2) { 381 return SC_atan2f(v1, v2); 382} 383FN_FUNC_FN_FN(atan2) 384 385extern float __attribute__((overloadable)) SC_atanhf(float); 386float __attribute__((overloadable)) atanh(float v) { 387 return SC_atanhf(v); 388} 389FN_FUNC_FN(atanh) 390 391extern float __attribute__((overloadable)) atanpi(float v) { 392 return atan(v) / M_PI; 393} 394FN_FUNC_FN(atanpi) 395 396 397extern float __attribute__((overloadable)) atan2pi(float y, float x) { 398 return atan2(y, x) / M_PI; 399} 400FN_FUNC_FN_FN(atan2pi) 401 402extern float __attribute__((overloadable)) SC_cbrtf(float); 403float __attribute__((overloadable)) cbrt(float v) { 404 return SC_cbrtf(v); 405} 406FN_FUNC_FN(cbrt) 407 408extern float __attribute__((overloadable)) SC_ceilf(float); 409float __attribute__((overloadable)) ceil(float v) { 410 return SC_ceilf(v); 411} 412FN_FUNC_FN(ceil) 413 414extern float __attribute__((overloadable)) SC_copysignf(float, float); 415float __attribute__((overloadable)) copysign(float v1, float v2) { 416 return SC_copysignf(v1, v2); 417} 418FN_FUNC_FN_FN(copysign) 419 420extern float __attribute__((overloadable)) SC_cosf(float); 421float __attribute__((overloadable)) cos(float v) { 422 return SC_cosf(v); 423} 424FN_FUNC_FN(cos) 425 426extern float __attribute__((overloadable)) SC_coshf(float); 427float __attribute__((overloadable)) cosh(float v) { 428 return SC_coshf(v); 429} 430FN_FUNC_FN(cosh) 431 432extern float __attribute__((overloadable)) cospi(float v) { 433 return cos(v * M_PI); 434} 435FN_FUNC_FN(cospi) 436 437extern float __attribute__((overloadable)) SC_erfcf(float); 438float __attribute__((overloadable)) erfc(float v) { 439 return SC_erfcf(v); 440} 441FN_FUNC_FN(erfc) 442 443extern float __attribute__((overloadable)) SC_erff(float); 444float __attribute__((overloadable)) erf(float v) { 445 return SC_erff(v); 446} 447FN_FUNC_FN(erf) 448 449extern float __attribute__((overloadable)) SC_expf(float); 450float __attribute__((overloadable)) exp(float v) { 451 return SC_expf(v); 452} 453FN_FUNC_FN(exp) 454 455extern float __attribute__((overloadable)) SC_exp2f(float); 456float __attribute__((overloadable)) exp2(float v) { 457 return SC_exp2f(v); 458} 459FN_FUNC_FN(exp2) 460 461extern float __attribute__((overloadable)) pow(float, float); 462 463extern float __attribute__((overloadable)) exp10(float v) { 464 return exp2(v * 3.321928095f); 465} 466FN_FUNC_FN(exp10) 467 468extern float __attribute__((overloadable)) SC_expm1f(float); 469float __attribute__((overloadable)) expm1(float v) { 470 return SC_expm1f(v); 471} 472FN_FUNC_FN(expm1) 473 474extern float __attribute__((overloadable)) fabs(float v) { 475 int i = *((int*)(void*)&v) & 0x7fffffff; 476 return *((float*)(void*)&i); 477} 478FN_FUNC_FN(fabs) 479 480extern float __attribute__((overloadable)) SC_fdimf(float, float); 481float __attribute__((overloadable)) fdim(float v1, float v2) { 482 return SC_fdimf(v1, v2); 483} 484FN_FUNC_FN_FN(fdim) 485 486extern float __attribute__((overloadable)) SC_floorf(float); 487float __attribute__((overloadable)) floor(float v) { 488 return SC_floorf(v); 489} 490FN_FUNC_FN(floor) 491 492extern float __attribute__((overloadable)) SC_fmaf(float, float, float); 493float __attribute__((overloadable)) fma(float v1, float v2, float v3) { 494 return SC_fmaf(v1, v2, v3); 495} 496FN_FUNC_FN_FN_FN(fma) 497 498extern float __attribute__((overloadable)) SC_fminf(float, float); 499 500extern float __attribute__((overloadable)) SC_fmodf(float, float); 501float __attribute__((overloadable)) fmod(float v1, float v2) { 502 return SC_fmodf(v1, v2); 503} 504FN_FUNC_FN_FN(fmod) 505 506extern float __attribute__((overloadable)) fract(float v, float *iptr) { 507 int i = (int)floor(v); 508 if (iptr) { 509 iptr[0] = i; 510 } 511 return fmin(v - i, 0x1.fffffep-1f); 512} 513FN_FUNC_FN_PFN(fract) 514 515extern float __attribute__((const, overloadable)) fract(float v) { 516 float unused; 517 return fract(v, &unused); 518} 519FN_FUNC_FN(fract) 520 521extern float __attribute__((overloadable)) SC_frexpf(float, int *); 522float __attribute__((overloadable)) frexp(float v1, int* v2) { 523 return SC_frexpf(v1, v2); 524} 525FN_FUNC_FN_PIN(frexp) 526 527extern float __attribute__((overloadable)) SC_hypotf(float, float); 528float __attribute__((overloadable)) hypot(float v1, float v2) { 529 return SC_hypotf(v1, v2); 530} 531FN_FUNC_FN_FN(hypot) 532 533extern int __attribute__((overloadable)) SC_ilogbf(float); 534int __attribute__((overloadable)) ilogb(float v) { 535 return SC_ilogbf(v); 536} 537IN_FUNC_FN(ilogb) 538 539extern float __attribute__((overloadable)) SC_ldexpf(float, int); 540float __attribute__((overloadable)) ldexp(float v1, int v2) { 541 return SC_ldexpf(v1, v2); 542} 543FN_FUNC_FN_IN(ldexp) 544FN_FUNC_FN_I(ldexp) 545 546extern float __attribute__((overloadable)) SC_lgammaf(float); 547float __attribute__((overloadable)) lgamma(float v) { 548 return SC_lgammaf(v); 549} 550FN_FUNC_FN(lgamma) 551extern float __attribute__((overloadable)) SC_lgammaf_r(float, int*); 552float __attribute__((overloadable)) lgamma(float v, int* ptr) { 553 return SC_lgammaf_r(v, ptr); 554} 555FN_FUNC_FN_PIN(lgamma) 556 557extern float __attribute__((overloadable)) SC_logf(float); 558float __attribute__((overloadable)) log(float v) { 559 return SC_logf(v); 560} 561FN_FUNC_FN(log) 562 563extern float __attribute__((overloadable)) SC_log10f(float); 564float __attribute__((overloadable)) log10(float v) { 565 return SC_log10f(v); 566} 567FN_FUNC_FN(log10) 568 569 570extern float __attribute__((overloadable)) log2(float v) { 571 return log10(v) * 3.321928095f; 572} 573FN_FUNC_FN(log2) 574 575extern float __attribute__((overloadable)) SC_log1pf(float); 576float __attribute__((overloadable)) log1p(float v) { 577 return SC_log1pf(v); 578} 579FN_FUNC_FN(log1p) 580 581extern float __attribute__((overloadable)) SC_logbf(float); 582float __attribute__((overloadable)) logb(float v) { 583 return SC_logbf(v); 584} 585FN_FUNC_FN(logb) 586 587extern float __attribute__((overloadable)) mad(float a, float b, float c) { 588 return a * b + c; 589} 590extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) { 591 return a * b + c; 592} 593extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) { 594 return a * b + c; 595} 596extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) { 597 return a * b + c; 598} 599 600extern float __attribute__((overloadable)) SC_modff(float, float *); 601float __attribute__((overloadable)) modf(float v1, float *v2) { 602 return SC_modff(v1, v2); 603} 604FN_FUNC_FN_PFN(modf); 605 606extern float __attribute__((overloadable)) nan(uint v) { 607 float f[1]; 608 uint32_t *ip = (uint32_t *)f; 609 *ip = v | 0x7fc00000; 610 return f[0]; 611} 612 613extern float __attribute__((overloadable)) SC_nextafterf(float, float); 614float __attribute__((overloadable)) nextafter(float v1, float v2) { 615 return SC_nextafterf(v1, v2); 616} 617FN_FUNC_FN_FN(nextafter) 618 619FN_FUNC_FN_FN(pow) 620 621extern float __attribute__((overloadable)) pown(float v, int p) { 622 /* The mantissa of a float has fewer bits than an int (24 effective vs. 31). 623 * For very large ints, we'll lose whether the exponent is even or odd, making 624 * the selection of a correct sign incorrect. We correct this. Use copysign 625 * to handle the negative zero case. 626 */ 627 float sign = (p & 0x1) ? copysign(1.f, v) : 1.f; 628 float f = pow(v, (float)p); 629 return copysign(f, sign); 630} 631FN_FUNC_FN_IN(pown) 632 633extern float __attribute__((overloadable)) powr(float v, float p) { 634 return pow(v, p); 635} 636extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) { 637 return pow(v, p); 638} 639extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) { 640 return pow(v, p); 641} 642extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) { 643 return pow(v, p); 644} 645 646extern float __attribute__((overloadable)) SC_remainderf(float, float); 647float __attribute__((overloadable)) remainder(float v1, float v2) { 648 return SC_remainderf(v1, v2); 649} 650FN_FUNC_FN_FN(remainder) 651 652extern float __attribute__((overloadable)) SC_remquof(float, float, int *); 653float __attribute__((overloadable)) remquo(float v1, float v2, int *v3) { 654 return SC_remquof(v1, v2, v3); 655} 656FN_FUNC_FN_FN_PIN(remquo) 657 658extern float __attribute__((overloadable)) SC_rintf(float); 659float __attribute__((overloadable)) rint(float v) { 660 return SC_rintf(v); 661} 662FN_FUNC_FN(rint) 663 664extern float __attribute__((overloadable)) rootn(float v, int r) { 665 if (r == 0) { 666 return posinf(); 667 } 668 669 if (iszero(v)) { 670 if (r < 0) { 671 if (r & 1) { 672 return copysign(posinf(), v); 673 } else { 674 return posinf(); 675 } 676 } else { 677 if (r & 1) { 678 return copysign(0.f, v); 679 } else { 680 return 0.f; 681 } 682 } 683 } 684 685 if (!isinf(v) && !isnan(v) && (v < 0.f)) { 686 if (r & 1) { 687 return (-1.f * pow(-1.f * v, 1.f / r)); 688 } else { 689 return nan(0); 690 } 691 } 692 693 return pow(v, 1.f / r); 694} 695FN_FUNC_FN_IN(rootn); 696 697extern float __attribute__((overloadable)) SC_roundf(float); 698float __attribute__((overloadable)) round(float v) { 699 return SC_roundf(v); 700} 701FN_FUNC_FN(round) 702 703extern float __attribute__((overloadable)) SC_randf2(float, float); 704float __attribute__((overloadable)) rsRand(float min, float max) { 705 return SC_randf2(min, max); 706} 707 708 709extern float __attribute__((overloadable)) rsqrt(float v) { 710 return 1.f / sqrt(v); 711} 712 713#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 714// These functions must be defined here if we are not using the SSE 715// implementation, which includes when we are built as part of the 716// debug runtime (libclcore_debug.bc). 717FN_FUNC_FN(sqrt) 718#else 719extern float2 __attribute__((overloadable)) sqrt(float2); 720extern float3 __attribute__((overloadable)) sqrt(float3); 721extern float4 __attribute__((overloadable)) sqrt(float4); 722#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 723 724FN_FUNC_FN(rsqrt) 725 726extern float __attribute__((overloadable)) SC_sinf(float); 727float __attribute__((overloadable)) sin(float v) { 728 return SC_sinf(v); 729} 730FN_FUNC_FN(sin) 731 732extern float __attribute__((overloadable)) sincos(float v, float *cosptr) { 733 *cosptr = cos(v); 734 return sin(v); 735} 736extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) { 737 *cosptr = cos(v); 738 return sin(v); 739} 740extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) { 741 *cosptr = cos(v); 742 return sin(v); 743} 744extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) { 745 *cosptr = cos(v); 746 return sin(v); 747} 748 749extern float __attribute__((overloadable)) SC_sinhf(float); 750float __attribute__((overloadable)) sinh(float v) { 751 return SC_sinhf(v); 752} 753FN_FUNC_FN(sinh) 754 755extern float __attribute__((overloadable)) sinpi(float v) { 756 return sin(v * M_PI); 757} 758FN_FUNC_FN(sinpi) 759 760extern float __attribute__((overloadable)) SC_tanf(float); 761float __attribute__((overloadable)) tan(float v) { 762 return SC_tanf(v); 763} 764FN_FUNC_FN(tan) 765 766extern float __attribute__((overloadable)) SC_tanhf(float); 767float __attribute__((overloadable)) tanh(float v) { 768 return SC_tanhf(v); 769} 770FN_FUNC_FN(tanh) 771 772extern float __attribute__((overloadable)) tanpi(float v) { 773 return tan(v * M_PI); 774} 775FN_FUNC_FN(tanpi) 776 777 778extern float __attribute__((overloadable)) SC_tgammaf(float); 779float __attribute__((overloadable)) tgamma(float v) { 780 return SC_tgammaf(v); 781} 782FN_FUNC_FN(tgamma) 783 784extern float __attribute__((overloadable)) SC_truncf(float); 785float __attribute__((overloadable)) trunc(float v) { 786 return SC_truncf(v); 787} 788FN_FUNC_FN(trunc) 789 790// Int ops (partial), 6.11.3 791 792#define XN_FUNC_YN(typeout, fnc, typein) \ 793extern typeout __attribute__((overloadable)) fnc(typein); \ 794extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) { \ 795 typeout##2 r; \ 796 r.x = fnc(v.x); \ 797 r.y = fnc(v.y); \ 798 return r; \ 799} \ 800extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) { \ 801 typeout##3 r; \ 802 r.x = fnc(v.x); \ 803 r.y = fnc(v.y); \ 804 r.z = fnc(v.z); \ 805 return r; \ 806} \ 807extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) { \ 808 typeout##4 r; \ 809 r.x = fnc(v.x); \ 810 r.y = fnc(v.y); \ 811 r.z = fnc(v.z); \ 812 r.w = fnc(v.w); \ 813 return r; \ 814} 815 816 817#define UIN_FUNC_IN(fnc) \ 818XN_FUNC_YN(uchar, fnc, char) \ 819XN_FUNC_YN(ushort, fnc, short) \ 820XN_FUNC_YN(uint, fnc, int) 821 822#define IN_FUNC_IN(fnc) \ 823XN_FUNC_YN(uchar, fnc, uchar) \ 824XN_FUNC_YN(char, fnc, char) \ 825XN_FUNC_YN(ushort, fnc, ushort) \ 826XN_FUNC_YN(short, fnc, short) \ 827XN_FUNC_YN(uint, fnc, uint) \ 828XN_FUNC_YN(int, fnc, int) 829 830 831#define XN_FUNC_XN_XN_BODY(type, fnc, body) \ 832extern type __attribute__((overloadable)) \ 833 fnc(type v1, type v2) { \ 834 return body; \ 835} \ 836extern type##2 __attribute__((overloadable)) \ 837 fnc(type##2 v1, type##2 v2) { \ 838 type##2 r; \ 839 r.x = fnc(v1.x, v2.x); \ 840 r.y = fnc(v1.y, v2.y); \ 841 return r; \ 842} \ 843extern type##3 __attribute__((overloadable)) \ 844 fnc(type##3 v1, type##3 v2) { \ 845 type##3 r; \ 846 r.x = fnc(v1.x, v2.x); \ 847 r.y = fnc(v1.y, v2.y); \ 848 r.z = fnc(v1.z, v2.z); \ 849 return r; \ 850} \ 851extern type##4 __attribute__((overloadable)) \ 852 fnc(type##4 v1, type##4 v2) { \ 853 type##4 r; \ 854 r.x = fnc(v1.x, v2.x); \ 855 r.y = fnc(v1.y, v2.y); \ 856 r.z = fnc(v1.z, v2.z); \ 857 r.w = fnc(v1.w, v2.w); \ 858 return r; \ 859} 860 861#define IN_FUNC_IN_IN_BODY(fnc, body) \ 862XN_FUNC_XN_XN_BODY(uchar, fnc, body) \ 863XN_FUNC_XN_XN_BODY(char, fnc, body) \ 864XN_FUNC_XN_XN_BODY(ushort, fnc, body) \ 865XN_FUNC_XN_XN_BODY(short, fnc, body) \ 866XN_FUNC_XN_XN_BODY(uint, fnc, body) \ 867XN_FUNC_XN_XN_BODY(int, fnc, body) \ 868XN_FUNC_XN_XN_BODY(float, fnc, body) 869 870 871/** 872 * abs 873 */ 874extern uint32_t __attribute__((overloadable)) abs(int32_t v) { 875 if (v < 0) 876 return -v; 877 return v; 878} 879extern uint16_t __attribute__((overloadable)) abs(int16_t v) { 880 if (v < 0) 881 return -v; 882 return v; 883} 884extern uint8_t __attribute__((overloadable)) abs(int8_t v) { 885 if (v < 0) 886 return -v; 887 return v; 888} 889 890/** 891 * clz 892 * __builtin_clz only accepts a 32-bit unsigned int, so every input will be 893 * expanded to 32 bits. For our smaller data types, we need to subtract off 894 * these unused top bits (that will be always be composed of zeros). 895 */ 896extern uint32_t __attribute__((overloadable)) clz(uint32_t v) { 897 return __builtin_clz(v); 898} 899extern uint16_t __attribute__((overloadable)) clz(uint16_t v) { 900 return __builtin_clz(v) - 16; 901} 902extern uint8_t __attribute__((overloadable)) clz(uint8_t v) { 903 return __builtin_clz(v) - 24; 904} 905extern int32_t __attribute__((overloadable)) clz(int32_t v) { 906 return __builtin_clz(v); 907} 908extern int16_t __attribute__((overloadable)) clz(int16_t v) { 909 return __builtin_clz(((uint32_t)v) & 0x0000ffff) - 16; 910} 911extern int8_t __attribute__((overloadable)) clz(int8_t v) { 912 return __builtin_clz(((uint32_t)v) & 0x000000ff) - 24; 913} 914 915 916UIN_FUNC_IN(abs) 917IN_FUNC_IN(clz) 918 919 920// 6.11.4 921 922 923extern float __attribute__((overloadable)) degrees(float radians) { 924 return radians * (180.f / M_PI); 925} 926extern float2 __attribute__((overloadable)) degrees(float2 radians) { 927 return radians * (180.f / M_PI); 928} 929extern float3 __attribute__((overloadable)) degrees(float3 radians) { 930 return radians * (180.f / M_PI); 931} 932extern float4 __attribute__((overloadable)) degrees(float4 radians) { 933 return radians * (180.f / M_PI); 934} 935 936extern float __attribute__((overloadable)) mix(float start, float stop, float amount) { 937 return start + (stop - start) * amount; 938} 939extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) { 940 return start + (stop - start) * amount; 941} 942extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) { 943 return start + (stop - start) * amount; 944} 945extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) { 946 return start + (stop - start) * amount; 947} 948extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) { 949 return start + (stop - start) * amount; 950} 951extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) { 952 return start + (stop - start) * amount; 953} 954extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) { 955 return start + (stop - start) * amount; 956} 957 958extern float __attribute__((overloadable)) radians(float degrees) { 959 return degrees * (M_PI / 180.f); 960} 961extern float2 __attribute__((overloadable)) radians(float2 degrees) { 962 return degrees * (M_PI / 180.f); 963} 964extern float3 __attribute__((overloadable)) radians(float3 degrees) { 965 return degrees * (M_PI / 180.f); 966} 967extern float4 __attribute__((overloadable)) radians(float4 degrees) { 968 return degrees * (M_PI / 180.f); 969} 970 971extern float __attribute__((overloadable)) step(float edge, float v) { 972 return (v < edge) ? 0.f : 1.f; 973} 974extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) { 975 float2 r; 976 r.x = (v.x < edge.x) ? 0.f : 1.f; 977 r.y = (v.y < edge.y) ? 0.f : 1.f; 978 return r; 979} 980extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) { 981 float3 r; 982 r.x = (v.x < edge.x) ? 0.f : 1.f; 983 r.y = (v.y < edge.y) ? 0.f : 1.f; 984 r.z = (v.z < edge.z) ? 0.f : 1.f; 985 return r; 986} 987extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) { 988 float4 r; 989 r.x = (v.x < edge.x) ? 0.f : 1.f; 990 r.y = (v.y < edge.y) ? 0.f : 1.f; 991 r.z = (v.z < edge.z) ? 0.f : 1.f; 992 r.w = (v.w < edge.w) ? 0.f : 1.f; 993 return r; 994} 995extern float2 __attribute__((overloadable)) step(float2 edge, float v) { 996 float2 r; 997 r.x = (v < edge.x) ? 0.f : 1.f; 998 r.y = (v < edge.y) ? 0.f : 1.f; 999 return r; 1000} 1001extern float3 __attribute__((overloadable)) step(float3 edge, float v) { 1002 float3 r; 1003 r.x = (v < edge.x) ? 0.f : 1.f; 1004 r.y = (v < edge.y) ? 0.f : 1.f; 1005 r.z = (v < edge.z) ? 0.f : 1.f; 1006 return r; 1007} 1008extern float4 __attribute__((overloadable)) step(float4 edge, float v) { 1009 float4 r; 1010 r.x = (v < edge.x) ? 0.f : 1.f; 1011 r.y = (v < edge.y) ? 0.f : 1.f; 1012 r.z = (v < edge.z) ? 0.f : 1.f; 1013 r.w = (v < edge.w) ? 0.f : 1.f; 1014 return r; 1015} 1016extern float2 __attribute__((overloadable)) step(float edge, float2 v) { 1017 float2 r; 1018 r.x = (v.x < edge) ? 0.f : 1.f; 1019 r.y = (v.y < edge) ? 0.f : 1.f; 1020 return r; 1021} 1022extern float3 __attribute__((overloadable)) step(float edge, float3 v) { 1023 float3 r; 1024 r.x = (v.x < edge) ? 0.f : 1.f; 1025 r.y = (v.y < edge) ? 0.f : 1.f; 1026 r.z = (v.z < edge) ? 0.f : 1.f; 1027 return r; 1028} 1029extern float4 __attribute__((overloadable)) step(float edge, float4 v) { 1030 float4 r; 1031 r.x = (v.x < edge) ? 0.f : 1.f; 1032 r.y = (v.y < edge) ? 0.f : 1.f; 1033 r.z = (v.z < edge) ? 0.f : 1.f; 1034 r.w = (v.w < edge) ? 0.f : 1.f; 1035 return r; 1036} 1037 1038extern float __attribute__((overloadable)) sign(float v) { 1039 if (v > 0) return 1.f; 1040 if (v < 0) return -1.f; 1041 return v; 1042} 1043FN_FUNC_FN(sign) 1044 1045 1046// 6.11.5 1047extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) { 1048 float3 r; 1049 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1050 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1051 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1052 return r; 1053} 1054 1055extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) { 1056 float4 r; 1057 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1058 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1059 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1060 r.w = 0.f; 1061 return r; 1062} 1063 1064#if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 1065// These functions must be defined here if we are not using the SSE 1066// implementation, which includes when we are built as part of the 1067// debug runtime (libclcore_debug.bc). 1068 1069extern float __attribute__((overloadable)) dot(float lhs, float rhs) { 1070 return lhs * rhs; 1071} 1072extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) { 1073 return lhs.x*rhs.x + lhs.y*rhs.y; 1074} 1075extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) { 1076 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z; 1077} 1078extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) { 1079 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w; 1080} 1081 1082extern float __attribute__((overloadable)) length(float v) { 1083 return fabs(v); 1084} 1085extern float __attribute__((overloadable)) length(float2 v) { 1086 return sqrt(v.x*v.x + v.y*v.y); 1087} 1088extern float __attribute__((overloadable)) length(float3 v) { 1089 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1090} 1091extern float __attribute__((overloadable)) length(float4 v) { 1092 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1093} 1094 1095#else 1096 1097extern float __attribute__((overloadable)) length(float v); 1098extern float __attribute__((overloadable)) length(float2 v); 1099extern float __attribute__((overloadable)) length(float3 v); 1100extern float __attribute__((overloadable)) length(float4 v); 1101 1102#endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) 1103 1104extern float __attribute__((overloadable)) distance(float lhs, float rhs) { 1105 return length(lhs - rhs); 1106} 1107extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) { 1108 return length(lhs - rhs); 1109} 1110extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) { 1111 return length(lhs - rhs); 1112} 1113extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) { 1114 return length(lhs - rhs); 1115} 1116 1117/* For the normalization functions, vectors of length 0 should simply be 1118 * returned (i.e. all the components of that vector are 0). 1119 */ 1120extern float __attribute__((overloadable)) normalize(float v) { 1121 if (v == 0.0f) { 1122 return 0.0f; 1123 } else if (v < 0.0f) { 1124 return -1.0f; 1125 } else { 1126 return 1.0f; 1127 } 1128} 1129extern float2 __attribute__((overloadable)) normalize(float2 v) { 1130 float l = length(v); 1131 return l == 0.0f ? v : v / l; 1132} 1133extern float3 __attribute__((overloadable)) normalize(float3 v) { 1134 float l = length(v); 1135 return l == 0.0f ? v : v / l; 1136} 1137extern float4 __attribute__((overloadable)) normalize(float4 v) { 1138 float l = length(v); 1139 return l == 0.0f ? v : v / l; 1140} 1141 1142extern float __attribute__((overloadable)) half_sqrt(float v) { 1143 return sqrt(v); 1144} 1145FN_FUNC_FN(half_sqrt) 1146 1147extern float __attribute__((overloadable)) fast_length(float v) { 1148 return fabs(v); 1149} 1150extern float __attribute__((overloadable)) fast_length(float2 v) { 1151 return half_sqrt(v.x*v.x + v.y*v.y); 1152} 1153extern float __attribute__((overloadable)) fast_length(float3 v) { 1154 return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1155} 1156extern float __attribute__((overloadable)) fast_length(float4 v) { 1157 return half_sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1158} 1159 1160extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) { 1161 return fast_length(lhs - rhs); 1162} 1163extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) { 1164 return fast_length(lhs - rhs); 1165} 1166extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) { 1167 return fast_length(lhs - rhs); 1168} 1169extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) { 1170 return fast_length(lhs - rhs); 1171} 1172 1173extern float __attribute__((overloadable)) half_rsqrt(float); 1174 1175/* For the normalization functions, vectors of length 0 should simply be 1176 * returned (i.e. all the components of that vector are 0). 1177 */ 1178extern float __attribute__((overloadable)) fast_normalize(float v) { 1179 if (v == 0.0f) { 1180 return 0.0f; 1181 } else if (v < 0.0f) { 1182 return -1.0f; 1183 } else { 1184 return 1.0f; 1185 } 1186} 1187// If the length is 0, then rlength should be NaN. 1188extern float2 __attribute__((overloadable)) fast_normalize(float2 v) { 1189 float rlength = half_rsqrt(v.x*v.x + v.y*v.y); 1190 return (rlength == rlength) ? v * rlength : v; 1191} 1192extern float3 __attribute__((overloadable)) fast_normalize(float3 v) { 1193 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z); 1194 return (rlength == rlength) ? v * rlength : v; 1195} 1196extern float4 __attribute__((overloadable)) fast_normalize(float4 v) { 1197 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w); 1198 return (rlength == rlength) ? v * rlength : v; 1199} 1200 1201extern float __attribute__((overloadable)) half_recip(float v) { 1202 return 1.f / v; 1203} 1204 1205/* 1206extern float __attribute__((overloadable)) approx_atan(float x) { 1207 if (x == 0.f) 1208 return 0.f; 1209 if (x < 0.f) 1210 return -1.f * approx_atan(-1.f * x); 1211 if (x > 1.f) 1212 return M_PI_2 - approx_atan(approx_recip(x)); 1213 return x * approx_recip(1.f + 0.28f * x*x); 1214} 1215FN_FUNC_FN(approx_atan) 1216*/ 1217 1218typedef union 1219{ 1220 float fv; 1221 int32_t iv; 1222} ieee_float_shape_type; 1223 1224/* Get a 32 bit int from a float. */ 1225 1226#define GET_FLOAT_WORD(i,d) \ 1227do { \ 1228 ieee_float_shape_type gf_u; \ 1229 gf_u.fv = (d); \ 1230 (i) = gf_u.iv; \ 1231} while (0) 1232 1233/* Set a float from a 32 bit int. */ 1234 1235#define SET_FLOAT_WORD(d,i) \ 1236do { \ 1237 ieee_float_shape_type sf_u; \ 1238 sf_u.iv = (i); \ 1239 (d) = sf_u.fv; \ 1240} while (0) 1241 1242 1243 1244// Valid -125 to 125 1245extern float __attribute__((overloadable)) native_exp2(float v) { 1246 int32_t iv = (int)v; 1247 int32_t x = iv + (iv >> 31); // ~floor(v) 1248 float r = (v - x); 1249 1250 float fo; 1251 SET_FLOAT_WORD(fo, (x + 127) << 23); 1252 1253 r *= 0.694f; // ~ log(e) / log(2) 1254 float r2 = r*r; 1255 float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1256 return fo * adj; 1257} 1258 1259extern float2 __attribute__((overloadable)) native_exp2(float2 v) { 1260 int2 iv = convert_int2(v); 1261 int2 x = iv + (iv >> (int2)31);//floor(v); 1262 float2 r = (v - convert_float2(x)); 1263 1264 x += 127; 1265 1266 float2 fo = (float2)(x << (int2)23); 1267 1268 r *= 0.694f; // ~ log(e) / log(2) 1269 float2 r2 = r*r; 1270 float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1271 return fo * adj; 1272} 1273 1274extern float4 __attribute__((overloadable)) native_exp2(float4 v) { 1275 int4 iv = convert_int4(v); 1276 int4 x = iv + (iv >> (int4)31);//floor(v); 1277 float4 r = (v - convert_float4(x)); 1278 1279 x += 127; 1280 1281 float4 fo = (float4)(x << (int4)23); 1282 1283 r *= 0.694f; // ~ log(e) / log(2) 1284 float4 r2 = r*r; 1285 float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f); 1286 return fo * adj; 1287} 1288 1289extern float3 __attribute__((overloadable)) native_exp2(float3 v) { 1290 float4 t = 1.f; 1291 t.xyz = v; 1292 return native_exp2(t).xyz; 1293} 1294 1295 1296extern float __attribute__((overloadable)) native_exp(float v) { 1297 return native_exp2(v * 1.442695041f); 1298} 1299extern float2 __attribute__((overloadable)) native_exp(float2 v) { 1300 return native_exp2(v * 1.442695041f); 1301} 1302extern float3 __attribute__((overloadable)) native_exp(float3 v) { 1303 return native_exp2(v * 1.442695041f); 1304} 1305extern float4 __attribute__((overloadable)) native_exp(float4 v) { 1306 return native_exp2(v * 1.442695041f); 1307} 1308 1309extern float __attribute__((overloadable)) native_exp10(float v) { 1310 return native_exp2(v * 3.321928095f); 1311} 1312extern float2 __attribute__((overloadable)) native_exp10(float2 v) { 1313 return native_exp2(v * 3.321928095f); 1314} 1315extern float3 __attribute__((overloadable)) native_exp10(float3 v) { 1316 return native_exp2(v * 3.321928095f); 1317} 1318extern float4 __attribute__((overloadable)) native_exp10(float4 v) { 1319 return native_exp2(v * 3.321928095f); 1320} 1321 1322extern float __attribute__((overloadable)) native_log2(float v) { 1323 int32_t ibits; 1324 GET_FLOAT_WORD(ibits, v); 1325 1326 int32_t e = (ibits >> 23) & 0xff; 1327 1328 ibits &= 0x7fffff; 1329 ibits |= 127 << 23; 1330 1331 float ir; 1332 SET_FLOAT_WORD(ir, ibits); 1333 ir -= 1.5f; 1334 float ir2 = ir*ir; 1335 float adj2 = (0.405465108f / 0.693147181f) + 1336 ((0.666666667f / 0.693147181f) * ir) - 1337 ((0.222222222f / 0.693147181f) * ir2) + 1338 ((0.098765432f / 0.693147181f) * ir*ir2) - 1339 ((0.049382716f / 0.693147181f) * ir2*ir2) + 1340 ((0.026337449f / 0.693147181f) * ir*ir2*ir2) - 1341 ((0.014631916f / 0.693147181f) * ir2*ir2*ir2); 1342 return (float)(e - 127) + adj2; 1343} 1344extern float2 __attribute__((overloadable)) native_log2(float2 v) { 1345 float2 v2 = {native_log2(v.x), native_log2(v.y)}; 1346 return v2; 1347} 1348extern float3 __attribute__((overloadable)) native_log2(float3 v) { 1349 float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)}; 1350 return v2; 1351} 1352extern float4 __attribute__((overloadable)) native_log2(float4 v) { 1353 float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)}; 1354 return v2; 1355} 1356 1357extern float __attribute__((overloadable)) native_log(float v) { 1358 return native_log2(v) * (1.f / 1.442695041f); 1359} 1360extern float2 __attribute__((overloadable)) native_log(float2 v) { 1361 return native_log2(v) * (1.f / 1.442695041f); 1362} 1363extern float3 __attribute__((overloadable)) native_log(float3 v) { 1364 return native_log2(v) * (1.f / 1.442695041f); 1365} 1366extern float4 __attribute__((overloadable)) native_log(float4 v) { 1367 return native_log2(v) * (1.f / 1.442695041f); 1368} 1369 1370extern float __attribute__((overloadable)) native_log10(float v) { 1371 return native_log2(v) * (1.f / 3.321928095f); 1372} 1373extern float2 __attribute__((overloadable)) native_log10(float2 v) { 1374 return native_log2(v) * (1.f / 3.321928095f); 1375} 1376extern float3 __attribute__((overloadable)) native_log10(float3 v) { 1377 return native_log2(v) * (1.f / 3.321928095f); 1378} 1379extern float4 __attribute__((overloadable)) native_log10(float4 v) { 1380 return native_log2(v) * (1.f / 3.321928095f); 1381} 1382 1383 1384extern float __attribute__((overloadable)) native_powr(float v, float y) { 1385 float v2 = native_log2(v); 1386 v2 = fmax(v2 * y, -125.f); 1387 return native_exp2(v2); 1388} 1389extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) { 1390 float2 v2 = native_log2(v); 1391 v2 = fmax(v2 * y, -125.f); 1392 return native_exp2(v2); 1393} 1394extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) { 1395 float3 v2 = native_log2(v); 1396 v2 = fmax(v2 * y, -125.f); 1397 return native_exp2(v2); 1398} 1399extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) { 1400 float4 v2 = native_log2(v); 1401 v2 = fmax(v2 * y, -125.f); 1402 return native_exp2(v2); 1403} 1404 1405extern double __attribute__((overloadable)) min(double v1, double v2) { 1406 return v1 < v2 ? v1 : v2; 1407} 1408 1409extern double2 __attribute__((overloadable)) min(double2 v1, double2 v2) { 1410 double2 r; 1411 r.x = v1.x < v2.x ? v1.x : v2.x; 1412 r.y = v1.y < v2.y ? v1.y : v2.y; 1413 return r; 1414} 1415 1416extern double3 __attribute__((overloadable)) min(double3 v1, double3 v2) { 1417 double3 r; 1418 r.x = v1.x < v2.x ? v1.x : v2.x; 1419 r.y = v1.y < v2.y ? v1.y : v2.y; 1420 r.z = v1.z < v2.z ? v1.z : v2.z; 1421 return r; 1422} 1423 1424extern double4 __attribute__((overloadable)) min(double4 v1, double4 v2) { 1425 double4 r; 1426 r.x = v1.x < v2.x ? v1.x : v2.x; 1427 r.y = v1.y < v2.y ? v1.y : v2.y; 1428 r.z = v1.z < v2.z ? v1.z : v2.z; 1429 r.w = v1.w < v2.w ? v1.w : v2.w; 1430 return r; 1431} 1432 1433extern long __attribute__((overloadable)) min(long v1, long v2) { 1434 return v1 < v2 ? v1 : v2; 1435} 1436extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) { 1437 long2 r; 1438 r.x = v1.x < v2.x ? v1.x : v2.x; 1439 r.y = v1.y < v2.y ? v1.y : v2.y; 1440 return r; 1441} 1442extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) { 1443 long3 r; 1444 r.x = v1.x < v2.x ? v1.x : v2.x; 1445 r.y = v1.y < v2.y ? v1.y : v2.y; 1446 r.z = v1.z < v2.z ? v1.z : v2.z; 1447 return r; 1448} 1449extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) { 1450 long4 r; 1451 r.x = v1.x < v2.x ? v1.x : v2.x; 1452 r.y = v1.y < v2.y ? v1.y : v2.y; 1453 r.z = v1.z < v2.z ? v1.z : v2.z; 1454 r.w = v1.w < v2.w ? v1.w : v2.w; 1455 return r; 1456} 1457 1458extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) { 1459 return v1 < v2 ? v1 : v2; 1460} 1461extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) { 1462 ulong2 r; 1463 r.x = v1.x < v2.x ? v1.x : v2.x; 1464 r.y = v1.y < v2.y ? v1.y : v2.y; 1465 return r; 1466} 1467extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) { 1468 ulong3 r; 1469 r.x = v1.x < v2.x ? v1.x : v2.x; 1470 r.y = v1.y < v2.y ? v1.y : v2.y; 1471 r.z = v1.z < v2.z ? v1.z : v2.z; 1472 return r; 1473} 1474extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) { 1475 ulong4 r; 1476 r.x = v1.x < v2.x ? v1.x : v2.x; 1477 r.y = v1.y < v2.y ? v1.y : v2.y; 1478 r.z = v1.z < v2.z ? v1.z : v2.z; 1479 r.w = v1.w < v2.w ? v1.w : v2.w; 1480 return r; 1481} 1482 1483extern double __attribute__((overloadable)) max(double v1, double v2) { 1484 return v1 > v2 ? v1 : v2; 1485} 1486 1487extern double2 __attribute__((overloadable)) max(double2 v1, double2 v2) { 1488 double2 r; 1489 r.x = v1.x > v2.x ? v1.x : v2.x; 1490 r.y = v1.y > v2.y ? v1.y : v2.y; 1491 return r; 1492} 1493 1494extern double3 __attribute__((overloadable)) max(double3 v1, double3 v2) { 1495 double3 r; 1496 r.x = v1.x > v2.x ? v1.x : v2.x; 1497 r.y = v1.y > v2.y ? v1.y : v2.y; 1498 r.z = v1.z > v2.z ? v1.z : v2.z; 1499 return r; 1500} 1501 1502extern double4 __attribute__((overloadable)) max(double4 v1, double4 v2) { 1503 double4 r; 1504 r.x = v1.x > v2.x ? v1.x : v2.x; 1505 r.y = v1.y > v2.y ? v1.y : v2.y; 1506 r.z = v1.z > v2.z ? v1.z : v2.z; 1507 r.w = v1.w > v2.w ? v1.w : v2.w; 1508 return r; 1509} 1510 1511extern long __attribute__((overloadable)) max(long v1, long v2) { 1512 return v1 > v2 ? v1 : v2; 1513} 1514extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) { 1515 long2 r; 1516 r.x = v1.x > v2.x ? v1.x : v2.x; 1517 r.y = v1.y > v2.y ? v1.y : v2.y; 1518 return r; 1519} 1520extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) { 1521 long3 r; 1522 r.x = v1.x > v2.x ? v1.x : v2.x; 1523 r.y = v1.y > v2.y ? v1.y : v2.y; 1524 r.z = v1.z > v2.z ? v1.z : v2.z; 1525 return r; 1526} 1527extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) { 1528 long4 r; 1529 r.x = v1.x > v2.x ? v1.x : v2.x; 1530 r.y = v1.y > v2.y ? v1.y : v2.y; 1531 r.z = v1.z > v2.z ? v1.z : v2.z; 1532 r.w = v1.w > v2.w ? v1.w : v2.w; 1533 return r; 1534} 1535 1536extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) { 1537 return v1 > v2 ? v1 : v2; 1538} 1539extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) { 1540 ulong2 r; 1541 r.x = v1.x > v2.x ? v1.x : v2.x; 1542 r.y = v1.y > v2.y ? v1.y : v2.y; 1543 return r; 1544} 1545extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) { 1546 ulong3 r; 1547 r.x = v1.x > v2.x ? v1.x : v2.x; 1548 r.y = v1.y > v2.y ? v1.y : v2.y; 1549 r.z = v1.z > v2.z ? v1.z : v2.z; 1550 return r; 1551} 1552extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) { 1553 ulong4 r; 1554 r.x = v1.x > v2.x ? v1.x : v2.x; 1555 r.y = v1.y > v2.y ? v1.y : v2.y; 1556 r.z = v1.z > v2.z ? v1.z : v2.z; 1557 r.w = v1.w > v2.w ? v1.w : v2.w; 1558 return r; 1559} 1560 1561#define THUNK_NATIVE_F(fn) \ 1562 float __attribute__((overloadable)) native_##fn(float v) { return fn(v);} \ 1563 float2 __attribute__((overloadable)) native_##fn(float2 v) { return fn(v);} \ 1564 float3 __attribute__((overloadable)) native_##fn(float3 v) { return fn(v);} \ 1565 float4 __attribute__((overloadable)) native_##fn(float4 v) { return fn(v);} 1566 1567#define THUNK_NATIVE_F_F(fn) \ 1568 float __attribute__((overloadable)) native_##fn(float v1, float v2) { return fn(v1, v2);} \ 1569 float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 v2) { return fn(v1, v2);} \ 1570 float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 v2) { return fn(v1, v2);} \ 1571 float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 v2) { return fn(v1, v2);} 1572 1573#define THUNK_NATIVE_F_FP(fn) \ 1574 float __attribute__((overloadable)) native_##fn(float v1, float *v2) { return fn(v1, v2);} \ 1575 float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 *v2) { return fn(v1, v2);} \ 1576 float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 *v2) { return fn(v1, v2);} \ 1577 float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 *v2) { return fn(v1, v2);} 1578 1579#define THUNK_NATIVE_F_I(fn) \ 1580 float __attribute__((overloadable)) native_##fn(float v1, int v2) { return fn(v1, v2);} \ 1581 float2 __attribute__((overloadable)) native_##fn(float2 v1, int2 v2) { return fn(v1, v2);} \ 1582 float3 __attribute__((overloadable)) native_##fn(float3 v1, int3 v2) { return fn(v1, v2);} \ 1583 float4 __attribute__((overloadable)) native_##fn(float4 v1, int4 v2) { return fn(v1, v2);} 1584 1585THUNK_NATIVE_F(acos) 1586THUNK_NATIVE_F(acosh) 1587THUNK_NATIVE_F(acospi) 1588THUNK_NATIVE_F(asin) 1589THUNK_NATIVE_F(asinh) 1590THUNK_NATIVE_F(asinpi) 1591THUNK_NATIVE_F(atan) 1592THUNK_NATIVE_F_F(atan2) 1593THUNK_NATIVE_F(atanh) 1594THUNK_NATIVE_F(atanpi) 1595THUNK_NATIVE_F_F(atan2pi) 1596THUNK_NATIVE_F(cbrt) 1597THUNK_NATIVE_F(cos) 1598THUNK_NATIVE_F(cosh) 1599THUNK_NATIVE_F(cospi) 1600THUNK_NATIVE_F(expm1) 1601THUNK_NATIVE_F_F(hypot) 1602THUNK_NATIVE_F(log1p) 1603THUNK_NATIVE_F_I(rootn) 1604THUNK_NATIVE_F(rsqrt) 1605THUNK_NATIVE_F(sqrt) 1606THUNK_NATIVE_F(sin) 1607THUNK_NATIVE_F_FP(sincos) 1608THUNK_NATIVE_F(sinh) 1609THUNK_NATIVE_F(sinpi) 1610THUNK_NATIVE_F(tan) 1611THUNK_NATIVE_F(tanh) 1612THUNK_NATIVE_F(tanpi) 1613 1614#undef THUNK_NATIVE_F 1615#undef THUNK_NATIVE_F_F 1616#undef THUNK_NATIVE_F_I 1617#undef THUNK_NATIVE_F_FP 1618 1619float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);} 1620float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);} 1621float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);} 1622float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);} 1623 1624float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);} 1625float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);} 1626float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);} 1627float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);} 1628 1629float __attribute__((overloadable)) native_length(float v) { return fast_length(v);} 1630float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);} 1631float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);} 1632float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);} 1633 1634float __attribute__((overloadable)) native_divide(float v1, float v2) { return v1 / v2;} 1635float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;} 1636float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;} 1637float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;} 1638 1639float __attribute__((overloadable)) native_recip(float v) { return 1.f / v;} 1640float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;} 1641float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;} 1642float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;} 1643 1644 1645 1646 1647 1648#undef FN_FUNC_FN 1649#undef IN_FUNC_FN 1650#undef FN_FUNC_FN_FN 1651#undef FN_FUNC_FN_F 1652#undef FN_FUNC_FN_IN 1653#undef FN_FUNC_FN_I 1654#undef FN_FUNC_FN_PFN 1655#undef FN_FUNC_FN_PIN 1656#undef FN_FUNC_FN_FN_FN 1657#undef FN_FUNC_FN_FN_PIN 1658#undef XN_FUNC_YN 1659#undef UIN_FUNC_IN 1660#undef IN_FUNC_IN 1661#undef XN_FUNC_XN_XN_BODY 1662#undef IN_FUNC_IN_IN_BODY 1663 1664static const unsigned short kHalfPositiveInfinity = 0x7c00; 1665 1666/* Define f16 functions of the form 1667 * HN output = fn(HN input) 1668 * where HN is scalar or vector half type 1669 */ 1670#define HN_FUNC_HN(fn) \ 1671extern half __attribute__((overloadable)) fn(half h) { \ 1672 return (half) fn((float) h); \ 1673} \ 1674extern half2 __attribute__((overloadable)) fn(half2 v) { \ 1675 return convert_half2(fn(convert_float2(v))); \ 1676} \ 1677extern half3 __attribute__((overloadable)) fn(half3 v) { \ 1678 return convert_half3(fn(convert_float3(v))); \ 1679} \ 1680extern half4 __attribute__((overloadable)) fn(half4 v) { \ 1681 return convert_half4(fn(convert_float4(v))); \ 1682} 1683 1684/* Define f16 functions of the form 1685 * HN output = fn(HN input1, HN input2) 1686 * where HN is scalar or vector half type 1687 */ 1688#define HN_FUNC_HN_HN(fn) \ 1689extern half __attribute__((overloadable)) fn(half h1, half h2) { \ 1690 return (half) fn((float) h1, (float) h2); \ 1691} \ 1692extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1693 return convert_half2(fn(convert_float2(v1), \ 1694 convert_float2(v2))); \ 1695} \ 1696extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1697 return convert_half3(fn(convert_float3(v1), \ 1698 convert_float3(v2))); \ 1699} \ 1700extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1701 return convert_half4(fn(convert_float4(v1), \ 1702 convert_float4(v2))); \ 1703} 1704 1705/* Define f16 functions of the form 1706 * HN output = fn(HN input1, half input2) 1707 * where HN is scalar or vector half type 1708 */ 1709#define HN_FUNC_HN_H(fn) \ 1710extern half2 __attribute__((overloadable)) fn(half2 v1, half v2) { \ 1711 return convert_half2(fn(convert_float2(v1), (float) v2)); \ 1712} \ 1713extern half3 __attribute__((overloadable)) fn(half3 v1, half v2) { \ 1714 return convert_half3(fn(convert_float3(v1), (float) v2)); \ 1715} \ 1716extern half4 __attribute__((overloadable)) fn(half4 v1, half v2) { \ 1717 return convert_half4(fn(convert_float4(v1), (float) v2)); \ 1718} 1719 1720/* Define f16 functions of the form 1721 * HN output = fn(HN input1, HN input2, HN input3) 1722 * where HN is scalar or vector half type 1723 */ 1724#define HN_FUNC_HN_HN_HN(fn) \ 1725extern half __attribute__((overloadable)) fn(half h1, half h2, half h3) { \ 1726 return (half) fn((float) h1, (float) h2, (float) h3); \ 1727} \ 1728extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2, half2 v3) { \ 1729 return convert_half2(fn(convert_float2(v1), \ 1730 convert_float2(v2), \ 1731 convert_float2(v3))); \ 1732} \ 1733extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2, half3 v3) { \ 1734 return convert_half3(fn(convert_float3(v1), \ 1735 convert_float3(v2), \ 1736 convert_float3(v3))); \ 1737} \ 1738extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2, half4 v3) { \ 1739 return convert_half4(fn(convert_float4(v1), \ 1740 convert_float4(v2), \ 1741 convert_float4(v3))); \ 1742} 1743 1744/* Define f16 functions of the form 1745 * HN output = fn(HN input1, IN input2) 1746 * where HN is scalar or vector half type and IN the equivalent integer type 1747 * of same vector length. 1748 */ 1749#define HN_FUNC_HN_IN(fn) \ 1750extern half __attribute__((overloadable)) fn(half h1, int v) { \ 1751 return (half) fn((float) h1, v); \ 1752} \ 1753extern half2 __attribute__((overloadable)) fn(half2 v1, int2 v2) { \ 1754 return convert_half2(fn(convert_float2(v1), v2)); \ 1755} \ 1756extern half3 __attribute__((overloadable)) fn(half3 v1, int3 v2) { \ 1757 return convert_half3(fn(convert_float3(v1), v2)); \ 1758} \ 1759extern half4 __attribute__((overloadable)) fn(half4 v1, int4 v2) { \ 1760 return convert_half4(fn(convert_float4(v1), v2)); \ 1761} 1762 1763/* Define f16 functions of the form 1764 * half output = fn(HN input1) 1765 * where HN is a scalar or vector half type. 1766 */ 1767#define H_FUNC_HN(fn) \ 1768extern half __attribute__((overloadable)) fn(half h) { \ 1769 return (half) fn((float) h); \ 1770} \ 1771extern half __attribute__((overloadable)) fn(half2 v) { \ 1772 return fn(convert_float2(v)); \ 1773} \ 1774extern half __attribute__((overloadable)) fn(half3 v) { \ 1775 return fn(convert_float3(v)); \ 1776} \ 1777extern half __attribute__((overloadable)) fn(half4 v) { \ 1778 return fn(convert_float4(v)); \ 1779} 1780 1781/* Define f16 functions of the form 1782 * half output = fn(HN input1, HN input2) 1783 * where HN is a scalar or vector half type. 1784 */ 1785#define H_FUNC_HN_HN(fn) \ 1786extern half __attribute__((overloadable)) fn(half h1, half h2) { \ 1787 return (half) fn((float) h1, (float) h2); \ 1788} \ 1789extern half __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1790 return fn(convert_float2(v1), convert_float2(v2)); \ 1791} \ 1792extern half __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1793 return fn(convert_float3(v1), convert_float3(v2)); \ 1794} \ 1795extern half __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1796 return fn(convert_float4(v1), convert_float4(v2)); \ 1797} 1798 1799/* Define f16 functions of the form 1800 * HN output = fn(HN input1, HN input2) 1801 * where HN is a vector half type. The functions are defined to call the 1802 * scalar function of the same name. 1803 */ 1804#define SCALARIZE_HN_FUNC_HN_HN(fn) \ 1805extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \ 1806 half2 ret; \ 1807 ret.x = fn(v1.x, v2.x); \ 1808 ret.y = fn(v1.y, v2.y); \ 1809 return ret; \ 1810} \ 1811extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \ 1812 half3 ret; \ 1813 ret.x = fn(v1.x, v2.x); \ 1814 ret.y = fn(v1.y, v2.y); \ 1815 ret.z = fn(v1.z, v2.z); \ 1816 return ret; \ 1817} \ 1818extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \ 1819 half4 ret; \ 1820 ret.x = fn(v1.x, v2.x); \ 1821 ret.y = fn(v1.y, v2.y); \ 1822 ret.z = fn(v1.z, v2.z); \ 1823 ret.w = fn(v1.w, v2.w); \ 1824 return ret; \ 1825} \ 1826 1827HN_FUNC_HN(acos); 1828HN_FUNC_HN(acosh); 1829HN_FUNC_HN(acospi); 1830HN_FUNC_HN(asin); 1831HN_FUNC_HN(asinh); 1832HN_FUNC_HN(asinpi); 1833HN_FUNC_HN(atan); 1834HN_FUNC_HN(atanh); 1835HN_FUNC_HN(atanpi); 1836HN_FUNC_HN_HN(atan2); 1837HN_FUNC_HN_HN(atan2pi); 1838 1839HN_FUNC_HN(cbrt); 1840HN_FUNC_HN(ceil); 1841 1842extern half __attribute__((overloadable)) copysign(half x, half y); 1843SCALARIZE_HN_FUNC_HN_HN(copysign); 1844 1845HN_FUNC_HN(cos); 1846HN_FUNC_HN(cosh); 1847HN_FUNC_HN(cospi); 1848 1849extern half3 __attribute__((overloadable)) cross(half3 lhs, half3 rhs) { 1850 half3 r; 1851 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1852 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1853 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1854 return r; 1855} 1856 1857extern half4 __attribute__((overloadable)) cross(half4 lhs, half4 rhs) { 1858 half4 r; 1859 r.x = lhs.y * rhs.z - lhs.z * rhs.y; 1860 r.y = lhs.z * rhs.x - lhs.x * rhs.z; 1861 r.z = lhs.x * rhs.y - lhs.y * rhs.x; 1862 r.w = 0.f; 1863 return r; 1864} 1865 1866HN_FUNC_HN(degrees); 1867H_FUNC_HN_HN(distance); 1868H_FUNC_HN_HN(dot); 1869 1870HN_FUNC_HN(erf); 1871HN_FUNC_HN(erfc); 1872HN_FUNC_HN(exp); 1873HN_FUNC_HN(exp10); 1874HN_FUNC_HN(exp2); 1875HN_FUNC_HN(expm1); 1876 1877HN_FUNC_HN(fabs); 1878HN_FUNC_HN_HN(fdim); 1879HN_FUNC_HN(floor); 1880HN_FUNC_HN_HN_HN(fma); 1881HN_FUNC_HN_HN(fmax); 1882HN_FUNC_HN_H(fmax); 1883HN_FUNC_HN_HN(fmin); 1884HN_FUNC_HN_H(fmin); 1885HN_FUNC_HN_HN(fmod); 1886 1887// TODO Add (both variants) of fract 1888// TODO Add frexp 1889 1890HN_FUNC_HN_HN(hypot); 1891 1892extern int __attribute__((overloadable)) ilogb(half x); 1893 1894extern int2 __attribute__((overloadable)) ilogb(half2 v) { 1895 int2 ret; 1896 ret.x = ilogb(v.x); 1897 ret.y = ilogb(v.y); 1898 return ret; 1899} 1900extern int3 __attribute__((overloadable)) ilogb(half3 v) { 1901 int3 ret; 1902 ret.x = ilogb(v.x); 1903 ret.y = ilogb(v.y); 1904 ret.z = ilogb(v.z); 1905 return ret; 1906} 1907extern int4 __attribute__((overloadable)) ilogb(half4 v) { 1908 int4 ret; 1909 ret.x = ilogb(v.x); 1910 ret.y = ilogb(v.y); 1911 ret.z = ilogb(v.z); 1912 ret.w = ilogb(v.w); 1913 return ret; 1914} 1915 1916HN_FUNC_HN_IN(ldexp); 1917extern half2 __attribute__((overloadable)) ldexp(half2 v, int exponent) { 1918 return convert_half2(ldexp(convert_float2(v), exponent)); 1919} 1920extern half3 __attribute__((overloadable)) ldexp(half3 v, int exponent) { 1921 return convert_half3(ldexp(convert_float3(v), exponent)); 1922} 1923extern half4 __attribute__((overloadable)) ldexp(half4 v, int exponent) { 1924 return convert_half4(ldexp(convert_float4(v), exponent)); 1925} 1926 1927H_FUNC_HN(length); 1928HN_FUNC_HN(lgamma); 1929 1930extern half __attribute__((overloadable)) lgamma(half h, int *signp) { 1931 return (half) lgamma((float) h, signp); 1932} 1933extern half2 __attribute__((overloadable)) lgamma(half2 v, int2 *signp) { 1934 return convert_half2(lgamma(convert_float2(v), signp)); 1935} 1936extern half3 __attribute__((overloadable)) lgamma(half3 v, int3 *signp) { 1937 return convert_half3(lgamma(convert_float3(v), signp)); 1938} 1939extern half4 __attribute__((overloadable)) lgamma(half4 v, int4 *signp) { 1940 return convert_half4(lgamma(convert_float4(v), signp)); 1941} 1942 1943HN_FUNC_HN(log); 1944HN_FUNC_HN(log10); 1945HN_FUNC_HN(log1p); 1946HN_FUNC_HN(log2); 1947HN_FUNC_HN(logb); 1948 1949HN_FUNC_HN_HN_HN(mad); 1950HN_FUNC_HN_HN(max); 1951HN_FUNC_HN_H(max); // TODO can this be arch-specific similar to _Z3maxDv2_ff? 1952HN_FUNC_HN_HN(min); 1953HN_FUNC_HN_H(min); // TODO can this be arch-specific similar to _Z3minDv2_ff? 1954 1955extern half __attribute__((overloadable)) mix(half start, half stop, half amount) { 1956 return start + (stop - start) * amount; 1957} 1958extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half2 amount) { 1959 return start + (stop - start) * amount; 1960} 1961extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half3 amount) { 1962 return start + (stop - start) * amount; 1963} 1964extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half4 amount) { 1965 return start + (stop - start) * amount; 1966} 1967extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half amount) { 1968 return start + (stop - start) * amount; 1969} 1970extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half amount) { 1971 return start + (stop - start) * amount; 1972} 1973extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half amount) { 1974 return start + (stop - start) * amount; 1975} 1976 1977// TODO Define modf. Does it make sense to delegate to the float? 1978 1979half __attribute__((overloadable)) nan_half() { 1980 unsigned short nan_short = kHalfPositiveInfinity | 0x0200; 1981 half nan; 1982 SET_HALF_WORD(nan, nan_short); 1983 return nan; 1984} 1985 1986HN_FUNC_HN(normalize); 1987 1988extern half __attribute__((overloadable)) nextafter(half x, half y); 1989SCALARIZE_HN_FUNC_HN_HN(nextafter); 1990 1991HN_FUNC_HN_HN(pow); 1992HN_FUNC_HN_IN(pown); 1993HN_FUNC_HN_HN(powr); 1994HN_FUNC_HN(radians); 1995HN_FUNC_HN_HN(remainder); 1996 1997extern half __attribute__((overloadable)) remquo(half n, half d, int *quo) { 1998 return (float) remquo((float) n, (float) d, quo); 1999} 2000extern half2 __attribute__((overloadable)) remquo(half2 n, half2 d, int2 *quo) { 2001 return convert_half2(remquo(convert_float2(d), convert_float2(n), quo)); 2002} 2003extern half3 __attribute__((overloadable)) remquo(half3 n, half3 d, int3 *quo) { 2004 return convert_half3(remquo(convert_float3(d), convert_float3(n), quo)); 2005} 2006extern half4 __attribute__((overloadable)) remquo(half4 n, half4 d, int4 *quo) { 2007 return convert_half4(remquo(convert_float4(d), convert_float4(n), quo)); 2008} 2009 2010HN_FUNC_HN(rint); 2011HN_FUNC_HN_IN(rootn); 2012HN_FUNC_HN(round); 2013HN_FUNC_HN(rsqrt); 2014 2015extern half __attribute__((overloadable)) sign(half h) { 2016 if (h > 0) return (half) 1.f; 2017 if (h < 0) return (half) -1.f; 2018 return h; 2019} 2020extern half2 __attribute__((overloadable)) sign(half2 v) { 2021 half2 ret; 2022 ret.x = sign(v.x); 2023 ret.y = sign(v.y); 2024 return ret; 2025} 2026extern half3 __attribute__((overloadable)) sign(half3 v) { 2027 half3 ret; 2028 ret.x = sign(v.x); 2029 ret.y = sign(v.y); 2030 ret.z = sign(v.z); 2031 return ret; 2032} 2033extern half4 __attribute__((overloadable)) sign(half4 v) { 2034 half4 ret; 2035 ret.x = sign(v.x); 2036 ret.y = sign(v.y); 2037 ret.z = sign(v.z); 2038 ret.w = sign(v.w); 2039 return ret; 2040} 2041 2042HN_FUNC_HN(sin); 2043 2044extern half __attribute__((overloadable)) sincos(half v, half *cosptr) { 2045 *cosptr = cos(v); 2046 return sin(v); 2047} 2048// TODO verify if LLVM eliminates the duplicate convert_float2 2049extern half2 __attribute__((overloadable)) sincos(half2 v, half2 *cosptr) { 2050 *cosptr = cos(v); 2051 return sin(v); 2052} 2053extern half3 __attribute__((overloadable)) sincos(half3 v, half3 *cosptr) { 2054 *cosptr = cos(v); 2055 return sin(v); 2056} 2057extern half4 __attribute__((overloadable)) sincos(half4 v, half4 *cosptr) { 2058 *cosptr = cos(v); 2059 return sin(v); 2060} 2061 2062HN_FUNC_HN(sinh); 2063HN_FUNC_HN(sinpi); 2064HN_FUNC_HN(sqrt); 2065 2066extern half __attribute__((overloadable)) step(half edge, half v) { 2067 return (v < edge) ? 0.f : 1.f; 2068} 2069extern half2 __attribute__((overloadable)) step(half2 edge, half2 v) { 2070 half2 r; 2071 r.x = (v.x < edge.x) ? 0.f : 1.f; 2072 r.y = (v.y < edge.y) ? 0.f : 1.f; 2073 return r; 2074} 2075extern half3 __attribute__((overloadable)) step(half3 edge, half3 v) { 2076 half3 r; 2077 r.x = (v.x < edge.x) ? 0.f : 1.f; 2078 r.y = (v.y < edge.y) ? 0.f : 1.f; 2079 r.z = (v.z < edge.z) ? 0.f : 1.f; 2080 return r; 2081} 2082extern half4 __attribute__((overloadable)) step(half4 edge, half4 v) { 2083 half4 r; 2084 r.x = (v.x < edge.x) ? 0.f : 1.f; 2085 r.y = (v.y < edge.y) ? 0.f : 1.f; 2086 r.z = (v.z < edge.z) ? 0.f : 1.f; 2087 r.w = (v.w < edge.w) ? 0.f : 1.f; 2088 return r; 2089} 2090extern half2 __attribute__((overloadable)) step(half2 edge, half v) { 2091 half2 r; 2092 r.x = (v < edge.x) ? 0.f : 1.f; 2093 r.y = (v < edge.y) ? 0.f : 1.f; 2094 return r; 2095} 2096extern half3 __attribute__((overloadable)) step(half3 edge, half v) { 2097 half3 r; 2098 r.x = (v < edge.x) ? 0.f : 1.f; 2099 r.y = (v < edge.y) ? 0.f : 1.f; 2100 r.z = (v < edge.z) ? 0.f : 1.f; 2101 return r; 2102} 2103extern half4 __attribute__((overloadable)) step(half4 edge, half v) { 2104 half4 r; 2105 r.x = (v < edge.x) ? 0.f : 1.f; 2106 r.y = (v < edge.y) ? 0.f : 1.f; 2107 r.z = (v < edge.z) ? 0.f : 1.f; 2108 r.w = (v < edge.w) ? 0.f : 1.f; 2109 return r; 2110} 2111extern half2 __attribute__((overloadable)) step(half edge, half2 v) { 2112 half2 r; 2113 r.x = (v.x < edge) ? 0.f : 1.f; 2114 r.y = (v.y < edge) ? 0.f : 1.f; 2115 return r; 2116} 2117extern half3 __attribute__((overloadable)) step(half edge, half3 v) { 2118 half3 r; 2119 r.x = (v.x < edge) ? 0.f : 1.f; 2120 r.y = (v.y < edge) ? 0.f : 1.f; 2121 r.z = (v.z < edge) ? 0.f : 1.f; 2122 return r; 2123} 2124extern half4 __attribute__((overloadable)) step(half edge, half4 v) { 2125 half4 r; 2126 r.x = (v.x < edge) ? 0.f : 1.f; 2127 r.y = (v.y < edge) ? 0.f : 1.f; 2128 r.z = (v.z < edge) ? 0.f : 1.f; 2129 r.w = (v.w < edge) ? 0.f : 1.f; 2130 return r; 2131} 2132 2133HN_FUNC_HN(tan); 2134HN_FUNC_HN(tanh); 2135HN_FUNC_HN(tanpi); 2136HN_FUNC_HN(tgamma); 2137HN_FUNC_HN(trunc); // TODO: rethink: needs half-specific implementation? 2138 2139HN_FUNC_HN(native_acos); 2140HN_FUNC_HN(native_acosh); 2141HN_FUNC_HN(native_acospi); 2142HN_FUNC_HN(native_asin); 2143HN_FUNC_HN(native_asinh); 2144HN_FUNC_HN(native_asinpi); 2145HN_FUNC_HN(native_atan); 2146HN_FUNC_HN(native_atanh); 2147HN_FUNC_HN(native_atanpi); 2148HN_FUNC_HN_HN(native_atan2); 2149HN_FUNC_HN_HN(native_atan2pi); 2150 2151HN_FUNC_HN(native_cbrt); 2152HN_FUNC_HN(native_cos); 2153HN_FUNC_HN(native_cosh); 2154HN_FUNC_HN(native_cospi); 2155 2156H_FUNC_HN_HN(native_distance); 2157HN_FUNC_HN_HN(native_divide); 2158 2159HN_FUNC_HN(native_exp); 2160HN_FUNC_HN(native_exp10); 2161HN_FUNC_HN(native_exp2); 2162HN_FUNC_HN(native_expm1); 2163 2164HN_FUNC_HN_HN(native_hypot); 2165H_FUNC_HN(native_length); 2166 2167HN_FUNC_HN(native_log); 2168HN_FUNC_HN(native_log10); 2169HN_FUNC_HN(native_log1p); 2170HN_FUNC_HN(native_log2); 2171 2172HN_FUNC_HN(native_normalize); 2173 2174HN_FUNC_HN_HN(native_powr); // TODO are parameter limits different for half? 2175 2176HN_FUNC_HN(native_recip); 2177HN_FUNC_HN_IN(native_rootn); 2178HN_FUNC_HN(native_rsqrt); 2179 2180HN_FUNC_HN(native_sin); 2181 2182extern half __attribute__((overloadable)) native_sincos(half v, half *cosptr) { 2183 return sincos(v, cosptr); 2184} 2185extern half2 __attribute__((overloadable)) native_sincos(half2 v, half2 *cosptr) { 2186 return sincos(v, cosptr); 2187} 2188extern half3 __attribute__((overloadable)) native_sincos(half3 v, half3 *cosptr) { 2189 return sincos(v, cosptr); 2190} 2191extern half4 __attribute__((overloadable)) native_sincos(half4 v, half4 *cosptr) { 2192 return sincos(v, cosptr); 2193} 2194 2195HN_FUNC_HN(native_sinh); 2196HN_FUNC_HN(native_sinpi); 2197HN_FUNC_HN(native_sqrt); 2198 2199HN_FUNC_HN(native_tan); 2200HN_FUNC_HN(native_tanh); 2201HN_FUNC_HN(native_tanpi); 2202 2203#undef HN_FUNC_HN 2204#undef HN_FUNC_HN_HN 2205#undef HN_FUNC_HN_H 2206#undef HN_FUNC_HN_HN_HN 2207#undef HN_FUNC_HN_IN 2208#undef H_FUNC_HN 2209#undef H_FUNC_HN_HN 2210#undef SCALARIZE_HN_FUNC_HN_HN 2211 2212// exports unavailable mathlib functions to compat lib 2213 2214#ifdef RS_COMPATIBILITY_LIB 2215 2216// !!! DANGER !!! 2217// These functions are potentially missing on older Android versions. 2218// Work around the issue by supplying our own variants. 2219// !!! DANGER !!! 2220 2221// The logbl() implementation is taken from the latest bionic/, since 2222// double == long double on Android. 2223extern "C" long double logbl(long double x) { return logb(x); } 2224 2225// __aeabi_idiv0 is a missing function in libcompiler_rt.so, so we just 2226// pick the simplest implementation based on the ARM EABI doc. 2227extern "C" int __aeabi_idiv0(int v) { return v; } 2228 2229#endif // compatibility lib 2230