1/* 2 * ARM NEON vector operations. 3 * 4 * Copyright (c) 2007, 2008 CodeSourcery. 5 * Written by Paul Brook 6 * 7 * This code is licensed under the GNU GPL v2. 8 */ 9#include <stdlib.h> 10#include <stdio.h> 11 12#include "cpu.h" 13#include "exec/exec-all.h" 14#include "helper.h" 15 16#define SIGNBIT (uint32_t)0x80000000 17#define SIGNBIT64 ((uint64_t)1 << 63) 18 19#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q 20 21#define NEON_TYPE1(name, type) \ 22typedef struct \ 23{ \ 24 type v1; \ 25} neon_##name; 26#ifdef HOST_WORDS_BIGENDIAN 27#define NEON_TYPE2(name, type) \ 28typedef struct \ 29{ \ 30 type v2; \ 31 type v1; \ 32} neon_##name; 33#define NEON_TYPE4(name, type) \ 34typedef struct \ 35{ \ 36 type v4; \ 37 type v3; \ 38 type v2; \ 39 type v1; \ 40} neon_##name; 41#else 42#define NEON_TYPE2(name, type) \ 43typedef struct \ 44{ \ 45 type v1; \ 46 type v2; \ 47} neon_##name; 48#define NEON_TYPE4(name, type) \ 49typedef struct \ 50{ \ 51 type v1; \ 52 type v2; \ 53 type v3; \ 54 type v4; \ 55} neon_##name; 56#endif 57 58NEON_TYPE4(s8, int8_t) 59NEON_TYPE4(u8, uint8_t) 60NEON_TYPE2(s16, int16_t) 61NEON_TYPE2(u16, uint16_t) 62NEON_TYPE1(s32, int32_t) 63NEON_TYPE1(u32, uint32_t) 64#undef NEON_TYPE4 65#undef NEON_TYPE2 66#undef NEON_TYPE1 67 68/* Copy from a uint32_t to a vector structure type. */ 69#define NEON_UNPACK(vtype, dest, val) do { \ 70 union { \ 71 vtype v; \ 72 uint32_t i; \ 73 } conv_u; \ 74 conv_u.i = (val); \ 75 dest = conv_u.v; \ 76 } while(0) 77 78/* Copy from a vector structure type to a uint32_t. */ 79#define NEON_PACK(vtype, dest, val) do { \ 80 union { \ 81 vtype v; \ 82 uint32_t i; \ 83 } conv_u; \ 84 conv_u.v = (val); \ 85 dest = conv_u.i; \ 86 } while(0) 87 88#define NEON_DO1 \ 89 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); 90#define NEON_DO2 \ 91 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ 92 NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); 93#define NEON_DO4 \ 94 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ 95 NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ 96 NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ 97 NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); 98 99#define NEON_VOP_BODY(vtype, n) \ 100{ \ 101 uint32_t res; \ 102 vtype vsrc1; \ 103 vtype vsrc2; \ 104 vtype vdest; \ 105 NEON_UNPACK(vtype, vsrc1, arg1); \ 106 NEON_UNPACK(vtype, vsrc2, arg2); \ 107 NEON_DO##n; \ 108 NEON_PACK(vtype, res, vdest); \ 109 return res; \ 110} 111 112#define NEON_VOP(name, vtype, n) \ 113uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ 114NEON_VOP_BODY(vtype, n) 115 116#define NEON_VOP_ENV(name, vtype, n) \ 117uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \ 118NEON_VOP_BODY(vtype, n) 119 120/* Pairwise operations. */ 121/* For 32-bit elements each segment only contains a single element, so 122 the elementwise and pairwise operations are the same. */ 123#define NEON_PDO2 \ 124 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 125 NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); 126#define NEON_PDO4 \ 127 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ 128 NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ 129 NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ 130 NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ 131 132#define NEON_POP(name, vtype, n) \ 133uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ 134{ \ 135 uint32_t res; \ 136 vtype vsrc1; \ 137 vtype vsrc2; \ 138 vtype vdest; \ 139 NEON_UNPACK(vtype, vsrc1, arg1); \ 140 NEON_UNPACK(vtype, vsrc2, arg2); \ 141 NEON_PDO##n; \ 142 NEON_PACK(vtype, res, vdest); \ 143 return res; \ 144} 145 146/* Unary operators. */ 147#define NEON_VOP1(name, vtype, n) \ 148uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ 149{ \ 150 vtype vsrc1; \ 151 vtype vdest; \ 152 NEON_UNPACK(vtype, vsrc1, arg); \ 153 NEON_DO##n; \ 154 NEON_PACK(vtype, arg, vdest); \ 155 return arg; \ 156} 157 158 159#define NEON_USAT(dest, src1, src2, type) do { \ 160 uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ 161 if (tmp != (type)tmp) { \ 162 SET_QC(); \ 163 dest = ~0; \ 164 } else { \ 165 dest = tmp; \ 166 }} while(0) 167#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) 168NEON_VOP_ENV(qadd_u8, neon_u8, 4) 169#undef NEON_FN 170#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) 171NEON_VOP_ENV(qadd_u16, neon_u16, 2) 172#undef NEON_FN 173#undef NEON_USAT 174 175uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b) 176{ 177 uint32_t res = a + b; 178 if (res < a) { 179 SET_QC(); 180 res = ~0; 181 } 182 return res; 183} 184 185uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) 186{ 187 uint64_t res; 188 189 res = src1 + src2; 190 if (res < src1) { 191 SET_QC(); 192 res = ~(uint64_t)0; 193 } 194 return res; 195} 196 197#define NEON_SSAT(dest, src1, src2, type) do { \ 198 int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ 199 if (tmp != (type)tmp) { \ 200 SET_QC(); \ 201 if (src2 > 0) { \ 202 tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ 203 } else { \ 204 tmp = 1 << (sizeof(type) * 8 - 1); \ 205 } \ 206 } \ 207 dest = tmp; \ 208 } while(0) 209#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) 210NEON_VOP_ENV(qadd_s8, neon_s8, 4) 211#undef NEON_FN 212#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) 213NEON_VOP_ENV(qadd_s16, neon_s16, 2) 214#undef NEON_FN 215#undef NEON_SSAT 216 217uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b) 218{ 219 uint32_t res = a + b; 220 if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) { 221 SET_QC(); 222 res = ~(((int32_t)a >> 31) ^ SIGNBIT); 223 } 224 return res; 225} 226 227uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) 228{ 229 uint64_t res; 230 231 res = src1 + src2; 232 if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) { 233 SET_QC(); 234 res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; 235 } 236 return res; 237} 238 239#define NEON_USAT(dest, src1, src2, type) do { \ 240 uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ 241 if (tmp != (type)tmp) { \ 242 SET_QC(); \ 243 dest = 0; \ 244 } else { \ 245 dest = tmp; \ 246 }} while(0) 247#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) 248NEON_VOP_ENV(qsub_u8, neon_u8, 4) 249#undef NEON_FN 250#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) 251NEON_VOP_ENV(qsub_u16, neon_u16, 2) 252#undef NEON_FN 253#undef NEON_USAT 254 255uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b) 256{ 257 uint32_t res = a - b; 258 if (res > a) { 259 SET_QC(); 260 res = 0; 261 } 262 return res; 263} 264 265uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2) 266{ 267 uint64_t res; 268 269 if (src1 < src2) { 270 SET_QC(); 271 res = 0; 272 } else { 273 res = src1 - src2; 274 } 275 return res; 276} 277 278#define NEON_SSAT(dest, src1, src2, type) do { \ 279 int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ 280 if (tmp != (type)tmp) { \ 281 SET_QC(); \ 282 if (src2 < 0) { \ 283 tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ 284 } else { \ 285 tmp = 1 << (sizeof(type) * 8 - 1); \ 286 } \ 287 } \ 288 dest = tmp; \ 289 } while(0) 290#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) 291NEON_VOP_ENV(qsub_s8, neon_s8, 4) 292#undef NEON_FN 293#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) 294NEON_VOP_ENV(qsub_s16, neon_s16, 2) 295#undef NEON_FN 296#undef NEON_SSAT 297 298uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b) 299{ 300 uint32_t res = a - b; 301 if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) { 302 SET_QC(); 303 res = ~(((int32_t)a >> 31) ^ SIGNBIT); 304 } 305 return res; 306} 307 308uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2) 309{ 310 uint64_t res; 311 312 res = src1 - src2; 313 if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) { 314 SET_QC(); 315 res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64; 316 } 317 return res; 318} 319 320#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 321NEON_VOP(hadd_s8, neon_s8, 4) 322NEON_VOP(hadd_u8, neon_u8, 4) 323NEON_VOP(hadd_s16, neon_s16, 2) 324NEON_VOP(hadd_u16, neon_u16, 2) 325#undef NEON_FN 326 327int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2) 328{ 329 int32_t dest; 330 331 dest = (src1 >> 1) + (src2 >> 1); 332 if (src1 & src2 & 1) 333 dest++; 334 return dest; 335} 336 337uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2) 338{ 339 uint32_t dest; 340 341 dest = (src1 >> 1) + (src2 >> 1); 342 if (src1 & src2 & 1) 343 dest++; 344 return dest; 345} 346 347#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 348NEON_VOP(rhadd_s8, neon_s8, 4) 349NEON_VOP(rhadd_u8, neon_u8, 4) 350NEON_VOP(rhadd_s16, neon_s16, 2) 351NEON_VOP(rhadd_u16, neon_u16, 2) 352#undef NEON_FN 353 354int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) 355{ 356 int32_t dest; 357 358 dest = (src1 >> 1) + (src2 >> 1); 359 if ((src1 | src2) & 1) 360 dest++; 361 return dest; 362} 363 364uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) 365{ 366 uint32_t dest; 367 368 dest = (src1 >> 1) + (src2 >> 1); 369 if ((src1 | src2) & 1) 370 dest++; 371 return dest; 372} 373 374#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 375NEON_VOP(hsub_s8, neon_s8, 4) 376NEON_VOP(hsub_u8, neon_u8, 4) 377NEON_VOP(hsub_s16, neon_s16, 2) 378NEON_VOP(hsub_u16, neon_u16, 2) 379#undef NEON_FN 380 381int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2) 382{ 383 int32_t dest; 384 385 dest = (src1 >> 1) - (src2 >> 1); 386 if ((~src1) & src2 & 1) 387 dest--; 388 return dest; 389} 390 391uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) 392{ 393 uint32_t dest; 394 395 dest = (src1 >> 1) - (src2 >> 1); 396 if ((~src1) & src2 & 1) 397 dest--; 398 return dest; 399} 400 401#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0 402NEON_VOP(cgt_s8, neon_s8, 4) 403NEON_VOP(cgt_u8, neon_u8, 4) 404NEON_VOP(cgt_s16, neon_s16, 2) 405NEON_VOP(cgt_u16, neon_u16, 2) 406NEON_VOP(cgt_s32, neon_s32, 1) 407NEON_VOP(cgt_u32, neon_u32, 1) 408#undef NEON_FN 409 410#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0 411NEON_VOP(cge_s8, neon_s8, 4) 412NEON_VOP(cge_u8, neon_u8, 4) 413NEON_VOP(cge_s16, neon_s16, 2) 414NEON_VOP(cge_u16, neon_u16, 2) 415NEON_VOP(cge_s32, neon_s32, 1) 416NEON_VOP(cge_u32, neon_u32, 1) 417#undef NEON_FN 418 419#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 420NEON_VOP(min_s8, neon_s8, 4) 421NEON_VOP(min_u8, neon_u8, 4) 422NEON_VOP(min_s16, neon_s16, 2) 423NEON_VOP(min_u16, neon_u16, 2) 424NEON_VOP(min_s32, neon_s32, 1) 425NEON_VOP(min_u32, neon_u32, 1) 426NEON_POP(pmin_s8, neon_s8, 4) 427NEON_POP(pmin_u8, neon_u8, 4) 428NEON_POP(pmin_s16, neon_s16, 2) 429NEON_POP(pmin_u16, neon_u16, 2) 430#undef NEON_FN 431 432#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2 433NEON_VOP(max_s8, neon_s8, 4) 434NEON_VOP(max_u8, neon_u8, 4) 435NEON_VOP(max_s16, neon_s16, 2) 436NEON_VOP(max_u16, neon_u16, 2) 437NEON_VOP(max_s32, neon_s32, 1) 438NEON_VOP(max_u32, neon_u32, 1) 439NEON_POP(pmax_s8, neon_s8, 4) 440NEON_POP(pmax_u8, neon_u8, 4) 441NEON_POP(pmax_s16, neon_s16, 2) 442NEON_POP(pmax_u16, neon_u16, 2) 443#undef NEON_FN 444 445#define NEON_FN(dest, src1, src2) \ 446 dest = (src1 > src2) ? (src1 - src2) : (src2 - src1) 447NEON_VOP(abd_s8, neon_s8, 4) 448NEON_VOP(abd_u8, neon_u8, 4) 449NEON_VOP(abd_s16, neon_s16, 2) 450NEON_VOP(abd_u16, neon_u16, 2) 451NEON_VOP(abd_s32, neon_s32, 1) 452NEON_VOP(abd_u32, neon_u32, 1) 453#undef NEON_FN 454 455#define NEON_FN(dest, src1, src2) do { \ 456 int8_t tmp; \ 457 tmp = (int8_t)src2; \ 458 if (tmp >= (ssize_t)sizeof(src1) * 8 || \ 459 tmp <= -(ssize_t)sizeof(src1) * 8) { \ 460 dest = 0; \ 461 } else if (tmp < 0) { \ 462 dest = src1 >> -tmp; \ 463 } else { \ 464 dest = src1 << tmp; \ 465 }} while (0) 466NEON_VOP(shl_u8, neon_u8, 4) 467NEON_VOP(shl_u16, neon_u16, 2) 468NEON_VOP(shl_u32, neon_u32, 1) 469#undef NEON_FN 470 471uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop) 472{ 473 int8_t shift = (int8_t)shiftop; 474 if (shift >= 64 || shift <= -64) { 475 val = 0; 476 } else if (shift < 0) { 477 val >>= -shift; 478 } else { 479 val <<= shift; 480 } 481 return val; 482} 483 484#define NEON_FN(dest, src1, src2) do { \ 485 int8_t tmp; \ 486 tmp = (int8_t)src2; \ 487 if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 488 dest = 0; \ 489 } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ 490 dest = src1 >> (sizeof(src1) * 8 - 1); \ 491 } else if (tmp < 0) { \ 492 dest = src1 >> -tmp; \ 493 } else { \ 494 dest = src1 << tmp; \ 495 }} while (0) 496NEON_VOP(shl_s8, neon_s8, 4) 497NEON_VOP(shl_s16, neon_s16, 2) 498NEON_VOP(shl_s32, neon_s32, 1) 499#undef NEON_FN 500 501uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop) 502{ 503 int8_t shift = (int8_t)shiftop; 504 int64_t val = valop; 505 if (shift >= 64) { 506 val = 0; 507 } else if (shift <= -64) { 508 val >>= 63; 509 } else if (shift < 0) { 510 val >>= -shift; 511 } else { 512 val <<= shift; 513 } 514 return val; 515} 516 517#define NEON_FN(dest, src1, src2) do { \ 518 int8_t tmp; \ 519 tmp = (int8_t)src2; \ 520 if ((tmp >= (ssize_t)sizeof(src1) * 8) \ 521 || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \ 522 dest = 0; \ 523 } else if (tmp < 0) { \ 524 dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ 525 } else { \ 526 dest = src1 << tmp; \ 527 }} while (0) 528NEON_VOP(rshl_s8, neon_s8, 4) 529NEON_VOP(rshl_s16, neon_s16, 2) 530#undef NEON_FN 531 532/* The addition of the rounding constant may overflow, so we use an 533 * intermediate 64 bit accumulator. */ 534uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop) 535{ 536 int32_t dest; 537 int32_t val = (int32_t)valop; 538 int8_t shift = (int8_t)shiftop; 539 if ((shift >= 32) || (shift <= -32)) { 540 dest = 0; 541 } else if (shift < 0) { 542 int64_t big_dest = ((int64_t)val + (1 << (-1 - shift))); 543 dest = big_dest >> -shift; 544 } else { 545 dest = val << shift; 546 } 547 return dest; 548} 549 550/* Handling addition overflow with 64 bit input values is more 551 * tricky than with 32 bit values. */ 552uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) 553{ 554 int8_t shift = (int8_t)shiftop; 555 int64_t val = valop; 556 if ((shift >= 64) || (shift <= -64)) { 557 val = 0; 558 } else if (shift < 0) { 559 val >>= (-shift - 1); 560 if (val == INT64_MAX) { 561 /* In this case, it means that the rounding constant is 1, 562 * and the addition would overflow. Return the actual 563 * result directly. */ 564 val = 0x4000000000000000LL; 565 } else { 566 val++; 567 val >>= 1; 568 } 569 } else { 570 val <<= shift; 571 } 572 return val; 573} 574 575#define NEON_FN(dest, src1, src2) do { \ 576 int8_t tmp; \ 577 tmp = (int8_t)src2; \ 578 if (tmp >= (ssize_t)sizeof(src1) * 8 || \ 579 tmp < -(ssize_t)sizeof(src1) * 8) { \ 580 dest = 0; \ 581 } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ 582 dest = src1 >> (-tmp - 1); \ 583 } else if (tmp < 0) { \ 584 dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ 585 } else { \ 586 dest = src1 << tmp; \ 587 }} while (0) 588NEON_VOP(rshl_u8, neon_u8, 4) 589NEON_VOP(rshl_u16, neon_u16, 2) 590#undef NEON_FN 591 592/* The addition of the rounding constant may overflow, so we use an 593 * intermediate 64 bit accumulator. */ 594uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop) 595{ 596 uint32_t dest; 597 int8_t shift = (int8_t)shiftop; 598 if (shift >= 32 || shift < -32) { 599 dest = 0; 600 } else if (shift == -32) { 601 dest = val >> 31; 602 } else if (shift < 0) { 603 uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift))); 604 dest = big_dest >> -shift; 605 } else { 606 dest = val << shift; 607 } 608 return dest; 609} 610 611/* Handling addition overflow with 64 bit input values is more 612 * tricky than with 32 bit values. */ 613uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop) 614{ 615 int8_t shift = (uint8_t)shiftop; 616 if (shift >= 64 || shift < -64) { 617 val = 0; 618 } else if (shift == -64) { 619 /* Rounding a 1-bit result just preserves that bit. */ 620 val >>= 63; 621 } else if (shift < 0) { 622 val >>= (-shift - 1); 623 if (val == UINT64_MAX) { 624 /* In this case, it means that the rounding constant is 1, 625 * and the addition would overflow. Return the actual 626 * result directly. */ 627 val = 0x8000000000000000ULL; 628 } else { 629 val++; 630 val >>= 1; 631 } 632 } else { 633 val <<= shift; 634 } 635 return val; 636} 637 638#define NEON_FN(dest, src1, src2) do { \ 639 int8_t tmp; \ 640 tmp = (int8_t)src2; \ 641 if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 642 if (src1) { \ 643 SET_QC(); \ 644 dest = ~0; \ 645 } else { \ 646 dest = 0; \ 647 } \ 648 } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ 649 dest = 0; \ 650 } else if (tmp < 0) { \ 651 dest = src1 >> -tmp; \ 652 } else { \ 653 dest = src1 << tmp; \ 654 if ((dest >> tmp) != src1) { \ 655 SET_QC(); \ 656 dest = ~0; \ 657 } \ 658 }} while (0) 659NEON_VOP_ENV(qshl_u8, neon_u8, 4) 660NEON_VOP_ENV(qshl_u16, neon_u16, 2) 661NEON_VOP_ENV(qshl_u32, neon_u32, 1) 662#undef NEON_FN 663 664uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop) 665{ 666 int8_t shift = (int8_t)shiftop; 667 if (shift >= 64) { 668 if (val) { 669 val = ~(uint64_t)0; 670 SET_QC(); 671 } 672 } else if (shift <= -64) { 673 val = 0; 674 } else if (shift < 0) { 675 val >>= -shift; 676 } else { 677 uint64_t tmp = val; 678 val <<= shift; 679 if ((val >> shift) != tmp) { 680 SET_QC(); 681 val = ~(uint64_t)0; 682 } 683 } 684 return val; 685} 686 687#define NEON_FN(dest, src1, src2) do { \ 688 int8_t tmp; \ 689 tmp = (int8_t)src2; \ 690 if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 691 if (src1) { \ 692 SET_QC(); \ 693 dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \ 694 if (src1 > 0) { \ 695 dest--; \ 696 } \ 697 } else { \ 698 dest = src1; \ 699 } \ 700 } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ 701 dest = src1 >> 31; \ 702 } else if (tmp < 0) { \ 703 dest = src1 >> -tmp; \ 704 } else { \ 705 dest = src1 << tmp; \ 706 if ((dest >> tmp) != src1) { \ 707 SET_QC(); \ 708 dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \ 709 if (src1 > 0) { \ 710 dest--; \ 711 } \ 712 } \ 713 }} while (0) 714NEON_VOP_ENV(qshl_s8, neon_s8, 4) 715NEON_VOP_ENV(qshl_s16, neon_s16, 2) 716NEON_VOP_ENV(qshl_s32, neon_s32, 1) 717#undef NEON_FN 718 719uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop) 720{ 721 int8_t shift = (uint8_t)shiftop; 722 int64_t val = valop; 723 if (shift >= 64) { 724 if (val) { 725 SET_QC(); 726 val = (val >> 63) ^ ~SIGNBIT64; 727 } 728 } else if (shift <= -64) { 729 val >>= 63; 730 } else if (shift < 0) { 731 val >>= -shift; 732 } else { 733 int64_t tmp = val; 734 val <<= shift; 735 if ((val >> shift) != tmp) { 736 SET_QC(); 737 val = (tmp >> 63) ^ ~SIGNBIT64; 738 } 739 } 740 return val; 741} 742 743#define NEON_FN(dest, src1, src2) do { \ 744 if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \ 745 SET_QC(); \ 746 dest = 0; \ 747 } else { \ 748 int8_t tmp; \ 749 tmp = (int8_t)src2; \ 750 if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 751 if (src1) { \ 752 SET_QC(); \ 753 dest = ~0; \ 754 } else { \ 755 dest = 0; \ 756 } \ 757 } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ 758 dest = 0; \ 759 } else if (tmp < 0) { \ 760 dest = src1 >> -tmp; \ 761 } else { \ 762 dest = src1 << tmp; \ 763 if ((dest >> tmp) != src1) { \ 764 SET_QC(); \ 765 dest = ~0; \ 766 } \ 767 } \ 768 }} while (0) 769NEON_VOP_ENV(qshlu_s8, neon_u8, 4) 770NEON_VOP_ENV(qshlu_s16, neon_u16, 2) 771#undef NEON_FN 772 773uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop) 774{ 775 if ((int32_t)valop < 0) { 776 SET_QC(); 777 return 0; 778 } 779 return helper_neon_qshl_u32(env, valop, shiftop); 780} 781 782uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop) 783{ 784 if ((int64_t)valop < 0) { 785 SET_QC(); 786 return 0; 787 } 788 return helper_neon_qshl_u64(env, valop, shiftop); 789} 790 791#define NEON_FN(dest, src1, src2) do { \ 792 int8_t tmp; \ 793 tmp = (int8_t)src2; \ 794 if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 795 if (src1) { \ 796 SET_QC(); \ 797 dest = ~0; \ 798 } else { \ 799 dest = 0; \ 800 } \ 801 } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \ 802 dest = 0; \ 803 } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ 804 dest = src1 >> (sizeof(src1) * 8 - 1); \ 805 } else if (tmp < 0) { \ 806 dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ 807 } else { \ 808 dest = src1 << tmp; \ 809 if ((dest >> tmp) != src1) { \ 810 SET_QC(); \ 811 dest = ~0; \ 812 } \ 813 }} while (0) 814NEON_VOP_ENV(qrshl_u8, neon_u8, 4) 815NEON_VOP_ENV(qrshl_u16, neon_u16, 2) 816#undef NEON_FN 817 818/* The addition of the rounding constant may overflow, so we use an 819 * intermediate 64 bit accumulator. */ 820uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shiftop) 821{ 822 uint32_t dest; 823 int8_t shift = (int8_t)shiftop; 824 if (shift >= 32) { 825 if (val) { 826 SET_QC(); 827 dest = ~0; 828 } else { 829 dest = 0; 830 } 831 } else if (shift < -32) { 832 dest = 0; 833 } else if (shift == -32) { 834 dest = val >> 31; 835 } else if (shift < 0) { 836 uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift))); 837 dest = big_dest >> -shift; 838 } else { 839 dest = val << shift; 840 if ((dest >> shift) != val) { 841 SET_QC(); 842 dest = ~0; 843 } 844 } 845 return dest; 846} 847 848/* Handling addition overflow with 64 bit input values is more 849 * tricky than with 32 bit values. */ 850uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop) 851{ 852 int8_t shift = (int8_t)shiftop; 853 if (shift >= 64) { 854 if (val) { 855 SET_QC(); 856 val = ~0; 857 } 858 } else if (shift < -64) { 859 val = 0; 860 } else if (shift == -64) { 861 val >>= 63; 862 } else if (shift < 0) { 863 val >>= (-shift - 1); 864 if (val == UINT64_MAX) { 865 /* In this case, it means that the rounding constant is 1, 866 * and the addition would overflow. Return the actual 867 * result directly. */ 868 val = 0x8000000000000000ULL; 869 } else { 870 val++; 871 val >>= 1; 872 } 873 } else { \ 874 uint64_t tmp = val; 875 val <<= shift; 876 if ((val >> shift) != tmp) { 877 SET_QC(); 878 val = ~0; 879 } 880 } 881 return val; 882} 883 884#define NEON_FN(dest, src1, src2) do { \ 885 int8_t tmp; \ 886 tmp = (int8_t)src2; \ 887 if (tmp >= (ssize_t)sizeof(src1) * 8) { \ 888 if (src1) { \ 889 SET_QC(); \ 890 dest = (1 << (sizeof(src1) * 8 - 1)); \ 891 if (src1 > 0) { \ 892 dest--; \ 893 } \ 894 } else { \ 895 dest = 0; \ 896 } \ 897 } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ 898 dest = 0; \ 899 } else if (tmp < 0) { \ 900 dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ 901 } else { \ 902 dest = src1 << tmp; \ 903 if ((dest >> tmp) != src1) { \ 904 SET_QC(); \ 905 dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \ 906 if (src1 > 0) { \ 907 dest--; \ 908 } \ 909 } \ 910 }} while (0) 911NEON_VOP_ENV(qrshl_s8, neon_s8, 4) 912NEON_VOP_ENV(qrshl_s16, neon_s16, 2) 913#undef NEON_FN 914 915/* The addition of the rounding constant may overflow, so we use an 916 * intermediate 64 bit accumulator. */ 917uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop) 918{ 919 int32_t dest; 920 int32_t val = (int32_t)valop; 921 int8_t shift = (int8_t)shiftop; 922 if (shift >= 32) { 923 if (val) { 924 SET_QC(); 925 dest = (val >> 31) ^ ~SIGNBIT; 926 } else { 927 dest = 0; 928 } 929 } else if (shift <= -32) { 930 dest = 0; 931 } else if (shift < 0) { 932 int64_t big_dest = ((int64_t)val + (1 << (-1 - shift))); 933 dest = big_dest >> -shift; 934 } else { 935 dest = val << shift; 936 if ((dest >> shift) != val) { 937 SET_QC(); 938 dest = (val >> 31) ^ ~SIGNBIT; 939 } 940 } 941 return dest; 942} 943 944/* Handling addition overflow with 64 bit input values is more 945 * tricky than with 32 bit values. */ 946uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop) 947{ 948 int8_t shift = (uint8_t)shiftop; 949 int64_t val = valop; 950 951 if (shift >= 64) { 952 if (val) { 953 SET_QC(); 954 val = (val >> 63) ^ ~SIGNBIT64; 955 } 956 } else if (shift <= -64) { 957 val = 0; 958 } else if (shift < 0) { 959 val >>= (-shift - 1); 960 if (val == INT64_MAX) { 961 /* In this case, it means that the rounding constant is 1, 962 * and the addition would overflow. Return the actual 963 * result directly. */ 964 val = 0x4000000000000000ULL; 965 } else { 966 val++; 967 val >>= 1; 968 } 969 } else { 970 int64_t tmp = val; 971 val <<= shift; 972 if ((val >> shift) != tmp) { 973 SET_QC(); 974 val = (tmp >> 63) ^ ~SIGNBIT64; 975 } 976 } 977 return val; 978} 979 980uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) 981{ 982 uint32_t mask; 983 mask = (a ^ b) & 0x80808080u; 984 a &= ~0x80808080u; 985 b &= ~0x80808080u; 986 return (a + b) ^ mask; 987} 988 989uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) 990{ 991 uint32_t mask; 992 mask = (a ^ b) & 0x80008000u; 993 a &= ~0x80008000u; 994 b &= ~0x80008000u; 995 return (a + b) ^ mask; 996} 997 998#define NEON_FN(dest, src1, src2) dest = src1 + src2 999NEON_POP(padd_u8, neon_u8, 4) 1000NEON_POP(padd_u16, neon_u16, 2) 1001#undef NEON_FN 1002 1003#define NEON_FN(dest, src1, src2) dest = src1 - src2 1004NEON_VOP(sub_u8, neon_u8, 4) 1005NEON_VOP(sub_u16, neon_u16, 2) 1006#undef NEON_FN 1007 1008#define NEON_FN(dest, src1, src2) dest = src1 * src2 1009NEON_VOP(mul_u8, neon_u8, 4) 1010NEON_VOP(mul_u16, neon_u16, 2) 1011#undef NEON_FN 1012 1013/* Polynomial multiplication is like integer multiplication except the 1014 partial products are XORed, not added. */ 1015uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2) 1016{ 1017 uint32_t mask; 1018 uint32_t result; 1019 result = 0; 1020 while (op1) { 1021 mask = 0; 1022 if (op1 & 1) 1023 mask |= 0xff; 1024 if (op1 & (1 << 8)) 1025 mask |= (0xff << 8); 1026 if (op1 & (1 << 16)) 1027 mask |= (0xff << 16); 1028 if (op1 & (1 << 24)) 1029 mask |= (0xff << 24); 1030 result ^= op2 & mask; 1031 op1 = (op1 >> 1) & 0x7f7f7f7f; 1032 op2 = (op2 << 1) & 0xfefefefe; 1033 } 1034 return result; 1035} 1036 1037uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2) 1038{ 1039 uint64_t result = 0; 1040 uint64_t mask; 1041 uint64_t op2ex = op2; 1042 op2ex = (op2ex & 0xff) | 1043 ((op2ex & 0xff00) << 8) | 1044 ((op2ex & 0xff0000) << 16) | 1045 ((op2ex & 0xff000000) << 24); 1046 while (op1) { 1047 mask = 0; 1048 if (op1 & 1) { 1049 mask |= 0xffff; 1050 } 1051 if (op1 & (1 << 8)) { 1052 mask |= (0xffffU << 16); 1053 } 1054 if (op1 & (1 << 16)) { 1055 mask |= (0xffffULL << 32); 1056 } 1057 if (op1 & (1 << 24)) { 1058 mask |= (0xffffULL << 48); 1059 } 1060 result ^= op2ex & mask; 1061 op1 = (op1 >> 1) & 0x7f7f7f7f; 1062 op2ex <<= 1; 1063 } 1064 return result; 1065} 1066 1067#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 1068NEON_VOP(tst_u8, neon_u8, 4) 1069NEON_VOP(tst_u16, neon_u16, 2) 1070NEON_VOP(tst_u32, neon_u32, 1) 1071#undef NEON_FN 1072 1073#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0 1074NEON_VOP(ceq_u8, neon_u8, 4) 1075NEON_VOP(ceq_u16, neon_u16, 2) 1076NEON_VOP(ceq_u32, neon_u32, 1) 1077#undef NEON_FN 1078 1079#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src 1080NEON_VOP1(abs_s8, neon_s8, 4) 1081NEON_VOP1(abs_s16, neon_s16, 2) 1082#undef NEON_FN 1083 1084/* Count Leading Sign/Zero Bits. */ 1085static inline int do_clz8(uint8_t x) 1086{ 1087 int n; 1088 for (n = 8; x; n--) 1089 x >>= 1; 1090 return n; 1091} 1092 1093static inline int do_clz16(uint16_t x) 1094{ 1095 int n; 1096 for (n = 16; x; n--) 1097 x >>= 1; 1098 return n; 1099} 1100 1101#define NEON_FN(dest, src, dummy) dest = do_clz8(src) 1102NEON_VOP1(clz_u8, neon_u8, 4) 1103#undef NEON_FN 1104 1105#define NEON_FN(dest, src, dummy) dest = do_clz16(src) 1106NEON_VOP1(clz_u16, neon_u16, 2) 1107#undef NEON_FN 1108 1109#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1 1110NEON_VOP1(cls_s8, neon_s8, 4) 1111#undef NEON_FN 1112 1113#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 1114NEON_VOP1(cls_s16, neon_s16, 2) 1115#undef NEON_FN 1116 1117uint32_t HELPER(neon_cls_s32)(uint32_t x) 1118{ 1119 int count; 1120 if ((int32_t)x < 0) 1121 x = ~x; 1122 for (count = 32; x; count--) 1123 x = x >> 1; 1124 return count - 1; 1125} 1126 1127/* Bit count. */ 1128uint32_t HELPER(neon_cnt_u8)(uint32_t x) 1129{ 1130 x = (x & 0x55555555) + ((x >> 1) & 0x55555555); 1131 x = (x & 0x33333333) + ((x >> 2) & 0x33333333); 1132 x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); 1133 return x; 1134} 1135 1136#define NEON_QDMULH16(dest, src1, src2, round) do { \ 1137 uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ 1138 if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ 1139 SET_QC(); \ 1140 tmp = (tmp >> 31) ^ ~SIGNBIT; \ 1141 } else { \ 1142 tmp <<= 1; \ 1143 } \ 1144 if (round) { \ 1145 int32_t old = tmp; \ 1146 tmp += 1 << 15; \ 1147 if ((int32_t)tmp < old) { \ 1148 SET_QC(); \ 1149 tmp = SIGNBIT - 1; \ 1150 } \ 1151 } \ 1152 dest = tmp >> 16; \ 1153 } while(0) 1154#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) 1155NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) 1156#undef NEON_FN 1157#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) 1158NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) 1159#undef NEON_FN 1160#undef NEON_QDMULH16 1161 1162#define NEON_QDMULH32(dest, src1, src2, round) do { \ 1163 uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ 1164 if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ 1165 SET_QC(); \ 1166 tmp = (tmp >> 63) ^ ~SIGNBIT64; \ 1167 } else { \ 1168 tmp <<= 1; \ 1169 } \ 1170 if (round) { \ 1171 int64_t old = tmp; \ 1172 tmp += (int64_t)1 << 31; \ 1173 if ((int64_t)tmp < old) { \ 1174 SET_QC(); \ 1175 tmp = SIGNBIT64 - 1; \ 1176 } \ 1177 } \ 1178 dest = tmp >> 32; \ 1179 } while(0) 1180#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) 1181NEON_VOP_ENV(qdmulh_s32, neon_s32, 1) 1182#undef NEON_FN 1183#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) 1184NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1) 1185#undef NEON_FN 1186#undef NEON_QDMULH32 1187 1188uint32_t HELPER(neon_narrow_u8)(uint64_t x) 1189{ 1190 return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) 1191 | ((x >> 24) & 0xff000000u); 1192} 1193 1194uint32_t HELPER(neon_narrow_u16)(uint64_t x) 1195{ 1196 return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); 1197} 1198 1199uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) 1200{ 1201 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 1202 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 1203} 1204 1205uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) 1206{ 1207 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 1208} 1209 1210uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) 1211{ 1212 x &= 0xff80ff80ff80ff80ull; 1213 x += 0x0080008000800080ull; 1214 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) 1215 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); 1216} 1217 1218uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) 1219{ 1220 x &= 0xffff8000ffff8000ull; 1221 x += 0x0000800000008000ull; 1222 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); 1223} 1224 1225uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x) 1226{ 1227 uint16_t s; 1228 uint8_t d; 1229 uint32_t res = 0; 1230#define SAT8(n) \ 1231 s = x >> n; \ 1232 if (s & 0x8000) { \ 1233 SET_QC(); \ 1234 } else { \ 1235 if (s > 0xff) { \ 1236 d = 0xff; \ 1237 SET_QC(); \ 1238 } else { \ 1239 d = s; \ 1240 } \ 1241 res |= (uint32_t)d << (n / 2); \ 1242 } 1243 1244 SAT8(0); 1245 SAT8(16); 1246 SAT8(32); 1247 SAT8(48); 1248#undef SAT8 1249 return res; 1250} 1251 1252uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x) 1253{ 1254 uint16_t s; 1255 uint8_t d; 1256 uint32_t res = 0; 1257#define SAT8(n) \ 1258 s = x >> n; \ 1259 if (s > 0xff) { \ 1260 d = 0xff; \ 1261 SET_QC(); \ 1262 } else { \ 1263 d = s; \ 1264 } \ 1265 res |= (uint32_t)d << (n / 2); 1266 1267 SAT8(0); 1268 SAT8(16); 1269 SAT8(32); 1270 SAT8(48); 1271#undef SAT8 1272 return res; 1273} 1274 1275uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x) 1276{ 1277 int16_t s; 1278 uint8_t d; 1279 uint32_t res = 0; 1280#define SAT8(n) \ 1281 s = x >> n; \ 1282 if (s != (int8_t)s) { \ 1283 d = (s >> 15) ^ 0x7f; \ 1284 SET_QC(); \ 1285 } else { \ 1286 d = s; \ 1287 } \ 1288 res |= (uint32_t)d << (n / 2); 1289 1290 SAT8(0); 1291 SAT8(16); 1292 SAT8(32); 1293 SAT8(48); 1294#undef SAT8 1295 return res; 1296} 1297 1298uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x) 1299{ 1300 uint32_t high; 1301 uint32_t low; 1302 low = x; 1303 if (low & 0x80000000) { 1304 low = 0; 1305 SET_QC(); 1306 } else if (low > 0xffff) { 1307 low = 0xffff; 1308 SET_QC(); 1309 } 1310 high = x >> 32; 1311 if (high & 0x80000000) { 1312 high = 0; 1313 SET_QC(); 1314 } else if (high > 0xffff) { 1315 high = 0xffff; 1316 SET_QC(); 1317 } 1318 return low | (high << 16); 1319} 1320 1321uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x) 1322{ 1323 uint32_t high; 1324 uint32_t low; 1325 low = x; 1326 if (low > 0xffff) { 1327 low = 0xffff; 1328 SET_QC(); 1329 } 1330 high = x >> 32; 1331 if (high > 0xffff) { 1332 high = 0xffff; 1333 SET_QC(); 1334 } 1335 return low | (high << 16); 1336} 1337 1338uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x) 1339{ 1340 int32_t low; 1341 int32_t high; 1342 low = x; 1343 if (low != (int16_t)low) { 1344 low = (low >> 31) ^ 0x7fff; 1345 SET_QC(); 1346 } 1347 high = x >> 32; 1348 if (high != (int16_t)high) { 1349 high = (high >> 31) ^ 0x7fff; 1350 SET_QC(); 1351 } 1352 return (uint16_t)low | (high << 16); 1353} 1354 1355uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x) 1356{ 1357 if (x & 0x8000000000000000ull) { 1358 SET_QC(); 1359 return 0; 1360 } 1361 if (x > 0xffffffffu) { 1362 SET_QC(); 1363 return 0xffffffffu; 1364 } 1365 return x; 1366} 1367 1368uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x) 1369{ 1370 if (x > 0xffffffffu) { 1371 SET_QC(); 1372 return 0xffffffffu; 1373 } 1374 return x; 1375} 1376 1377uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x) 1378{ 1379 if ((int64_t)x != (int32_t)x) { 1380 SET_QC(); 1381 return ((int64_t)x >> 63) ^ 0x7fffffff; 1382 } 1383 return x; 1384} 1385 1386uint64_t HELPER(neon_widen_u8)(uint32_t x) 1387{ 1388 uint64_t tmp; 1389 uint64_t ret; 1390 ret = (uint8_t)x; 1391 tmp = (uint8_t)(x >> 8); 1392 ret |= tmp << 16; 1393 tmp = (uint8_t)(x >> 16); 1394 ret |= tmp << 32; 1395 tmp = (uint8_t)(x >> 24); 1396 ret |= tmp << 48; 1397 return ret; 1398} 1399 1400uint64_t HELPER(neon_widen_s8)(uint32_t x) 1401{ 1402 uint64_t tmp; 1403 uint64_t ret; 1404 ret = (uint16_t)(int8_t)x; 1405 tmp = (uint16_t)(int8_t)(x >> 8); 1406 ret |= tmp << 16; 1407 tmp = (uint16_t)(int8_t)(x >> 16); 1408 ret |= tmp << 32; 1409 tmp = (uint16_t)(int8_t)(x >> 24); 1410 ret |= tmp << 48; 1411 return ret; 1412} 1413 1414uint64_t HELPER(neon_widen_u16)(uint32_t x) 1415{ 1416 uint64_t high = (uint16_t)(x >> 16); 1417 return ((uint16_t)x) | (high << 32); 1418} 1419 1420uint64_t HELPER(neon_widen_s16)(uint32_t x) 1421{ 1422 uint64_t high = (int16_t)(x >> 16); 1423 return ((uint32_t)(int16_t)x) | (high << 32); 1424} 1425 1426uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) 1427{ 1428 uint64_t mask; 1429 mask = (a ^ b) & 0x8000800080008000ull; 1430 a &= ~0x8000800080008000ull; 1431 b &= ~0x8000800080008000ull; 1432 return (a + b) ^ mask; 1433} 1434 1435uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) 1436{ 1437 uint64_t mask; 1438 mask = (a ^ b) & 0x8000000080000000ull; 1439 a &= ~0x8000000080000000ull; 1440 b &= ~0x8000000080000000ull; 1441 return (a + b) ^ mask; 1442} 1443 1444uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) 1445{ 1446 uint64_t tmp; 1447 uint64_t tmp2; 1448 1449 tmp = a & 0x0000ffff0000ffffull; 1450 tmp += (a >> 16) & 0x0000ffff0000ffffull; 1451 tmp2 = b & 0xffff0000ffff0000ull; 1452 tmp2 += (b << 16) & 0xffff0000ffff0000ull; 1453 return ( tmp & 0xffff) 1454 | ((tmp >> 16) & 0xffff0000ull) 1455 | ((tmp2 << 16) & 0xffff00000000ull) 1456 | ( tmp2 & 0xffff000000000000ull); 1457} 1458 1459uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) 1460{ 1461 uint32_t low = a + (a >> 32); 1462 uint32_t high = b + (b >> 32); 1463 return low + ((uint64_t)high << 32); 1464} 1465 1466uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) 1467{ 1468 uint64_t mask; 1469 mask = (a ^ ~b) & 0x8000800080008000ull; 1470 a |= 0x8000800080008000ull; 1471 b &= ~0x8000800080008000ull; 1472 return (a - b) ^ mask; 1473} 1474 1475uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) 1476{ 1477 uint64_t mask; 1478 mask = (a ^ ~b) & 0x8000000080000000ull; 1479 a |= 0x8000000080000000ull; 1480 b &= ~0x8000000080000000ull; 1481 return (a - b) ^ mask; 1482} 1483 1484uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b) 1485{ 1486 uint32_t x, y; 1487 uint32_t low, high; 1488 1489 x = a; 1490 y = b; 1491 low = x + y; 1492 if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 1493 SET_QC(); 1494 low = ((int32_t)x >> 31) ^ ~SIGNBIT; 1495 } 1496 x = a >> 32; 1497 y = b >> 32; 1498 high = x + y; 1499 if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { 1500 SET_QC(); 1501 high = ((int32_t)x >> 31) ^ ~SIGNBIT; 1502 } 1503 return low | ((uint64_t)high << 32); 1504} 1505 1506uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b) 1507{ 1508 uint64_t result; 1509 1510 result = a + b; 1511 if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { 1512 SET_QC(); 1513 result = ((int64_t)a >> 63) ^ ~SIGNBIT64; 1514 } 1515 return result; 1516} 1517 1518/* We have to do the arithmetic in a larger type than 1519 * the input type, because for example with a signed 32 bit 1520 * op the absolute difference can overflow a signed 32 bit value. 1521 */ 1522#define DO_ABD(dest, x, y, intype, arithtype) do { \ 1523 arithtype tmp_x = (intype)(x); \ 1524 arithtype tmp_y = (intype)(y); \ 1525 dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ 1526 } while(0) 1527 1528uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) 1529{ 1530 uint64_t tmp; 1531 uint64_t result; 1532 DO_ABD(result, a, b, uint8_t, uint32_t); 1533 DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t); 1534 result |= tmp << 16; 1535 DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t); 1536 result |= tmp << 32; 1537 DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t); 1538 result |= tmp << 48; 1539 return result; 1540} 1541 1542uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) 1543{ 1544 uint64_t tmp; 1545 uint64_t result; 1546 DO_ABD(result, a, b, int8_t, int32_t); 1547 DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t); 1548 result |= tmp << 16; 1549 DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t); 1550 result |= tmp << 32; 1551 DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t); 1552 result |= tmp << 48; 1553 return result; 1554} 1555 1556uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) 1557{ 1558 uint64_t tmp; 1559 uint64_t result; 1560 DO_ABD(result, a, b, uint16_t, uint32_t); 1561 DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 1562 return result | (tmp << 32); 1563} 1564 1565uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) 1566{ 1567 uint64_t tmp; 1568 uint64_t result; 1569 DO_ABD(result, a, b, int16_t, int32_t); 1570 DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t); 1571 return result | (tmp << 32); 1572} 1573 1574uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) 1575{ 1576 uint64_t result; 1577 DO_ABD(result, a, b, uint32_t, uint64_t); 1578 return result; 1579} 1580 1581uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) 1582{ 1583 uint64_t result; 1584 DO_ABD(result, a, b, int32_t, int64_t); 1585 return result; 1586} 1587#undef DO_ABD 1588 1589/* Widening multiply. Named type is the source type. */ 1590#define DO_MULL(dest, x, y, type1, type2) do { \ 1591 type1 tmp_x = x; \ 1592 type1 tmp_y = y; \ 1593 dest = (type2)((type2)tmp_x * (type2)tmp_y); \ 1594 } while(0) 1595 1596uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) 1597{ 1598 uint64_t tmp; 1599 uint64_t result; 1600 1601 DO_MULL(result, a, b, uint8_t, uint16_t); 1602 DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); 1603 result |= tmp << 16; 1604 DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); 1605 result |= tmp << 32; 1606 DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); 1607 result |= tmp << 48; 1608 return result; 1609} 1610 1611uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) 1612{ 1613 uint64_t tmp; 1614 uint64_t result; 1615 1616 DO_MULL(result, a, b, int8_t, uint16_t); 1617 DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); 1618 result |= tmp << 16; 1619 DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); 1620 result |= tmp << 32; 1621 DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); 1622 result |= tmp << 48; 1623 return result; 1624} 1625 1626uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) 1627{ 1628 uint64_t tmp; 1629 uint64_t result; 1630 1631 DO_MULL(result, a, b, uint16_t, uint32_t); 1632 DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); 1633 return result | (tmp << 32); 1634} 1635 1636uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) 1637{ 1638 uint64_t tmp; 1639 uint64_t result; 1640 1641 DO_MULL(result, a, b, int16_t, uint32_t); 1642 DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); 1643 return result | (tmp << 32); 1644} 1645 1646uint64_t HELPER(neon_negl_u16)(uint64_t x) 1647{ 1648 uint16_t tmp; 1649 uint64_t result; 1650 result = (uint16_t)-x; 1651 tmp = -(x >> 16); 1652 result |= (uint64_t)tmp << 16; 1653 tmp = -(x >> 32); 1654 result |= (uint64_t)tmp << 32; 1655 tmp = -(x >> 48); 1656 result |= (uint64_t)tmp << 48; 1657 return result; 1658} 1659 1660uint64_t HELPER(neon_negl_u32)(uint64_t x) 1661{ 1662 uint32_t low = -x; 1663 uint32_t high = -(x >> 32); 1664 return low | ((uint64_t)high << 32); 1665} 1666 1667/* FIXME: There should be a native op for this. */ 1668uint64_t HELPER(neon_negl_u64)(uint64_t x) 1669{ 1670 return -x; 1671} 1672 1673/* Saturating sign manipulation. */ 1674/* ??? Make these use NEON_VOP1 */ 1675#define DO_QABS8(x) do { \ 1676 if (x == (int8_t)0x80) { \ 1677 x = 0x7f; \ 1678 SET_QC(); \ 1679 } else if (x < 0) { \ 1680 x = -x; \ 1681 }} while (0) 1682uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x) 1683{ 1684 neon_s8 vec; 1685 NEON_UNPACK(neon_s8, vec, x); 1686 DO_QABS8(vec.v1); 1687 DO_QABS8(vec.v2); 1688 DO_QABS8(vec.v3); 1689 DO_QABS8(vec.v4); 1690 NEON_PACK(neon_s8, x, vec); 1691 return x; 1692} 1693#undef DO_QABS8 1694 1695#define DO_QNEG8(x) do { \ 1696 if (x == (int8_t)0x80) { \ 1697 x = 0x7f; \ 1698 SET_QC(); \ 1699 } else { \ 1700 x = -x; \ 1701 }} while (0) 1702uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x) 1703{ 1704 neon_s8 vec; 1705 NEON_UNPACK(neon_s8, vec, x); 1706 DO_QNEG8(vec.v1); 1707 DO_QNEG8(vec.v2); 1708 DO_QNEG8(vec.v3); 1709 DO_QNEG8(vec.v4); 1710 NEON_PACK(neon_s8, x, vec); 1711 return x; 1712} 1713#undef DO_QNEG8 1714 1715#define DO_QABS16(x) do { \ 1716 if (x == (int16_t)0x8000) { \ 1717 x = 0x7fff; \ 1718 SET_QC(); \ 1719 } else if (x < 0) { \ 1720 x = -x; \ 1721 }} while (0) 1722uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x) 1723{ 1724 neon_s16 vec; 1725 NEON_UNPACK(neon_s16, vec, x); 1726 DO_QABS16(vec.v1); 1727 DO_QABS16(vec.v2); 1728 NEON_PACK(neon_s16, x, vec); 1729 return x; 1730} 1731#undef DO_QABS16 1732 1733#define DO_QNEG16(x) do { \ 1734 if (x == (int16_t)0x8000) { \ 1735 x = 0x7fff; \ 1736 SET_QC(); \ 1737 } else { \ 1738 x = -x; \ 1739 }} while (0) 1740uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x) 1741{ 1742 neon_s16 vec; 1743 NEON_UNPACK(neon_s16, vec, x); 1744 DO_QNEG16(vec.v1); 1745 DO_QNEG16(vec.v2); 1746 NEON_PACK(neon_s16, x, vec); 1747 return x; 1748} 1749#undef DO_QNEG16 1750 1751uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x) 1752{ 1753 if (x == SIGNBIT) { 1754 SET_QC(); 1755 x = ~SIGNBIT; 1756 } else if ((int32_t)x < 0) { 1757 x = -x; 1758 } 1759 return x; 1760} 1761 1762uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x) 1763{ 1764 if (x == SIGNBIT) { 1765 SET_QC(); 1766 x = ~SIGNBIT; 1767 } else { 1768 x = -x; 1769 } 1770 return x; 1771} 1772 1773/* NEON Float helpers. */ 1774uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b, void *fpstp) 1775{ 1776 float_status *fpst = fpstp; 1777 return float32_val(float32_min(make_float32(a), make_float32(b), fpst)); 1778} 1779 1780uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b, void *fpstp) 1781{ 1782 float_status *fpst = fpstp; 1783 return float32_val(float32_max(make_float32(a), make_float32(b), fpst)); 1784} 1785 1786uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b, void *fpstp) 1787{ 1788 float_status *fpst = fpstp; 1789 float32 f0 = make_float32(a); 1790 float32 f1 = make_float32(b); 1791 return float32_val(float32_abs(float32_sub(f0, f1, fpst))); 1792} 1793 1794/* Floating point comparisons produce an integer result. 1795 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1796 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1797 */ 1798uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp) 1799{ 1800 float_status *fpst = fpstp; 1801 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst); 1802} 1803 1804uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp) 1805{ 1806 float_status *fpst = fpstp; 1807 return -float32_le(make_float32(b), make_float32(a), fpst); 1808} 1809 1810uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1811{ 1812 float_status *fpst = fpstp; 1813 return -float32_lt(make_float32(b), make_float32(a), fpst); 1814} 1815 1816uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp) 1817{ 1818 float_status *fpst = fpstp; 1819 float32 f0 = float32_abs(make_float32(a)); 1820 float32 f1 = float32_abs(make_float32(b)); 1821 return -float32_le(f1, f0, fpst); 1822} 1823 1824uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp) 1825{ 1826 float_status *fpst = fpstp; 1827 float32 f0 = float32_abs(make_float32(a)); 1828 float32 f1 = float32_abs(make_float32(b)); 1829 return -float32_lt(f1, f0, fpst); 1830} 1831 1832#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1)) 1833 1834void HELPER(neon_qunzip8)(CPUARMState *env, uint32_t rd, uint32_t rm) 1835{ 1836 uint64_t zm0 = float64_val(env->vfp.regs[rm]); 1837 uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); 1838 uint64_t zd0 = float64_val(env->vfp.regs[rd]); 1839 uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); 1840 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8) 1841 | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24) 1842 | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40) 1843 | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56); 1844 uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8) 1845 | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24) 1846 | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1847 | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56); 1848 uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8) 1849 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24) 1850 | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40) 1851 | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56); 1852 uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8) 1853 | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24) 1854 | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40) 1855 | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1856 env->vfp.regs[rm] = make_float64(m0); 1857 env->vfp.regs[rm + 1] = make_float64(m1); 1858 env->vfp.regs[rd] = make_float64(d0); 1859 env->vfp.regs[rd + 1] = make_float64(d1); 1860} 1861 1862void HELPER(neon_qunzip16)(CPUARMState *env, uint32_t rd, uint32_t rm) 1863{ 1864 uint64_t zm0 = float64_val(env->vfp.regs[rm]); 1865 uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); 1866 uint64_t zd0 = float64_val(env->vfp.regs[rd]); 1867 uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); 1868 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16) 1869 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48); 1870 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16) 1871 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48); 1872 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16) 1873 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48); 1874 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16) 1875 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1876 env->vfp.regs[rm] = make_float64(m0); 1877 env->vfp.regs[rm + 1] = make_float64(m1); 1878 env->vfp.regs[rd] = make_float64(d0); 1879 env->vfp.regs[rd + 1] = make_float64(d1); 1880} 1881 1882void HELPER(neon_qunzip32)(CPUARMState *env, uint32_t rd, uint32_t rm) 1883{ 1884 uint64_t zm0 = float64_val(env->vfp.regs[rm]); 1885 uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); 1886 uint64_t zd0 = float64_val(env->vfp.regs[rd]); 1887 uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); 1888 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32); 1889 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1890 uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32); 1891 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1892 env->vfp.regs[rm] = make_float64(m0); 1893 env->vfp.regs[rm + 1] = make_float64(m1); 1894 env->vfp.regs[rd] = make_float64(d0); 1895 env->vfp.regs[rd + 1] = make_float64(d1); 1896} 1897 1898void HELPER(neon_unzip8)(CPUARMState *env, uint32_t rd, uint32_t rm) 1899{ 1900 uint64_t zm = float64_val(env->vfp.regs[rm]); 1901 uint64_t zd = float64_val(env->vfp.regs[rd]); 1902 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8) 1903 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24) 1904 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1905 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56); 1906 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8) 1907 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24) 1908 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40) 1909 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56); 1910 env->vfp.regs[rm] = make_float64(m0); 1911 env->vfp.regs[rd] = make_float64(d0); 1912} 1913 1914void HELPER(neon_unzip16)(CPUARMState *env, uint32_t rd, uint32_t rm) 1915{ 1916 uint64_t zm = float64_val(env->vfp.regs[rm]); 1917 uint64_t zd = float64_val(env->vfp.regs[rd]); 1918 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16) 1919 | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48); 1920 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16) 1921 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48); 1922 env->vfp.regs[rm] = make_float64(m0); 1923 env->vfp.regs[rd] = make_float64(d0); 1924} 1925 1926void HELPER(neon_qzip8)(CPUARMState *env, uint32_t rd, uint32_t rm) 1927{ 1928 uint64_t zm0 = float64_val(env->vfp.regs[rm]); 1929 uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); 1930 uint64_t zd0 = float64_val(env->vfp.regs[rd]); 1931 uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); 1932 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8) 1933 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24) 1934 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40) 1935 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56); 1936 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8) 1937 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24) 1938 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40) 1939 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56); 1940 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8) 1941 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24) 1942 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40) 1943 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56); 1944 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8) 1945 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24) 1946 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40) 1947 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56); 1948 env->vfp.regs[rm] = make_float64(m0); 1949 env->vfp.regs[rm + 1] = make_float64(m1); 1950 env->vfp.regs[rd] = make_float64(d0); 1951 env->vfp.regs[rd + 1] = make_float64(d1); 1952} 1953 1954void HELPER(neon_qzip16)(CPUARMState *env, uint32_t rd, uint32_t rm) 1955{ 1956 uint64_t zm0 = float64_val(env->vfp.regs[rm]); 1957 uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); 1958 uint64_t zd0 = float64_val(env->vfp.regs[rd]); 1959 uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); 1960 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16) 1961 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48); 1962 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16) 1963 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48); 1964 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16) 1965 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48); 1966 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16) 1967 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48); 1968 env->vfp.regs[rm] = make_float64(m0); 1969 env->vfp.regs[rm + 1] = make_float64(m1); 1970 env->vfp.regs[rd] = make_float64(d0); 1971 env->vfp.regs[rd + 1] = make_float64(d1); 1972} 1973 1974void HELPER(neon_qzip32)(CPUARMState *env, uint32_t rd, uint32_t rm) 1975{ 1976 uint64_t zm0 = float64_val(env->vfp.regs[rm]); 1977 uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]); 1978 uint64_t zd0 = float64_val(env->vfp.regs[rd]); 1979 uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]); 1980 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32); 1981 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32); 1982 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32); 1983 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32); 1984 env->vfp.regs[rm] = make_float64(m0); 1985 env->vfp.regs[rm + 1] = make_float64(m1); 1986 env->vfp.regs[rd] = make_float64(d0); 1987 env->vfp.regs[rd + 1] = make_float64(d1); 1988} 1989 1990void HELPER(neon_zip8)(CPUARMState *env, uint32_t rd, uint32_t rm) 1991{ 1992 uint64_t zm = float64_val(env->vfp.regs[rm]); 1993 uint64_t zd = float64_val(env->vfp.regs[rd]); 1994 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8) 1995 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24) 1996 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40) 1997 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56); 1998 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8) 1999 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24) 2000 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40) 2001 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56); 2002 env->vfp.regs[rm] = make_float64(m0); 2003 env->vfp.regs[rd] = make_float64(d0); 2004} 2005 2006void HELPER(neon_zip16)(CPUARMState *env, uint32_t rd, uint32_t rm) 2007{ 2008 uint64_t zm = float64_val(env->vfp.regs[rm]); 2009 uint64_t zd = float64_val(env->vfp.regs[rd]); 2010 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16) 2011 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48); 2012 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16) 2013 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48); 2014 env->vfp.regs[rm] = make_float64(m0); 2015 env->vfp.regs[rd] = make_float64(d0); 2016} 2017