1/*
2 * ARM NEON vector operations.
3 *
4 * Copyright (c) 2007, 2008 CodeSourcery.
5 * Written by Paul Brook
6 *
7 * This code is licenced under the GNU GPL v2.
8 */
9#include <stdlib.h>
10#include <stdio.h>
11
12#include "cpu.h"
13#include "exec.h"
14#include "helper.h"
15
/* Sign-bit masks for 32-bit and 64-bit values.  Both are fully
   parenthesized so they can be used inside any expression.  */
#define SIGNBIT ((uint32_t)0x80000000)
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record that a saturating operation clipped its result by setting the
   CPSR_Q bit in the FPSCR slot of env->vfp.xregs.  The bit is ORed in so
   that every other bit already held in that register is preserved.  */
#define SET_QC() (env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q)

/* Shorthand for the address of env->vfp.standard_fp_status.  */
#define NFS (&env->vfp.standard_fp_status)
22
/* Structure views of one 32-bit chunk as four 8-bit, two 16-bit or one
   32-bit lane.  The member order is reversed when HOST_WORDS_BIGENDIAN is
   defined, so that v1 always occupies the same end of the 32-bit value
   (presumably the least significant one -- confirm against the host
   configuration).  */
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_LANES2(type) type v2; type v1;
#define NEON_LANES4(type) type v4; type v3; type v2; type v1;
#else
#define NEON_LANES2(type) type v1; type v2;
#define NEON_LANES4(type) type v1; type v2; type v3; type v4;
#endif

typedef struct { NEON_LANES4(int8_t) } neon_s8;
typedef struct { NEON_LANES4(uint8_t) } neon_u8;
typedef struct { NEON_LANES2(int16_t) } neon_s16;
typedef struct { NEON_LANES2(uint16_t) } neon_u16;
typedef struct { int32_t v1; } neon_s32;
typedef struct { uint32_t v1; } neon_u32;

#undef NEON_LANES4
#undef NEON_LANES2
69
/* Reinterpret the 32-bit integer VAL as a vector structure of type VTYPE
   and assign it to DEST.  The conversion goes through a union, so no
   pointer casts are involved.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } pun; \
    pun.word = (val); \
    dest = pun.lanes; \
    } while (0)

/* Reinterpret the vector structure VAL of type VTYPE as a 32-bit integer
   and assign it to DEST.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } pun; \
    pun.lanes = (val); \
    dest = pun.word; \
    } while (0)
89
/* Apply the currently defined NEON_FN to each lane of vsrc1/vsrc2 and
   store into the matching lane of vdest.  The numeric suffix is the lane
   count; each wider form starts with the narrower one.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_DO1 \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_DO2 \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
100
/* Function body shared by the two-operand lane-wise helpers.  It expects
   32-bit parameters named arg1 and arg2 to be in scope, unpacks both into
   structures of type VTYPE, applies NEON_FN to N lanes through NEON_DO<n>
   and returns the repacked 32-bit result.  */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define the helper neon_<name> taking two packed 32-bit operands and
   applying the NEON_FN in effect at the point of instantiation to each of
   the N lanes of type VTYPE.  */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
117
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* Apply NEON_FN to adjacent lane pairs.  Pairs taken from vsrc1 fill the
   lower-numbered lanes of vdest and pairs taken from vsrc2 fill the
   higher-numbered ones.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

/* Define the pairwise helper neon_<name>: unpack both 32-bit operands as
   VTYPE, combine adjacent lanes with the NEON_FN in effect at the point
   of instantiation, and return the repacked result.  */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
143
/* Unary operators.  */
/* Define the helper neon_<name> taking a single packed 32-bit operand.
   Only vsrc1 and vdest are declared here, while NEON_DO<n> still passes
   vsrc2.vN as the third NEON_FN argument; the NEON_FN used with this macro
   must therefore ignore that argument.  The parameter itself is reused to
   hold the packed result.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
155
156
157#define NEON_USAT(dest, src1, src2, type) do { \
158    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
159    if (tmp != (type)tmp) { \
160        SET_QC(); \
161        dest = ~0; \
162    } else { \
163        dest = tmp; \
164    }} while(0)
165#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
166NEON_VOP(qadd_u8, neon_u8, 4)
167#undef NEON_FN
168#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
169NEON_VOP(qadd_u16, neon_u16, 2)
170#undef NEON_FN
171#undef NEON_USAT
172
173uint32_t HELPER(neon_qadd_u32)(uint32_t a, uint32_t b)
174{
175    uint32_t res = a + b;
176    if (res < a) {
177        SET_QC();
178        res = ~0;
179    }
180    return res;
181}
182
183uint64_t HELPER(neon_qadd_u64)(uint64_t src1, uint64_t src2)
184{
185    uint64_t res;
186
187    res = src1 + src2;
188    if (res < src1) {
189        SET_QC();
190        res = ~(uint64_t)0;
191    }
192    return res;
193}
194
195#define NEON_SSAT(dest, src1, src2, type) do { \
196    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
197    if (tmp != (type)tmp) { \
198        SET_QC(); \
199        if (src2 > 0) { \
200            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
201        } else { \
202            tmp = 1 << (sizeof(type) * 8 - 1); \
203        } \
204    } \
205    dest = tmp; \
206    } while(0)
207#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
208NEON_VOP(qadd_s8, neon_s8, 4)
209#undef NEON_FN
210#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
211NEON_VOP(qadd_s16, neon_s16, 2)
212#undef NEON_FN
213#undef NEON_SSAT
214
215uint32_t HELPER(neon_qadd_s32)(uint32_t a, uint32_t b)
216{
217    uint32_t res = a + b;
218    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
219        SET_QC();
220        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
221    }
222    return res;
223}
224
225uint64_t HELPER(neon_qadd_s64)(uint64_t src1, uint64_t src2)
226{
227    uint64_t res;
228
229    res = src1 + src2;
230    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
231        SET_QC();
232        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
233    }
234    return res;
235}
236
237#define NEON_USAT(dest, src1, src2, type) do { \
238    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
239    if (tmp != (type)tmp) { \
240        SET_QC(); \
241        dest = 0; \
242    } else { \
243        dest = tmp; \
244    }} while(0)
245#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
246NEON_VOP(qsub_u8, neon_u8, 4)
247#undef NEON_FN
248#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
249NEON_VOP(qsub_u16, neon_u16, 2)
250#undef NEON_FN
251#undef NEON_USAT
252
253uint32_t HELPER(neon_qsub_u32)(uint32_t a, uint32_t b)
254{
255    uint32_t res = a - b;
256    if (res > a) {
257        SET_QC();
258        res = 0;
259    }
260    return res;
261}
262
263uint64_t HELPER(neon_qsub_u64)(uint64_t src1, uint64_t src2)
264{
265    uint64_t res;
266
267    if (src1 < src2) {
268        SET_QC();
269        res = 0;
270    } else {
271        res = src1 - src2;
272    }
273    return res;
274}
275
276#define NEON_SSAT(dest, src1, src2, type) do { \
277    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
278    if (tmp != (type)tmp) { \
279        SET_QC(); \
280        if (src2 < 0) { \
281            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
282        } else { \
283            tmp = 1 << (sizeof(type) * 8 - 1); \
284        } \
285    } \
286    dest = tmp; \
287    } while(0)
288#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
289NEON_VOP(qsub_s8, neon_s8, 4)
290#undef NEON_FN
291#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
292NEON_VOP(qsub_s16, neon_s16, 2)
293#undef NEON_FN
294#undef NEON_SSAT
295
296uint32_t HELPER(neon_qsub_s32)(uint32_t a, uint32_t b)
297{
298    uint32_t res = a - b;
299    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
300        SET_QC();
301        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
302    }
303    return res;
304}
305
306uint64_t HELPER(neon_qsub_s64)(uint64_t src1, uint64_t src2)
307{
308    uint64_t res;
309
310    res = src1 - src2;
311    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
312        SET_QC();
313        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
314    }
315    return res;
316}
317
/* Halving add for 8-bit and 16-bit lanes.  The lane values are promoted
   to int before the addition, so the intermediate sum cannot overflow
   before it is shifted right by one.  */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN
324
325int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
326{
327    int32_t dest;
328
329    dest = (src1 >> 1) + (src2 >> 1);
330    if (src1 & src2 & 1)
331        dest++;
332    return dest;
333}
334
335uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
336{
337    uint32_t dest;
338
339    dest = (src1 >> 1) + (src2 >> 1);
340    if (src1 & src2 & 1)
341        dest++;
342    return dest;
343}
344
/* Rounding halving add for 8-bit and 16-bit lanes: one is added to the
   int-promoted sum before it is shifted right by one.  */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN
351
352int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
353{
354    int32_t dest;
355
356    dest = (src1 >> 1) + (src2 >> 1);
357    if ((src1 | src2) & 1)
358        dest++;
359    return dest;
360}
361
362uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
363{
364    uint32_t dest;
365
366    dest = (src1 >> 1) + (src2 >> 1);
367    if ((src1 | src2) & 1)
368        dest++;
369    return dest;
370}
371
/* Halving subtract for 8-bit and 16-bit lanes: the difference is formed
   on the int-promoted lane values and then shifted right by one.  */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN
378
379int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
380{
381    int32_t dest;
382
383    dest = (src1 >> 1) - (src2 >> 1);
384    if ((~src1) & src2 & 1)
385        dest--;
386    return dest;
387}
388
389uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
390{
391    uint32_t dest;
392
393    dest = (src1 >> 1) - (src2 >> 1);
394    if ((~src1) & src2 & 1)
395        dest--;
396    return dest;
397}
398
/* Lane-wise "greater than": a lane becomes all ones when src1 > src2 and
   zero otherwise.  Signedness of the comparison follows the lane type.  */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
NEON_VOP(cgt_s8, neon_s8, 4)
NEON_VOP(cgt_u8, neon_u8, 4)
NEON_VOP(cgt_s16, neon_s16, 2)
NEON_VOP(cgt_u16, neon_u16, 2)
NEON_VOP(cgt_s32, neon_s32, 1)
NEON_VOP(cgt_u32, neon_u32, 1)
#undef NEON_FN

/* Lane-wise "greater than or equal": a lane becomes all ones when
   src1 >= src2 and zero otherwise.  */
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
NEON_VOP(cge_s8, neon_s8, 4)
NEON_VOP(cge_u8, neon_u8, 4)
NEON_VOP(cge_s16, neon_s16, 2)
NEON_VOP(cge_u16, neon_u16, 2)
NEON_VOP(cge_s32, neon_s32, 1)
NEON_VOP(cge_u32, neon_u32, 1)
#undef NEON_FN
416
/* Lane-wise minimum, plus the pairwise variants for 8-bit and 16-bit
   lanes, which take the minimum of adjacent lanes.  */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_VOP(min_s8, neon_s8, 4)
NEON_VOP(min_u8, neon_u8, 4)
NEON_VOP(min_s16, neon_s16, 2)
NEON_VOP(min_u16, neon_u16, 2)
NEON_VOP(min_s32, neon_s32, 1)
NEON_VOP(min_u32, neon_u32, 1)
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise maximum, plus the pairwise variants for 8-bit and 16-bit
   lanes, which take the maximum of adjacent lanes.  */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_VOP(max_s8, neon_s8, 4)
NEON_VOP(max_u8, neon_u8, 4)
NEON_VOP(max_s16, neon_s16, 2)
NEON_VOP(max_u16, neon_u16, 2)
NEON_VOP(max_s32, neon_s32, 1)
NEON_VOP(max_u32, neon_u32, 1)
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
442
/* Lane-wise absolute difference: the smaller operand is always
   subtracted from the larger one.
   NOTE(review): for the s32 variant the subtraction is carried out in
   int32_t, so operands of opposite sign that are far apart look like
   they can overflow -- confirm whether that case matters.  */
#define NEON_FN(dest, src1, src2) \
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
NEON_VOP(abd_s8, neon_s8, 4)
NEON_VOP(abd_u8, neon_u8, 4)
NEON_VOP(abd_s16, neon_s16, 2)
NEON_VOP(abd_u16, neon_u16, 2)
NEON_VOP(abd_s32, neon_s32, 1)
NEON_VOP(abd_u32, neon_u32, 1)
#undef NEON_FN
452
453#define NEON_FN(dest, src1, src2) do { \
454    int8_t tmp; \
455    tmp = (int8_t)src2; \
456    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
457        tmp <= -(ssize_t)sizeof(src1) * 8) { \
458        dest = 0; \
459    } else if (tmp < 0) { \
460        dest = src1 >> -tmp; \
461    } else { \
462        dest = src1 << tmp; \
463    }} while (0)
464NEON_VOP(shl_u8, neon_u8, 4)
465NEON_VOP(shl_u16, neon_u16, 2)
466NEON_VOP(shl_u32, neon_u32, 1)
467#undef NEON_FN
468
469uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
470{
471    int8_t shift = (int8_t)shiftop;
472    if (shift >= 64 || shift <= -64) {
473        val = 0;
474    } else if (shift < 0) {
475        val >>= -shift;
476    } else {
477        val <<= shift;
478    }
479    return val;
480}
481
482#define NEON_FN(dest, src1, src2) do { \
483    int8_t tmp; \
484    tmp = (int8_t)src2; \
485    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
486        dest = 0; \
487    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
488        dest = src1 >> (sizeof(src1) * 8 - 1); \
489    } else if (tmp < 0) { \
490        dest = src1 >> -tmp; \
491    } else { \
492        dest = src1 << tmp; \
493    }} while (0)
494NEON_VOP(shl_s8, neon_s8, 4)
495NEON_VOP(shl_s16, neon_s16, 2)
496NEON_VOP(shl_s32, neon_s32, 1)
497#undef NEON_FN
498
499uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
500{
501    int8_t shift = (int8_t)shiftop;
502    int64_t val = valop;
503    if (shift >= 64) {
504        val = 0;
505    } else if (shift <= -64) {
506        val >>= 63;
507    } else if (shift < 0) {
508        val >>= -shift;
509    } else {
510        val <<= shift;
511    }
512    return val;
513}
514
515#define NEON_FN(dest, src1, src2) do { \
516    int8_t tmp; \
517    tmp = (int8_t)src2; \
518    if ((tmp >= (ssize_t)sizeof(src1) * 8) \
519        || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
520        dest = 0; \
521    } else if (tmp < 0) { \
522        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
523    } else { \
524        dest = src1 << tmp; \
525    }} while (0)
526NEON_VOP(rshl_s8, neon_s8, 4)
527NEON_VOP(rshl_s16, neon_s16, 2)
528#undef NEON_FN
529
530/* The addition of the rounding constant may overflow, so we use an
531 * intermediate 64 bits accumulator.  */
532uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
533{
534    int32_t dest;
535    int32_t val = (int32_t)valop;
536    int8_t shift = (int8_t)shiftop;
537    if ((shift >= 32) || (shift <= -32)) {
538        dest = 0;
539    } else if (shift < 0) {
540        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
541        dest = big_dest >> -shift;
542    } else {
543        dest = val << shift;
544    }
545    return dest;
546}
547
548/* Handling addition overflow with 64 bits inputs values is more
549 * tricky than with 32 bits values.  */
550uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
551{
552    int8_t shift = (int8_t)shiftop;
553    int64_t val = valop;
554    if ((shift >= 64) || (shift <= -64)) {
555        val = 0;
556    } else if (shift < 0) {
557        val >>= (-shift - 1);
558        if (val == INT64_MAX) {
559            /* In this case, it means that the rounding constant is 1,
560             * and the addition would overflow. Return the actual
561             * result directly.  */
562            val = 0x4000000000000000LL;
563        } else {
564            val++;
565            val >>= 1;
566        }
567    } else {
568        val <<= shift;
569    }
570    return val;
571}
572
573#define NEON_FN(dest, src1, src2) do { \
574    int8_t tmp; \
575    tmp = (int8_t)src2; \
576    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
577        tmp < -(ssize_t)sizeof(src1) * 8) { \
578        dest = 0; \
579    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
580        dest = src1 >> (-tmp - 1); \
581    } else if (tmp < 0) { \
582        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
583    } else { \
584        dest = src1 << tmp; \
585    }} while (0)
586NEON_VOP(rshl_u8, neon_u8, 4)
587NEON_VOP(rshl_u16, neon_u16, 2)
588#undef NEON_FN
589
590/* The addition of the rounding constant may overflow, so we use an
591 * intermediate 64 bits accumulator.  */
592uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
593{
594    uint32_t dest;
595    int8_t shift = (int8_t)shiftop;
596    if (shift >= 32 || shift < -32) {
597        dest = 0;
598    } else if (shift == -32) {
599        dest = val >> 31;
600    } else if (shift < 0) {
601        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
602        dest = big_dest >> -shift;
603    } else {
604        dest = val << shift;
605    }
606    return dest;
607}
608
609/* Handling addition overflow with 64 bits inputs values is more
610 * tricky than with 32 bits values.  */
611uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
612{
613    int8_t shift = (uint8_t)shiftop;
614    if (shift >= 64 || shift < -64) {
615        val = 0;
616    } else if (shift == -64) {
617        /* Rounding a 1-bit result just preserves that bit.  */
618        val >>= 63;
619    } else if (shift < 0) {
620        val >>= (-shift - 1);
621        if (val == UINT64_MAX) {
622            /* In this case, it means that the rounding constant is 1,
623             * and the addition would overflow. Return the actual
624             * result directly.  */
625            val = 0x8000000000000000ULL;
626        } else {
627            val++;
628            val >>= 1;
629        }
630    } else {
631        val <<= shift;
632    }
633    return val;
634}
635
636#define NEON_FN(dest, src1, src2) do { \
637    int8_t tmp; \
638    tmp = (int8_t)src2; \
639    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
640        if (src1) { \
641            SET_QC(); \
642            dest = ~0; \
643        } else { \
644            dest = 0; \
645        } \
646    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
647        dest = 0; \
648    } else if (tmp < 0) { \
649        dest = src1 >> -tmp; \
650    } else { \
651        dest = src1 << tmp; \
652        if ((dest >> tmp) != src1) { \
653            SET_QC(); \
654            dest = ~0; \
655        } \
656    }} while (0)
657NEON_VOP(qshl_u8, neon_u8, 4)
658NEON_VOP(qshl_u16, neon_u16, 2)
659NEON_VOP(qshl_u32, neon_u32, 1)
660#undef NEON_FN
661
662uint64_t HELPER(neon_qshl_u64)(uint64_t val, uint64_t shiftop)
663{
664    int8_t shift = (int8_t)shiftop;
665    if (shift >= 64) {
666        if (val) {
667            val = ~(uint64_t)0;
668            SET_QC();
669        }
670    } else if (shift <= -64) {
671        val = 0;
672    } else if (shift < 0) {
673        val >>= -shift;
674    } else {
675        uint64_t tmp = val;
676        val <<= shift;
677        if ((val >> shift) != tmp) {
678            SET_QC();
679            val = ~(uint64_t)0;
680        }
681    }
682    return val;
683}
684
685#define NEON_FN(dest, src1, src2) do { \
686    int8_t tmp; \
687    tmp = (int8_t)src2; \
688    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
689        if (src1) { \
690            SET_QC(); \
691            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
692            if (src1 > 0) { \
693                dest--; \
694            } \
695        } else { \
696            dest = src1; \
697        } \
698    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
699        dest = src1 >> 31; \
700    } else if (tmp < 0) { \
701        dest = src1 >> -tmp; \
702    } else { \
703        dest = src1 << tmp; \
704        if ((dest >> tmp) != src1) { \
705            SET_QC(); \
706            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
707            if (src1 > 0) { \
708                dest--; \
709            } \
710        } \
711    }} while (0)
712NEON_VOP(qshl_s8, neon_s8, 4)
713NEON_VOP(qshl_s16, neon_s16, 2)
714NEON_VOP(qshl_s32, neon_s32, 1)
715#undef NEON_FN
716
717uint64_t HELPER(neon_qshl_s64)(uint64_t valop, uint64_t shiftop)
718{
719    int8_t shift = (uint8_t)shiftop;
720    int64_t val = valop;
721    if (shift >= 64) {
722        if (val) {
723            SET_QC();
724            val = (val >> 63) ^ ~SIGNBIT64;
725        }
726    } else if (shift <= -64) {
727        val >>= 63;
728    } else if (shift < 0) {
729        val >>= -shift;
730    } else {
731        int64_t tmp = val;
732        val <<= shift;
733        if ((val >> shift) != tmp) {
734            SET_QC();
735            val = (tmp >> 63) ^ ~SIGNBIT64;
736        }
737    }
738    return val;
739}
740
741#define NEON_FN(dest, src1, src2) do { \
742    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
743        SET_QC(); \
744        dest = 0; \
745    } else { \
746        int8_t tmp; \
747        tmp = (int8_t)src2; \
748        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
749            if (src1) { \
750                SET_QC(); \
751                dest = ~0; \
752            } else { \
753                dest = 0; \
754            } \
755        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
756            dest = 0; \
757        } else if (tmp < 0) { \
758            dest = src1 >> -tmp; \
759        } else { \
760            dest = src1 << tmp; \
761            if ((dest >> tmp) != src1) { \
762                SET_QC(); \
763                dest = ~0; \
764            } \
765        } \
766    }} while (0)
767NEON_VOP(qshlu_s8, neon_u8, 4)
768NEON_VOP(qshlu_s16, neon_u16, 2)
769#undef NEON_FN
770
771uint32_t HELPER(neon_qshlu_s32)(uint32_t valop, uint32_t shiftop)
772{
773    if ((int32_t)valop < 0) {
774        SET_QC();
775        return 0;
776    }
777    return helper_neon_qshl_u32(valop, shiftop);
778}
779
780uint64_t HELPER(neon_qshlu_s64)(uint64_t valop, uint64_t shiftop)
781{
782    if ((int64_t)valop < 0) {
783        SET_QC();
784        return 0;
785    }
786    return helper_neon_qshl_u64(valop, shiftop);
787}
788
789/* FIXME: This is wrong.  */
790#define NEON_FN(dest, src1, src2) do { \
791    int8_t tmp; \
792    tmp = (int8_t)src2; \
793    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
794        if (src1) { \
795            SET_QC(); \
796            dest = ~0; \
797        } else { \
798            dest = 0; \
799        } \
800    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
801        dest = 0; \
802    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
803        dest = src1 >> (sizeof(src1) * 8 - 1); \
804    } else if (tmp < 0) { \
805        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
806    } else { \
807        dest = src1 << tmp; \
808        if ((dest >> tmp) != src1) { \
809            SET_QC(); \
810            dest = ~0; \
811        } \
812    }} while (0)
813NEON_VOP(qrshl_u8, neon_u8, 4)
814NEON_VOP(qrshl_u16, neon_u16, 2)
815#undef NEON_FN
816
817/* The addition of the rounding constant may overflow, so we use an
818 * intermediate 64 bits accumulator.  */
819uint32_t HELPER(neon_qrshl_u32)(uint32_t val, uint32_t shiftop)
820{
821    uint32_t dest;
822    int8_t shift = (int8_t)shiftop;
823    if (shift >= 32) {
824        if (val) {
825            SET_QC();
826            dest = ~0;
827        } else {
828            dest = 0;
829        }
830    } else if (shift < -32) {
831        dest = 0;
832    } else if (shift == -32) {
833        dest = val >> 31;
834    } else if (shift < 0) {
835        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
836        dest = big_dest >> -shift;
837    } else {
838        dest = val << shift;
839        if ((dest >> shift) != val) {
840            SET_QC();
841            dest = ~0;
842        }
843    }
844    return dest;
845}
846
847/* Handling addition overflow with 64 bits inputs values is more
848 * tricky than with 32 bits values.  */
849uint64_t HELPER(neon_qrshl_u64)(uint64_t val, uint64_t shiftop)
850{
851    int8_t shift = (int8_t)shiftop;
852    if (shift >= 64) {
853        if (val) {
854            SET_QC();
855            val = ~0;
856        }
857    } else if (shift < -64) {
858        val = 0;
859    } else if (shift == -64) {
860        val >>= 63;
861    } else if (shift < 0) {
862        val >>= (-shift - 1);
863        if (val == UINT64_MAX) {
864            /* In this case, it means that the rounding constant is 1,
865             * and the addition would overflow. Return the actual
866             * result directly.  */
867            val = 0x8000000000000000ULL;
868        } else {
869            val++;
870            val >>= 1;
871        }
872    } else { \
873        uint64_t tmp = val;
874        val <<= shift;
875        if ((val >> shift) != tmp) {
876            SET_QC();
877            val = ~0;
878        }
879    }
880    return val;
881}
882
883#define NEON_FN(dest, src1, src2) do { \
884    int8_t tmp; \
885    tmp = (int8_t)src2; \
886    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
887        if (src1) { \
888            SET_QC(); \
889            dest = (1 << (sizeof(src1) * 8 - 1)); \
890            if (src1 > 0) { \
891                dest--; \
892            } \
893        } else { \
894            dest = 0; \
895        } \
896    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
897        dest = 0; \
898    } else if (tmp < 0) { \
899        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
900    } else { \
901        dest = src1 << tmp; \
902        if ((dest >> tmp) != src1) { \
903            SET_QC(); \
904            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
905            if (src1 > 0) { \
906                dest--; \
907            } \
908        } \
909    }} while (0)
910NEON_VOP(qrshl_s8, neon_s8, 4)
911NEON_VOP(qrshl_s16, neon_s16, 2)
912#undef NEON_FN
913
914/* The addition of the rounding constant may overflow, so we use an
915 * intermediate 64 bits accumulator.  */
916uint32_t HELPER(neon_qrshl_s32)(uint32_t valop, uint32_t shiftop)
917{
918    int32_t dest;
919    int32_t val = (int32_t)valop;
920    int8_t shift = (int8_t)shiftop;
921    if (shift >= 32) {
922        if (val) {
923            SET_QC();
924            dest = (val >> 31) ^ ~SIGNBIT;
925        } else {
926            dest = 0;
927        }
928    } else if (shift <= -32) {
929        dest = 0;
930    } else if (shift < 0) {
931        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
932        dest = big_dest >> -shift;
933    } else {
934        dest = val << shift;
935        if ((dest >> shift) != val) {
936            SET_QC();
937            dest = (val >> 31) ^ ~SIGNBIT;
938        }
939    }
940    return dest;
941}
942
943/* Handling addition overflow with 64 bits inputs values is more
944 * tricky than with 32 bits values.  */
945uint64_t HELPER(neon_qrshl_s64)(uint64_t valop, uint64_t shiftop)
946{
947    int8_t shift = (uint8_t)shiftop;
948    int64_t val = valop;
949
950    if (shift >= 64) {
951        if (val) {
952            SET_QC();
953            val = (val >> 63) ^ ~SIGNBIT64;
954        }
955    } else if (shift <= -64) {
956        val = 0;
957    } else if (shift < 0) {
958        val >>= (-shift - 1);
959        if (val == INT64_MAX) {
960            /* In this case, it means that the rounding constant is 1,
961             * and the addition would overflow. Return the actual
962             * result directly.  */
963            val = 0x4000000000000000ULL;
964        } else {
965            val++;
966            val >>= 1;
967        }
968    } else {
969        int64_t tmp = val;
970        val <<= shift;
971        if ((val >> shift) != tmp) {
972            SET_QC();
973            val = (tmp >> 63) ^ ~SIGNBIT64;
974        }
975    }
976    return val;
977}
978
979uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
980{
981    uint32_t mask;
982    mask = (a ^ b) & 0x80808080u;
983    a &= ~0x80808080u;
984    b &= ~0x80808080u;
985    return (a + b) ^ mask;
986}
987
988uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
989{
990    uint32_t mask;
991    mask = (a ^ b) & 0x80008000u;
992    a &= ~0x80008000u;
993    b &= ~0x80008000u;
994    return (a + b) ^ mask;
995}
996
/* Pairwise add: each result lane is the sum of two adjacent source
   lanes, truncated to the lane width on assignment.  */
#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise subtract, truncated to the lane width on assignment.  */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiply keeping only the low bits that fit the lane.  */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN
1011
1012/* Polynomial multiplication is like integer multiplication except the
1013   partial products are XORed, not added.  */
1014uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
1015{
1016    uint32_t mask;
1017    uint32_t result;
1018    result = 0;
1019    while (op1) {
1020        mask = 0;
1021        if (op1 & 1)
1022            mask |= 0xff;
1023        if (op1 & (1 << 8))
1024            mask |= (0xff << 8);
1025        if (op1 & (1 << 16))
1026            mask |= (0xff << 16);
1027        if (op1 & (1 << 24))
1028            mask |= (0xff << 24);
1029        result ^= op2 & mask;
1030        op1 = (op1 >> 1) & 0x7f7f7f7f;
1031        op2 = (op2 << 1) & 0xfefefefe;
1032    }
1033    return result;
1034}
1035
1036uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
1037{
1038    uint64_t result = 0;
1039    uint64_t mask;
1040    uint64_t op2ex = op2;
1041    op2ex = (op2ex & 0xff) |
1042        ((op2ex & 0xff00) << 8) |
1043        ((op2ex & 0xff0000) << 16) |
1044        ((op2ex & 0xff000000) << 24);
1045    while (op1) {
1046        mask = 0;
1047        if (op1 & 1) {
1048            mask |= 0xffff;
1049        }
1050        if (op1 & (1 << 8)) {
1051            mask |= (0xffffU << 16);
1052        }
1053        if (op1 & (1 << 16)) {
1054            mask |= (0xffffULL << 32);
1055        }
1056        if (op1 & (1 << 24)) {
1057            mask |= (0xffffULL << 48);
1058        }
1059        result ^= op2ex & mask;
1060        op1 = (op1 >> 1) & 0x7f7f7f7f;
1061        op2ex <<= 1;
1062    }
1063    return result;
1064}
1065
/* Lane-wise bit test: a lane becomes all ones when the two operands
   share at least one set bit, zero otherwise.  */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Lane-wise equality: a lane becomes all ones when the operands are
   equal, zero otherwise.  */
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
NEON_VOP(ceq_u8, neon_u8, 4)
NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN
1077
/* Lane-wise absolute value.  The third NEON_FN argument is unused, as
   required by NEON_VOP1.  The negation happens on the int-promoted lane
   and is truncated back to the lane type on assignment.  */
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
NEON_VOP1(abs_s8, neon_s8, 4)
NEON_VOP1(abs_s16, neon_s16, 2)
#undef NEON_FN
1082
/* Count Leading Sign/Zero Bits.  */
/* Return the number of leading zero bits in the 8-bit value x; the
   result is 8 when x is zero.  */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    /* Every significant bit shifted out removes one leading zero.  */
    while (x) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1091
/* Return the number of leading zero bits in the 16-bit value x; the
   result is 16 when x is zero.  */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    while (x) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1099
/* Lane-wise count of leading zero bits for 8-bit lanes.  */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

/* Lane-wise count of leading zero bits for 16-bit lanes.  */
#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise count of leading sign bits for 8-bit lanes: negative values
   are complemented first, then the leading zeros are counted and the
   sign bit itself is excluded by subtracting one.  */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

/* Same as above for 16-bit lanes.  */
#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
1115
1116uint32_t HELPER(neon_cls_s32)(uint32_t x)
1117{
1118    int count;
1119    if ((int32_t)x < 0)
1120        x = ~x;
1121    for (count = 32; x; count--)
1122        x = x >> 1;
1123    return count - 1;
1124}
1125
1126/* Bit count.  */
1127uint32_t HELPER(neon_cnt_u8)(uint32_t x)
1128{
1129    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
1130    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
1131    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
1132    return x;
1133}
1134
/* Saturating doubling multiply returning the high half, for 16-bit
   lanes.  The operands are multiplied as signed 16-bit values into a
   32-bit product.  If bits 31 and 30 of the product differ, doubling it
   would overflow, so SET_QC() is invoked and a saturated value is used
   instead of the doubled product.  When ROUND is non-zero, 1 << 15 is
   added afterwards; if that addition makes the value (viewed as signed)
   smaller than before, it overflowed, so SET_QC() is invoked and the
   value becomes SIGNBIT - 1.  The lane receives the upper 16 bits.  */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
/* Truncating variant.  */
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
/* Rounding variant.  */
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16
1160
1161#define NEON_QDMULH32(dest, src1, src2, round) do { \
1162    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
1163    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
1164        SET_QC(); \
1165        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
1166    } else { \
1167        tmp <<= 1; \
1168    } \
1169    if (round) { \
1170        int64_t old = tmp; \
1171        tmp += (int64_t)1 << 31; \
1172        if ((int64_t)tmp < old) { \
1173            SET_QC(); \
1174            tmp = SIGNBIT64 - 1; \
1175        } \
1176    } \
1177    dest = tmp >> 32; \
1178    } while(0)
1179#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
1180NEON_VOP(qdmulh_s32, neon_s32, 1)
1181#undef NEON_FN
1182#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
1183NEON_VOP(qrdmulh_s32, neon_s32, 1)
1184#undef NEON_FN
1185#undef NEON_QDMULH32
1186
1187uint32_t HELPER(neon_narrow_u8)(uint64_t x)
1188{
1189    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
1190           | ((x >> 24) & 0xff000000u);
1191}
1192
1193uint32_t HELPER(neon_narrow_u16)(uint64_t x)
1194{
1195    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
1196}
1197
1198uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
1199{
1200    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1201            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1202}
1203
1204uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
1205{
1206    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1207}
1208
1209uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
1210{
1211    x &= 0xff80ff80ff80ff80ull;
1212    x += 0x0080008000800080ull;
1213    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1214            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1215}
1216
1217uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
1218{
1219    x &= 0xffff8000ffff8000ull;
1220    x += 0x0000800000008000ull;
1221    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1222}
1223
1224uint32_t HELPER(neon_unarrow_sat8)(uint64_t x)
1225{
1226    uint16_t s;
1227    uint8_t d;
1228    uint32_t res = 0;
1229#define SAT8(n) \
1230    s = x >> n; \
1231    if (s & 0x8000) { \
1232        SET_QC(); \
1233    } else { \
1234        if (s > 0xff) { \
1235            d = 0xff; \
1236            SET_QC(); \
1237        } else  { \
1238            d = s; \
1239        } \
1240        res |= (uint32_t)d << (n / 2); \
1241    }
1242
1243    SAT8(0);
1244    SAT8(16);
1245    SAT8(32);
1246    SAT8(48);
1247#undef SAT8
1248    return res;
1249}
1250
1251uint32_t HELPER(neon_narrow_sat_u8)(uint64_t x)
1252{
1253    uint16_t s;
1254    uint8_t d;
1255    uint32_t res = 0;
1256#define SAT8(n) \
1257    s = x >> n; \
1258    if (s > 0xff) { \
1259        d = 0xff; \
1260        SET_QC(); \
1261    } else  { \
1262        d = s; \
1263    } \
1264    res |= (uint32_t)d << (n / 2);
1265
1266    SAT8(0);
1267    SAT8(16);
1268    SAT8(32);
1269    SAT8(48);
1270#undef SAT8
1271    return res;
1272}
1273
1274uint32_t HELPER(neon_narrow_sat_s8)(uint64_t x)
1275{
1276    int16_t s;
1277    uint8_t d;
1278    uint32_t res = 0;
1279#define SAT8(n) \
1280    s = x >> n; \
1281    if (s != (int8_t)s) { \
1282        d = (s >> 15) ^ 0x7f; \
1283        SET_QC(); \
1284    } else  { \
1285        d = s; \
1286    } \
1287    res |= (uint32_t)d << (n / 2);
1288
1289    SAT8(0);
1290    SAT8(16);
1291    SAT8(32);
1292    SAT8(48);
1293#undef SAT8
1294    return res;
1295}
1296
1297uint32_t HELPER(neon_unarrow_sat16)(uint64_t x)
1298{
1299    uint32_t high;
1300    uint32_t low;
1301    low = x;
1302    if (low & 0x80000000) {
1303        low = 0;
1304        SET_QC();
1305    } else if (low > 0xffff) {
1306        low = 0xffff;
1307        SET_QC();
1308    }
1309    high = x >> 32;
1310    if (high & 0x80000000) {
1311        high = 0;
1312        SET_QC();
1313    } else if (high > 0xffff) {
1314        high = 0xffff;
1315        SET_QC();
1316    }
1317    return low | (high << 16);
1318}
1319
1320uint32_t HELPER(neon_narrow_sat_u16)(uint64_t x)
1321{
1322    uint32_t high;
1323    uint32_t low;
1324    low = x;
1325    if (low > 0xffff) {
1326        low = 0xffff;
1327        SET_QC();
1328    }
1329    high = x >> 32;
1330    if (high > 0xffff) {
1331        high = 0xffff;
1332        SET_QC();
1333    }
1334    return low | (high << 16);
1335}
1336
1337uint32_t HELPER(neon_narrow_sat_s16)(uint64_t x)
1338{
1339    int32_t low;
1340    int32_t high;
1341    low = x;
1342    if (low != (int16_t)low) {
1343        low = (low >> 31) ^ 0x7fff;
1344        SET_QC();
1345    }
1346    high = x >> 32;
1347    if (high != (int16_t)high) {
1348        high = (high >> 31) ^ 0x7fff;
1349        SET_QC();
1350    }
1351    return (uint16_t)low | (high << 16);
1352}
1353
1354uint32_t HELPER(neon_unarrow_sat32)(uint64_t x)
1355{
1356    if (x & 0x8000000000000000ull) {
1357        SET_QC();
1358        return 0;
1359    }
1360    if (x > 0xffffffffu) {
1361        SET_QC();
1362        return 0xffffffffu;
1363    }
1364    return x;
1365}
1366
1367uint32_t HELPER(neon_narrow_sat_u32)(uint64_t x)
1368{
1369    if (x > 0xffffffffu) {
1370        SET_QC();
1371        return 0xffffffffu;
1372    }
1373    return x;
1374}
1375
1376uint32_t HELPER(neon_narrow_sat_s32)(uint64_t x)
1377{
1378    if ((int64_t)x != (int32_t)x) {
1379        SET_QC();
1380        return ((int64_t)x >> 63) ^ 0x7fffffff;
1381    }
1382    return x;
1383}
1384
1385uint64_t HELPER(neon_widen_u8)(uint32_t x)
1386{
1387    uint64_t tmp;
1388    uint64_t ret;
1389    ret = (uint8_t)x;
1390    tmp = (uint8_t)(x >> 8);
1391    ret |= tmp << 16;
1392    tmp = (uint8_t)(x >> 16);
1393    ret |= tmp << 32;
1394    tmp = (uint8_t)(x >> 24);
1395    ret |= tmp << 48;
1396    return ret;
1397}
1398
1399uint64_t HELPER(neon_widen_s8)(uint32_t x)
1400{
1401    uint64_t tmp;
1402    uint64_t ret;
1403    ret = (uint16_t)(int8_t)x;
1404    tmp = (uint16_t)(int8_t)(x >> 8);
1405    ret |= tmp << 16;
1406    tmp = (uint16_t)(int8_t)(x >> 16);
1407    ret |= tmp << 32;
1408    tmp = (uint16_t)(int8_t)(x >> 24);
1409    ret |= tmp << 48;
1410    return ret;
1411}
1412
1413uint64_t HELPER(neon_widen_u16)(uint32_t x)
1414{
1415    uint64_t high = (uint16_t)(x >> 16);
1416    return ((uint16_t)x) | (high << 32);
1417}
1418
1419uint64_t HELPER(neon_widen_s16)(uint32_t x)
1420{
1421    uint64_t high = (int16_t)(x >> 16);
1422    return ((uint32_t)(int16_t)x) | (high << 32);
1423}
1424
1425uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
1426{
1427    uint64_t mask;
1428    mask = (a ^ b) & 0x8000800080008000ull;
1429    a &= ~0x8000800080008000ull;
1430    b &= ~0x8000800080008000ull;
1431    return (a + b) ^ mask;
1432}
1433
1434uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
1435{
1436    uint64_t mask;
1437    mask = (a ^ b) & 0x8000000080000000ull;
1438    a &= ~0x8000000080000000ull;
1439    b &= ~0x8000000080000000ull;
1440    return (a + b) ^ mask;
1441}
1442
1443uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
1444{
1445    uint64_t tmp;
1446    uint64_t tmp2;
1447
1448    tmp = a & 0x0000ffff0000ffffull;
1449    tmp += (a >> 16) & 0x0000ffff0000ffffull;
1450    tmp2 = b & 0xffff0000ffff0000ull;
1451    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
1452    return    ( tmp         & 0xffff)
1453            | ((tmp  >> 16) & 0xffff0000ull)
1454            | ((tmp2 << 16) & 0xffff00000000ull)
1455            | ( tmp2        & 0xffff000000000000ull);
1456}
1457
1458uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
1459{
1460    uint32_t low = a + (a >> 32);
1461    uint32_t high = b + (b >> 32);
1462    return low + ((uint64_t)high << 32);
1463}
1464
1465uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
1466{
1467    uint64_t mask;
1468    mask = (a ^ ~b) & 0x8000800080008000ull;
1469    a |= 0x8000800080008000ull;
1470    b &= ~0x8000800080008000ull;
1471    return (a - b) ^ mask;
1472}
1473
1474uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
1475{
1476    uint64_t mask;
1477    mask = (a ^ ~b) & 0x8000000080000000ull;
1478    a |= 0x8000000080000000ull;
1479    b &= ~0x8000000080000000ull;
1480    return (a - b) ^ mask;
1481}
1482
1483uint64_t HELPER(neon_addl_saturate_s32)(uint64_t a, uint64_t b)
1484{
1485    uint32_t x, y;
1486    uint32_t low, high;
1487
1488    x = a;
1489    y = b;
1490    low = x + y;
1491    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
1492        SET_QC();
1493        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
1494    }
1495    x = a >> 32;
1496    y = b >> 32;
1497    high = x + y;
1498    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
1499        SET_QC();
1500        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
1501    }
1502    return low | ((uint64_t)high << 32);
1503}
1504
1505uint64_t HELPER(neon_addl_saturate_s64)(uint64_t a, uint64_t b)
1506{
1507    uint64_t result;
1508
1509    result = a + b;
1510    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
1511        SET_QC();
1512        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
1513    }
1514    return result;
1515}
1516
/* Absolute difference of x and y.  Both operands are first converted
 * to 'intype' and then widened to 'arithtype': the arithmetic must be
 * done in a type larger than the input type because, for example, the
 * absolute difference of two signed 32 bit values can overflow a signed
 * 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do { \
    arithtype abd_lhs = (intype)(x);               \
    arithtype abd_rhs = (intype)(y);               \
    if (abd_lhs > abd_rhs) {                       \
        dest = abd_lhs - abd_rhs;                  \
    } else {                                       \
        dest = abd_rhs - abd_lhs;                  \
    }                                              \
    } while(0)
1526
1527uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
1528{
1529    uint64_t tmp;
1530    uint64_t result;
1531    DO_ABD(result, a, b, uint8_t, uint32_t);
1532    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
1533    result |= tmp << 16;
1534    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
1535    result |= tmp << 32;
1536    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
1537    result |= tmp << 48;
1538    return result;
1539}
1540
1541uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
1542{
1543    uint64_t tmp;
1544    uint64_t result;
1545    DO_ABD(result, a, b, int8_t, int32_t);
1546    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
1547    result |= tmp << 16;
1548    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
1549    result |= tmp << 32;
1550    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
1551    result |= tmp << 48;
1552    return result;
1553}
1554
1555uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
1556{
1557    uint64_t tmp;
1558    uint64_t result;
1559    DO_ABD(result, a, b, uint16_t, uint32_t);
1560    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1561    return result | (tmp << 32);
1562}
1563
1564uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
1565{
1566    uint64_t tmp;
1567    uint64_t result;
1568    DO_ABD(result, a, b, int16_t, int32_t);
1569    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
1570    return result | (tmp << 32);
1571}
1572
1573uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
1574{
1575    uint64_t result;
1576    DO_ABD(result, a, b, uint32_t, uint64_t);
1577    return result;
1578}
1579
1580uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
1581{
1582    uint64_t result;
1583    DO_ABD(result, a, b, int32_t, int64_t);
1584    return result;
1585}
1586#undef DO_ABD
1587
/* Widening multiply. Named type is the source type.
 * x and y are first converted to type1 (the narrow element type); the
 * product is computed modulo the width of type2 and stored in dest.
 * The leading 1u forces the multiplication into unsigned arithmetic:
 * when type2 is narrower than int (uint16_t) the operands would
 * otherwise be promoted to signed int, and 0xffff * 0xffff overflows
 * int, which is undefined behaviour.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)(1u * (type2)tmp_x * (type2)tmp_y); \
    } while(0)
1594
1595uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1596{
1597    uint64_t tmp;
1598    uint64_t result;
1599
1600    DO_MULL(result, a, b, uint8_t, uint16_t);
1601    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1602    result |= tmp << 16;
1603    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1604    result |= tmp << 32;
1605    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1606    result |= tmp << 48;
1607    return result;
1608}
1609
1610uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1611{
1612    uint64_t tmp;
1613    uint64_t result;
1614
1615    DO_MULL(result, a, b, int8_t, uint16_t);
1616    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1617    result |= tmp << 16;
1618    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1619    result |= tmp << 32;
1620    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1621    result |= tmp << 48;
1622    return result;
1623}
1624
1625uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1626{
1627    uint64_t tmp;
1628    uint64_t result;
1629
1630    DO_MULL(result, a, b, uint16_t, uint32_t);
1631    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1632    return result | (tmp << 32);
1633}
1634
1635uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1636{
1637    uint64_t tmp;
1638    uint64_t result;
1639
1640    DO_MULL(result, a, b, int16_t, uint32_t);
1641    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1642    return result | (tmp << 32);
1643}
1644
1645uint64_t HELPER(neon_negl_u16)(uint64_t x)
1646{
1647    uint16_t tmp;
1648    uint64_t result;
1649    result = (uint16_t)-x;
1650    tmp = -(x >> 16);
1651    result |= (uint64_t)tmp << 16;
1652    tmp = -(x >> 32);
1653    result |= (uint64_t)tmp << 32;
1654    tmp = -(x >> 48);
1655    result |= (uint64_t)tmp << 48;
1656    return result;
1657}
1658
1659uint64_t HELPER(neon_negl_u32)(uint64_t x)
1660{
1661    uint32_t low = -x;
1662    uint32_t high = -(x >> 32);
1663    return low | ((uint64_t)high << 32);
1664}
1665
1666/* FIXME:  There should be a native op for this.  */
1667uint64_t HELPER(neon_negl_u64)(uint64_t x)
1668{
1669    return -x;
1670}
1671
/* Saturating sign manipulation.  */
1673/* ??? Make these use NEON_VOP1 */
1674#define DO_QABS8(x) do { \
1675    if (x == (int8_t)0x80) { \
1676        x = 0x7f; \
1677        SET_QC(); \
1678    } else if (x < 0) { \
1679        x = -x; \
1680    }} while (0)
1681uint32_t HELPER(neon_qabs_s8)(uint32_t x)
1682{
1683    neon_s8 vec;
1684    NEON_UNPACK(neon_s8, vec, x);
1685    DO_QABS8(vec.v1);
1686    DO_QABS8(vec.v2);
1687    DO_QABS8(vec.v3);
1688    DO_QABS8(vec.v4);
1689    NEON_PACK(neon_s8, x, vec);
1690    return x;
1691}
1692#undef DO_QABS8
1693
1694#define DO_QNEG8(x) do { \
1695    if (x == (int8_t)0x80) { \
1696        x = 0x7f; \
1697        SET_QC(); \
1698    } else { \
1699        x = -x; \
1700    }} while (0)
1701uint32_t HELPER(neon_qneg_s8)(uint32_t x)
1702{
1703    neon_s8 vec;
1704    NEON_UNPACK(neon_s8, vec, x);
1705    DO_QNEG8(vec.v1);
1706    DO_QNEG8(vec.v2);
1707    DO_QNEG8(vec.v3);
1708    DO_QNEG8(vec.v4);
1709    NEON_PACK(neon_s8, x, vec);
1710    return x;
1711}
1712#undef DO_QNEG8
1713
1714#define DO_QABS16(x) do { \
1715    if (x == (int16_t)0x8000) { \
1716        x = 0x7fff; \
1717        SET_QC(); \
1718    } else if (x < 0) { \
1719        x = -x; \
1720    }} while (0)
1721uint32_t HELPER(neon_qabs_s16)(uint32_t x)
1722{
1723    neon_s16 vec;
1724    NEON_UNPACK(neon_s16, vec, x);
1725    DO_QABS16(vec.v1);
1726    DO_QABS16(vec.v2);
1727    NEON_PACK(neon_s16, x, vec);
1728    return x;
1729}
1730#undef DO_QABS16
1731
1732#define DO_QNEG16(x) do { \
1733    if (x == (int16_t)0x8000) { \
1734        x = 0x7fff; \
1735        SET_QC(); \
1736    } else { \
1737        x = -x; \
1738    }} while (0)
1739uint32_t HELPER(neon_qneg_s16)(uint32_t x)
1740{
1741    neon_s16 vec;
1742    NEON_UNPACK(neon_s16, vec, x);
1743    DO_QNEG16(vec.v1);
1744    DO_QNEG16(vec.v2);
1745    NEON_PACK(neon_s16, x, vec);
1746    return x;
1747}
1748#undef DO_QNEG16
1749
1750uint32_t HELPER(neon_qabs_s32)(uint32_t x)
1751{
1752    if (x == SIGNBIT) {
1753        SET_QC();
1754        x = ~SIGNBIT;
1755    } else if ((int32_t)x < 0) {
1756        x = -x;
1757    }
1758    return x;
1759}
1760
1761uint32_t HELPER(neon_qneg_s32)(uint32_t x)
1762{
1763    if (x == SIGNBIT) {
1764        SET_QC();
1765        x = ~SIGNBIT;
1766    } else {
1767        x = -x;
1768    }
1769    return x;
1770}
1771
1772/* NEON Float helpers.  */
1773uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
1774{
1775    return float32_val(float32_min(make_float32(a), make_float32(b), NFS));
1776}
1777
1778uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
1779{
1780    return float32_val(float32_max(make_float32(a), make_float32(b), NFS));
1781}
1782
1783uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
1784{
1785    float32 f0 = make_float32(a);
1786    float32 f1 = make_float32(b);
1787    return float32_val(float32_abs(float32_sub(f0, f1, NFS)));
1788}
1789
1790uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
1791{
1792    return float32_val(float32_add(make_float32(a), make_float32(b), NFS));
1793}
1794
1795uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
1796{
1797    return float32_val(float32_sub(make_float32(a), make_float32(b), NFS));
1798}
1799
1800uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
1801{
1802    return float32_val(float32_mul(make_float32(a), make_float32(b), NFS));
1803}
1804
1805/* Floating point comparisons produce an integer result.
1806 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1807 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1808 */
1809uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b)
1810{
1811    return -float32_eq_quiet(make_float32(a), make_float32(b), NFS);
1812}
1813
1814uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b)
1815{
1816    return -float32_le(make_float32(b), make_float32(a), NFS);
1817}
1818
1819uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b)
1820{
1821    return -float32_lt(make_float32(b), make_float32(a), NFS);
1822}
1823
1824uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
1825{
1826    float32 f0 = float32_abs(make_float32(a));
1827    float32 f1 = float32_abs(make_float32(b));
1828    return -float32_le(f1, f0, NFS);
1829}
1830
1831uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
1832{
1833    float32 f0 = float32_abs(make_float32(a));
1834    float32 f1 = float32_abs(make_float32(b));
1835    return -float32_lt(f1, f0, NFS);
1836}
1837
/* Extract element N, counting from the least significant end, of SIZE
 * bits from the integer V.  */
#define ELEM(V, N, SIZE) \
    (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1839
1840void HELPER(neon_qunzip8)(uint32_t rd, uint32_t rm)
1841{
1842    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
1843    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
1844    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
1845    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
1846    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
1847        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
1848        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
1849        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
1850    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
1851        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
1852        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1853        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
1854    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
1855        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
1856        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
1857        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
1858    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
1859        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
1860        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
1861        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1862    env->vfp.regs[rm] = make_float64(m0);
1863    env->vfp.regs[rm + 1] = make_float64(m1);
1864    env->vfp.regs[rd] = make_float64(d0);
1865    env->vfp.regs[rd + 1] = make_float64(d1);
1866}
1867
1868void HELPER(neon_qunzip16)(uint32_t rd, uint32_t rm)
1869{
1870    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
1871    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
1872    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
1873    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
1874    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
1875        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
1876    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
1877        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
1878    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
1879        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
1880    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
1881        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1882    env->vfp.regs[rm] = make_float64(m0);
1883    env->vfp.regs[rm + 1] = make_float64(m1);
1884    env->vfp.regs[rd] = make_float64(d0);
1885    env->vfp.regs[rd + 1] = make_float64(d1);
1886}
1887
1888void HELPER(neon_qunzip32)(uint32_t rd, uint32_t rm)
1889{
1890    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
1891    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
1892    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
1893    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
1894    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1895    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1896    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1897    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1898    env->vfp.regs[rm] = make_float64(m0);
1899    env->vfp.regs[rm + 1] = make_float64(m1);
1900    env->vfp.regs[rd] = make_float64(d0);
1901    env->vfp.regs[rd + 1] = make_float64(d1);
1902}
1903
1904void HELPER(neon_unzip8)(uint32_t rd, uint32_t rm)
1905{
1906    uint64_t zm = float64_val(env->vfp.regs[rm]);
1907    uint64_t zd = float64_val(env->vfp.regs[rd]);
1908    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
1909        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
1910        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1911        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
1912    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
1913        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
1914        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
1915        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1916    env->vfp.regs[rm] = make_float64(m0);
1917    env->vfp.regs[rd] = make_float64(d0);
1918}
1919
1920void HELPER(neon_unzip16)(uint32_t rd, uint32_t rm)
1921{
1922    uint64_t zm = float64_val(env->vfp.regs[rm]);
1923    uint64_t zd = float64_val(env->vfp.regs[rd]);
1924    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1925        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1926    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1927        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1928    env->vfp.regs[rm] = make_float64(m0);
1929    env->vfp.regs[rd] = make_float64(d0);
1930}
1931
1932void HELPER(neon_qzip8)(uint32_t rd, uint32_t rm)
1933{
1934    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
1935    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
1936    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
1937    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
1938    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
1939        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
1940        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
1941        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
1942    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
1943        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
1944        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
1945        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
1946    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
1947        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
1948        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1949        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
1950    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
1951        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
1952        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
1953        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1954    env->vfp.regs[rm] = make_float64(m0);
1955    env->vfp.regs[rm + 1] = make_float64(m1);
1956    env->vfp.regs[rd] = make_float64(d0);
1957    env->vfp.regs[rd + 1] = make_float64(d1);
1958}
1959
1960void HELPER(neon_qzip16)(uint32_t rd, uint32_t rm)
1961{
1962    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
1963    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
1964    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
1965    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
1966    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
1967        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
1968    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
1969        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
1970    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
1971        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
1972    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
1973        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1974    env->vfp.regs[rm] = make_float64(m0);
1975    env->vfp.regs[rm + 1] = make_float64(m1);
1976    env->vfp.regs[rd] = make_float64(d0);
1977    env->vfp.regs[rd + 1] = make_float64(d1);
1978}
1979
1980void HELPER(neon_qzip32)(uint32_t rd, uint32_t rm)
1981{
1982    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
1983    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
1984    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
1985    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
1986    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1987    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1988    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1989    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1990    env->vfp.regs[rm] = make_float64(m0);
1991    env->vfp.regs[rm + 1] = make_float64(m1);
1992    env->vfp.regs[rd] = make_float64(d0);
1993    env->vfp.regs[rd + 1] = make_float64(d1);
1994}
1995
1996void HELPER(neon_zip8)(uint32_t rd, uint32_t rm)
1997{
1998    uint64_t zm = float64_val(env->vfp.regs[rm]);
1999    uint64_t zd = float64_val(env->vfp.regs[rd]);
2000    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
2001        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
2002        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
2003        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
2004    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
2005        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
2006        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
2007        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
2008    env->vfp.regs[rm] = make_float64(m0);
2009    env->vfp.regs[rd] = make_float64(d0);
2010}
2011
2012void HELPER(neon_zip16)(uint32_t rd, uint32_t rm)
2013{
2014    uint64_t zm = float64_val(env->vfp.regs[rm]);
2015    uint64_t zd = float64_val(env->vfp.regs[rd]);
2016    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
2017        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
2018    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
2019        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
2020    env->vfp.regs[rm] = make_float64(m0);
2021    env->vfp.regs[rd] = make_float64(d0);
2022}
2023