/* -*- mode: C; c-basic-offset: 3; -*- */

/*--------------------------------------------------------------------*/
/*--- begin                                     guest_arm64_toIR.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2013-2017 OpenWorks
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.
*/
/* KNOWN LIMITATIONS 2014-Nov-16

   * Correctness: FMAXNM, FMINNM are implemented the same as FMAX/FMIN.

     Also FP comparison "unordered" .. is implemented as normal FP
     comparison.

     Both should be fixed.  They behave incorrectly in the presence of
     NaNs.

     FMULX is treated the same as FMUL.  That's also not correct.

   * Floating multiply-add (etc) insns are split into a multiply and
     an add, and so suffer double rounding and hence sometimes the
     least significant mantissa bit is incorrect.  Fix: use the IR
     multiply-add IROps instead.

   * FRINTA, FRINTN are kludged .. they just round to nearest.  No special
     handling for the "ties" case.  FRINTX might be dubious too.

   * Ditto FCVTXN.  No idea what "round to odd" means.  This implementation
     just rounds to nearest.
*/
56/* "Special" instructions.
57
58   This instruction decoder can decode four special instructions
59   which mean nothing natively (are no-ops as far as regs/mem are
60   concerned) but have meaning for supporting Valgrind.  A special
61   instruction is flagged by a 16-byte preamble:
62
63      93CC0D8C 93CC358C 93CCCD8C 93CCF58C
64      (ror x12, x12, #3;   ror x12, x12, #13
65       ror x12, x12, #51;  ror x12, x12, #61)
66
67   Following that, one of the following 3 are allowed
68   (standard interpretation in parentheses):
69
70      AA0A014A (orr x10,x10,x10)   X3 = client_request ( X4 )
71      AA0B016B (orr x11,x11,x11)   X3 = guest_NRADDR
72      AA0C018C (orr x12,x12,x12)   branch-and-link-to-noredir X8
73      AA090129 (orr x9,x9,x9)      IR injection
74
75   Any other bytes following the 16-byte preamble are illegal and
76   constitute a failure in instruction decoding.  This all assumes
77   that the preamble will never occur except in specific code
78   fragments designed for Valgrind to catch.
79*/
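
/* As an illustrative sketch (not the authoritative definition; see
   valgrind.h for that), client code typically plants the preamble and
   marker with inline assembly along these lines:

      __asm__ volatile(
         "ror x12, x12, #3  \n\t"   // 93CC0D8C
         "ror x12, x12, #13 \n\t"   // 93CC358C
         "ror x12, x12, #51 \n\t"   // 93CCCD8C
         "ror x12, x12, #61 \n\t"   // 93CCF58C
         "orr x10, x10, x10 \n\t"   // AA0A014A: X3 = client_request(X4)
         : /*out*/ : /*in*/ : "cc", "memory" );

   Note that the four rotate amounts sum to 128, a multiple of 64, so
   the preamble leaves x12 unchanged; that is why the whole sequence
   is a no-op when run natively. */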

/* Translates ARM64 code to IR. */

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"
#include "libvex_guest_arm64.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_arm64_defs.h"


/*------------------------------------------------------------*/
/*--- Globals                                              ---*/
/*------------------------------------------------------------*/

/* These are set at the start of the translation of an instruction, so
   that we don't have to pass them around endlessly.  CONST means does
   not change during translation of the instruction.
*/

/* CONST: what is the host's endianness?  We need to know this in
   order to do sub-register accesses to the SIMD/FP registers
   correctly. */
static VexEndness host_endness;

/* CONST: The guest address for the instruction currently being
   translated.  */
static Addr64 guest_PC_curr_instr;

/* MOD: The IRSB* into which we're generating code. */
static IRSB* irsb;

/*------------------------------------------------------------*/
/*--- Debugging output                                     ---*/
/*------------------------------------------------------------*/

#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)


/*------------------------------------------------------------*/
/*--- Helper bits and pieces for deconstructing the        ---*/
/*--- arm insn stream.                                     ---*/
/*------------------------------------------------------------*/

/* Do a little-endian load of a 32-bit word, regardless of the
   endianness of the underlying host. */
static inline UInt getUIntLittleEndianly ( const UChar* p )
{
   UInt w = 0;
   w = (w << 8) | p[3];
   w = (w << 8) | p[2];
   w = (w << 8) | p[1];
   w = (w << 8) | p[0];
   return w;
}

/* Sign extend an N-bit value up to 64 bits, by copying
   bit N-1 into all higher positions. */
static ULong sx_to_64 ( ULong x, UInt n )
{
   vassert(n > 1 && n < 64);
   x <<= (64-n);
   Long r = (Long)x;
   r >>= (64-n);
   return (ULong)r;
}
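
/* Worked example (sketch): sx_to_64(0x80, 8) first shifts bit 7 up to
   bit 63, then the arithmetic right shift copies it back down through
   all higher positions, giving 0xFFFFFFFFFFFFFF80.  By contrast,
   sx_to_64(0x7F, 8) is just 0x7F, since bit 7 is zero. */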

//ZZ /* Do a little-endian load of a 16-bit word, regardless of the
//ZZ    endianness of the underlying host. */
//ZZ static inline UShort getUShortLittleEndianly ( UChar* p )
//ZZ {
//ZZ    UShort w = 0;
//ZZ    w = (w << 8) | p[1];
//ZZ    w = (w << 8) | p[0];
//ZZ    return w;
//ZZ }
//ZZ
//ZZ static UInt ROR32 ( UInt x, UInt sh ) {
//ZZ    vassert(sh >= 0 && sh < 32);
//ZZ    if (sh == 0)
//ZZ       return x;
//ZZ    else
//ZZ       return (x << (32-sh)) | (x >> sh);
//ZZ }
//ZZ
//ZZ static Int popcount32 ( UInt x )
//ZZ {
//ZZ    Int res = 0, i;
//ZZ    for (i = 0; i < 32; i++) {
//ZZ       res += (x & 1);
//ZZ       x >>= 1;
//ZZ    }
//ZZ    return res;
//ZZ }
//ZZ
//ZZ static UInt setbit32 ( UInt x, Int ix, UInt b )
//ZZ {
//ZZ    UInt mask = 1 << ix;
//ZZ    x &= ~mask;
//ZZ    x |= ((b << ix) & mask);
//ZZ    return x;
//ZZ }

#define BITS2(_b1,_b0)  \
   (((_b1) << 1) | (_b0))

#define BITS3(_b2,_b1,_b0)  \
   (((_b2) << 2) | ((_b1) << 1) | (_b0))

#define BITS4(_b3,_b2,_b1,_b0)  \
   (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))

#define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4)  \
    | BITS4((_b3),(_b2),(_b1),(_b0)))

#define BITS5(_b4,_b3,_b2,_b1,_b0)  \
   (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
#define BITS6(_b5,_b4,_b3,_b2,_b1,_b0)  \
   (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
#define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))

#define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b8) << 8)  \
    | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))

#define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b9) << 9) | ((_b8) << 8)  \
    | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))

#define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b10) << 10)  \
    | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))

#define BITS12(_b11, _b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
   (((_b11) << 11)  \
    | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))

#define X00 BITS2(0,0)
#define X01 BITS2(0,1)
#define X10 BITS2(1,0)
#define X11 BITS2(1,1)

// produces _uint[_bMax:_bMin]
#define SLICE_UInt(_uint,_bMax,_bMin)  \
   (( ((UInt)(_uint)) >> (_bMin))  \
    & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))


/*------------------------------------------------------------*/
/*--- Helper bits and pieces for creating IR fragments.    ---*/
/*------------------------------------------------------------*/

static IRExpr* mkV128 ( UShort w )
{
   return IRExpr_Const(IRConst_V128(w));
}

static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

static IRExpr* mkU32 ( UInt i )
{
   return IRExpr_Const(IRConst_U32(i));
}

static IRExpr* mkU16 ( UInt i )
{
   vassert(i < 65536);
   return IRExpr_Const(IRConst_U16(i));
}

static IRExpr* mkU8 ( UInt i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}

static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}

/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}
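
/* A minimal sketch of how these helpers compose: to generate IR that
   stores the 32-bit constant 7 at [addr + 8], where 'addr' is assumed
   to be an existing Ity_I64 temp, one would write

      storeLE( binop(Iop_Add64, mkexpr(addr), mkU64(8)), mkU32(7) );

   which appends a single little-endian Store statement to 'irsb'. */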

//ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT )
//ZZ {
//ZZ    if (guardT == IRTemp_INVALID) {
//ZZ       /* unconditional */
//ZZ       storeLE(addr, data);
//ZZ    } else {
//ZZ       stmt( IRStmt_StoreG(Iend_LE, addr, data,
//ZZ                           binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
//ZZ    }
//ZZ }
//ZZ
//ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt,
//ZZ                             IRExpr* addr, IRExpr* alt,
//ZZ                             IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
//ZZ {
//ZZ    if (guardT == IRTemp_INVALID) {
//ZZ       /* unconditional */
//ZZ       IRExpr* loaded = NULL;
//ZZ       switch (cvt) {
//ZZ          case ILGop_Ident32:
//ZZ             loaded = loadLE(Ity_I32, addr); break;
//ZZ          case ILGop_8Uto32:
//ZZ             loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break;
//ZZ          case ILGop_8Sto32:
//ZZ             loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break;
//ZZ          case ILGop_16Uto32:
//ZZ             loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break;
//ZZ          case ILGop_16Sto32:
//ZZ             loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break;
//ZZ          default:
//ZZ             vassert(0);
//ZZ       }
//ZZ       vassert(loaded != NULL);
//ZZ       assign(dst, loaded);
//ZZ    } else {
//ZZ       /* Generate a guarded load into 'dst', but apply 'cvt' to the
//ZZ          loaded data before putting the data in 'dst'.  If the load
//ZZ          does not take place, 'alt' is placed directly in 'dst'. */
//ZZ       stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt,
//ZZ                          binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
//ZZ    }
//ZZ }

/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* This is used in many places, so the brevity is an advantage. */
static IRTemp newTempV128(void)
{
   return newTemp(Ity_V128);
}

/* Initialise V128 temporaries en masse. */
static
void newTempsV128_2(IRTemp* t1, IRTemp* t2)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
}

static
void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
   *t3 = newTempV128();
}

static
void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   vassert(t4 && *t4 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
   *t3 = newTempV128();
   *t4 = newTempV128();
}

static
void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
                    IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   vassert(t4 && *t4 == IRTemp_INVALID);
   vassert(t5 && *t5 == IRTemp_INVALID);
   vassert(t6 && *t6 == IRTemp_INVALID);
   vassert(t7 && *t7 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
   *t3 = newTempV128();
   *t4 = newTempV128();
   *t5 = newTempV128();
   *t6 = newTempV128();
   *t7 = newTempV128();
}

//ZZ /* Produces a value in 0 .. 3, which is encoded as per the type
//ZZ    IRRoundingMode. */
//ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
//ZZ {
//ZZ    return mkU32(Irrm_NEAREST);
//ZZ }
//ZZ
//ZZ /* Generate an expression for SRC rotated right by ROT. */
//ZZ static IRExpr* genROR32( IRTemp src, Int rot )
//ZZ {
//ZZ    vassert(rot >= 0 && rot < 32);
//ZZ    if (rot == 0)
//ZZ       return mkexpr(src);
//ZZ    return
//ZZ       binop(Iop_Or32,
//ZZ             binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
//ZZ             binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
//ZZ }
//ZZ
//ZZ static IRExpr* mkU128 ( ULong i )
//ZZ {
//ZZ    return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
//ZZ }
//ZZ
//ZZ /* Generate a 4-aligned version of the given expression if
//ZZ    the given condition is true.  Else return it unchanged. */
//ZZ static IRExpr* align4if ( IRExpr* e, Bool b )
//ZZ {
//ZZ    if (b)
//ZZ       return binop(Iop_And32, e, mkU32(~3));
//ZZ    else
//ZZ       return e;
//ZZ }

/* Other IR construction helpers. */
static IROp mkAND ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_And32;
      case Ity_I64: return Iop_And64;
      default: vpanic("mkAND");
   }
}

static IROp mkOR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Or32;
      case Ity_I64: return Iop_Or64;
      default: vpanic("mkOR");
   }
}

static IROp mkXOR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Xor32;
      case Ity_I64: return Iop_Xor64;
      default: vpanic("mkXOR");
   }
}

static IROp mkSHL ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Shl32;
      case Ity_I64: return Iop_Shl64;
      default: vpanic("mkSHL");
   }
}

static IROp mkSHR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Shr32;
      case Ity_I64: return Iop_Shr64;
      default: vpanic("mkSHR");
   }
}

static IROp mkSAR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Sar32;
      case Ity_I64: return Iop_Sar64;
      default: vpanic("mkSAR");
   }
}

static IROp mkNOT ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Not32;
      case Ity_I64: return Iop_Not64;
      default: vpanic("mkNOT");
   }
}

static IROp mkADD ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Add32;
      case Ity_I64: return Iop_Add64;
      default: vpanic("mkADD");
   }
}

static IROp mkSUB ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Sub32;
      case Ity_I64: return Iop_Sub64;
      default: vpanic("mkSUB");
   }
}

static IROp mkADDF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_AddF32;
      case Ity_F64: return Iop_AddF64;
      default: vpanic("mkADDF");
   }
}

static IROp mkSUBF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_SubF32;
      case Ity_F64: return Iop_SubF64;
      default: vpanic("mkSUBF");
   }
}

static IROp mkMULF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_MulF32;
      case Ity_F64: return Iop_MulF64;
      default: vpanic("mkMULF");
   }
}

static IROp mkDIVF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_DivF32;
      case Ity_F64: return Iop_DivF64;
      default: vpanic("mkDIVF");
   }
}

static IROp mkNEGF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_NegF32;
      case Ity_F64: return Iop_NegF64;
      default: vpanic("mkNEGF");
   }
}

static IROp mkABSF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_AbsF32;
      case Ity_F64: return Iop_AbsF64;
      default: vpanic("mkABSF");
   }
}

static IROp mkSQRTF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_SqrtF32;
      case Ity_F64: return Iop_SqrtF64;
      default: vpanic("mkSQRTF");
   }
}

static IROp mkVecADD ( UInt size ) {
   const IROp ops[4]
      = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDEXTSUSATUU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
          Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDEXTUSSATSS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
          Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSUB ( UInt size ) {
   const IROp ops[4]
      = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSUBU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSUBS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSARN ( UInt size ) {
   const IROp ops[4]
      = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHRN ( UInt size ) {
   const IROp ops[4]
      = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHLN ( UInt size ) {
   const IROp ops[4]
      = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCATEVENLANES ( UInt size ) {
   const IROp ops[4]
      = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
          Iop_CatEvenLanes32x4, Iop_InterleaveLO64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCATODDLANES ( UInt size ) {
   const IROp ops[4]
      = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8,
          Iop_CatOddLanes32x4, Iop_InterleaveHI64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecINTERLEAVELO ( UInt size ) {
   const IROp ops[4]
      = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
          Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecINTERLEAVEHI ( UInt size ) {
   const IROp ops[4]
      = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
          Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMAXU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMAXS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMINU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMINS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMUL ( UInt size ) {
   const IROp ops[4]
      = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID };
   vassert(size < 3);
   return ops[size];
}

static IROp mkVecMULLU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 3);
   return ops[sizeNarrow];
}

static IROp mkVecMULLS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 3);
   return ops[sizeNarrow];
}

static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 3);
   return ops[sizeNarrow];
}

static IROp mkVecCMPEQ ( UInt size ) {
   const IROp ops[4]
      = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCMPGTU ( UInt size ) {
   const IROp ops[4]
      = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCMPGTS ( UInt size ) {
   const IROp ops[4]
      = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecABS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecZEROHIxxOFV128 ( UInt size ) {
   const IROp ops[4]
      = { Iop_ZeroHI120ofV128, Iop_ZeroHI112ofV128,
          Iop_ZeroHI96ofV128,  Iop_ZeroHI64ofV128 };
   vassert(size < 4);
   return ops[size];
}

static IRExpr* mkU ( IRType ty, ULong imm ) {
   switch (ty) {
      case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
      case Ity_I64: return mkU64(imm);
      default: vpanic("mkU");
   }
}

static IROp mkVecQDMULHIS ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQRDMULHIS ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDUQSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandUQsh8x16, Iop_QandUQsh16x8,
          Iop_QandUQsh32x4, Iop_QandUQsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDSQSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandSQsh8x16, Iop_QandSQsh16x8,
          Iop_QandSQsh32x4, Iop_QandSQsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDUQRSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
          Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDSQRSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
          Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecRSHU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecRSHS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecNARROWUN ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4,
          Iop_NarrowUn64to32x2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQNARROWUNSU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QNarrowUn16Sto8Ux8,  Iop_QNarrowUn32Sto16Ux4,
          Iop_QNarrowUn64Sto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQNARROWUNSS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QNarrowUn16Sto8Sx8,  Iop_QNarrowUn32Sto16Sx4,
          Iop_QNarrowUn64Sto32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQNARROWUNUU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QNarrowUn16Uto8Ux8,  Iop_QNarrowUn32Uto16Ux4,
          Iop_QNarrowUn64Uto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqshrNNARROWUU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQShrNnarrow16Uto8Ux8, Iop_QandQShrNnarrow32Uto16Ux4,
          Iop_QandQShrNnarrow64Uto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqsarNNARROWSS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQSarNnarrow16Sto8Sx8,  Iop_QandQSarNnarrow32Sto16Sx4,
          Iop_QandQSarNnarrow64Sto32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqsarNNARROWSU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQSarNnarrow16Sto8Ux8,  Iop_QandQSarNnarrow32Sto16Ux4,
          Iop_QandQSarNnarrow64Sto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQRShrNnarrow16Uto8Ux8,  Iop_QandQRShrNnarrow32Uto16Ux4,
          Iop_QandQRShrNnarrow64Uto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQRSarNnarrow16Sto8Sx8,  Iop_QandQRSarNnarrow32Sto16Sx4,
          Iop_QandQRSarNnarrow64Sto32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQRSarNnarrow16Sto8Ux8,  Iop_QandQRSarNnarrow32Sto16Ux4,
          Iop_QandQRSarNnarrow64Sto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQSHLNSATUU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QShlNsatUU8x16, Iop_QShlNsatUU16x8,
          Iop_QShlNsatUU32x4, Iop_QShlNsatUU64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSHLNSATSS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QShlNsatSS8x16, Iop_QShlNsatSS16x8,
          Iop_QShlNsatSS32x4, Iop_QShlNsatSS64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSHLNSATSU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QShlNsatSU8x16, Iop_QShlNsatSU16x8,
          Iop_QShlNsatSU32x4, Iop_QShlNsatSU64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecADDF ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_INVALID, Iop_Add32Fx4, Iop_Add64Fx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMAXF ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_INVALID, Iop_Max32Fx4, Iop_Max64Fx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMINF ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_INVALID, Iop_Min32Fx4, Iop_Min64Fx2 };
   vassert(size < 4);
   return ops[size];
}

/* Generate IR to create 'arg rotated right by imm', for sane values
   of 'ty' and 'imm'. */
static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
{
   UInt w = 0;
   if (ty == Ity_I64) {
      w = 64;
   } else {
      vassert(ty == Ity_I32);
      w = 32;
   }
   vassert(w != 0);
   vassert(imm < w);
   if (imm == 0) {
      return arg;
   }
   IRTemp res = newTemp(ty);
   assign(res, binop(mkOR(ty),
                     binop(mkSHL(ty), mkexpr(arg), mkU8(w - imm)),
                     binop(mkSHR(ty), mkexpr(arg), mkU8(imm)) ));
   return res;
}
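
/* Worked example (sketch): mathROR(Ity_I32, t, 8) generates
   (t << 24) | (t >>u 8), so the value 0x11223344 becomes
   0x44112233. */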

/* Generate IR to set the returned temp to either all-zeroes or
   all ones, as a copy of arg<imm>. */
static IRTemp mathREPLICATE ( IRType ty, IRTemp arg, UInt imm )
{
   UInt w = 0;
   if (ty == Ity_I64) {
      w = 64;
   } else {
      vassert(ty == Ity_I32);
      w = 32;
   }
   vassert(w != 0);
   vassert(imm < w);
   IRTemp res = newTemp(ty);
   assign(res, binop(mkSAR(ty),
                     binop(mkSHL(ty), mkexpr(arg), mkU8(w - 1 - imm)),
                     mkU8(w - 1)));
   return res;
}
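
/* Worked example (sketch): mathREPLICATE(Ity_I32, t, 5) generates
   (t << 26) >>s 31: the left shift moves bit 5 up to bit 31, and the
   arithmetic right shift then smears it across the whole word,
   yielding either 0x00000000 or 0xFFFFFFFF. */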

/* U-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenUto64 ( IRType srcTy, IRExpr* e )
{
   switch (srcTy) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_32Uto64, e);
      case Ity_I16: return unop(Iop_16Uto64, e);
      case Ity_I8:  return unop(Iop_8Uto64, e);
      default: vpanic("widenUto64(arm64)");
   }
}

/* Narrow 64 bit int expr to 8/16/32/64.  Clearly only some
   of these combinations make sense. */
static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
{
   switch (dstTy) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_64to32, e);
      case Ity_I16: return unop(Iop_64to16, e);
      case Ity_I8:  return unop(Iop_64to8, e);
      default: vpanic("narrowFrom64(arm64)");
   }
}


/*------------------------------------------------------------*/
/*--- Helpers for accessing guest registers.               ---*/
/*------------------------------------------------------------*/

#define OFFB_X0       offsetof(VexGuestARM64State,guest_X0)
#define OFFB_X1       offsetof(VexGuestARM64State,guest_X1)
#define OFFB_X2       offsetof(VexGuestARM64State,guest_X2)
#define OFFB_X3       offsetof(VexGuestARM64State,guest_X3)
#define OFFB_X4       offsetof(VexGuestARM64State,guest_X4)
#define OFFB_X5       offsetof(VexGuestARM64State,guest_X5)
#define OFFB_X6       offsetof(VexGuestARM64State,guest_X6)
#define OFFB_X7       offsetof(VexGuestARM64State,guest_X7)
#define OFFB_X8       offsetof(VexGuestARM64State,guest_X8)
#define OFFB_X9       offsetof(VexGuestARM64State,guest_X9)
#define OFFB_X10      offsetof(VexGuestARM64State,guest_X10)
#define OFFB_X11      offsetof(VexGuestARM64State,guest_X11)
#define OFFB_X12      offsetof(VexGuestARM64State,guest_X12)
#define OFFB_X13      offsetof(VexGuestARM64State,guest_X13)
#define OFFB_X14      offsetof(VexGuestARM64State,guest_X14)
#define OFFB_X15      offsetof(VexGuestARM64State,guest_X15)
#define OFFB_X16      offsetof(VexGuestARM64State,guest_X16)
#define OFFB_X17      offsetof(VexGuestARM64State,guest_X17)
#define OFFB_X18      offsetof(VexGuestARM64State,guest_X18)
#define OFFB_X19      offsetof(VexGuestARM64State,guest_X19)
#define OFFB_X20      offsetof(VexGuestARM64State,guest_X20)
#define OFFB_X21      offsetof(VexGuestARM64State,guest_X21)
#define OFFB_X22      offsetof(VexGuestARM64State,guest_X22)
#define OFFB_X23      offsetof(VexGuestARM64State,guest_X23)
#define OFFB_X24      offsetof(VexGuestARM64State,guest_X24)
#define OFFB_X25      offsetof(VexGuestARM64State,guest_X25)
#define OFFB_X26      offsetof(VexGuestARM64State,guest_X26)
#define OFFB_X27      offsetof(VexGuestARM64State,guest_X27)
#define OFFB_X28      offsetof(VexGuestARM64State,guest_X28)
#define OFFB_X29      offsetof(VexGuestARM64State,guest_X29)
#define OFFB_X30      offsetof(VexGuestARM64State,guest_X30)

#define OFFB_XSP      offsetof(VexGuestARM64State,guest_XSP)
#define OFFB_PC       offsetof(VexGuestARM64State,guest_PC)

#define OFFB_CC_OP    offsetof(VexGuestARM64State,guest_CC_OP)
#define OFFB_CC_DEP1  offsetof(VexGuestARM64State,guest_CC_DEP1)
#define OFFB_CC_DEP2  offsetof(VexGuestARM64State,guest_CC_DEP2)
#define OFFB_CC_NDEP  offsetof(VexGuestARM64State,guest_CC_NDEP)

#define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0)
#define OFFB_NRADDR   offsetof(VexGuestARM64State,guest_NRADDR)

#define OFFB_Q0       offsetof(VexGuestARM64State,guest_Q0)
#define OFFB_Q1       offsetof(VexGuestARM64State,guest_Q1)
#define OFFB_Q2       offsetof(VexGuestARM64State,guest_Q2)
#define OFFB_Q3       offsetof(VexGuestARM64State,guest_Q3)
#define OFFB_Q4       offsetof(VexGuestARM64State,guest_Q4)
#define OFFB_Q5       offsetof(VexGuestARM64State,guest_Q5)
#define OFFB_Q6       offsetof(VexGuestARM64State,guest_Q6)
#define OFFB_Q7       offsetof(VexGuestARM64State,guest_Q7)
#define OFFB_Q8       offsetof(VexGuestARM64State,guest_Q8)
#define OFFB_Q9       offsetof(VexGuestARM64State,guest_Q9)
#define OFFB_Q10      offsetof(VexGuestARM64State,guest_Q10)
#define OFFB_Q11      offsetof(VexGuestARM64State,guest_Q11)
#define OFFB_Q12      offsetof(VexGuestARM64State,guest_Q12)
#define OFFB_Q13      offsetof(VexGuestARM64State,guest_Q13)
#define OFFB_Q14      offsetof(VexGuestARM64State,guest_Q14)
#define OFFB_Q15      offsetof(VexGuestARM64State,guest_Q15)
#define OFFB_Q16      offsetof(VexGuestARM64State,guest_Q16)
#define OFFB_Q17      offsetof(VexGuestARM64State,guest_Q17)
#define OFFB_Q18      offsetof(VexGuestARM64State,guest_Q18)
#define OFFB_Q19      offsetof(VexGuestARM64State,guest_Q19)
#define OFFB_Q20      offsetof(VexGuestARM64State,guest_Q20)
#define OFFB_Q21      offsetof(VexGuestARM64State,guest_Q21)
#define OFFB_Q22      offsetof(VexGuestARM64State,guest_Q22)
#define OFFB_Q23      offsetof(VexGuestARM64State,guest_Q23)
#define OFFB_Q24      offsetof(VexGuestARM64State,guest_Q24)
#define OFFB_Q25      offsetof(VexGuestARM64State,guest_Q25)
#define OFFB_Q26      offsetof(VexGuestARM64State,guest_Q26)
#define OFFB_Q27      offsetof(VexGuestARM64State,guest_Q27)
#define OFFB_Q28      offsetof(VexGuestARM64State,guest_Q28)
#define OFFB_Q29      offsetof(VexGuestARM64State,guest_Q29)
#define OFFB_Q30      offsetof(VexGuestARM64State,guest_Q30)
#define OFFB_Q31      offsetof(VexGuestARM64State,guest_Q31)

#define OFFB_FPCR     offsetof(VexGuestARM64State,guest_FPCR)
#define OFFB_QCFLAG   offsetof(VexGuestARM64State,guest_QCFLAG)

#define OFFB_CMSTART  offsetof(VexGuestARM64State,guest_CMSTART)
#define OFFB_CMLEN    offsetof(VexGuestARM64State,guest_CMLEN)

#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
#define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA)


/* ---------------- Integer registers ---------------- */

static Int offsetIReg64 ( UInt iregNo )
{
   /* Do we care about endianness here?  We do if sub-parts of integer
      registers are accessed. */
   switch (iregNo) {
      case 0:  return OFFB_X0;
      case 1:  return OFFB_X1;
      case 2:  return OFFB_X2;
      case 3:  return OFFB_X3;
      case 4:  return OFFB_X4;
      case 5:  return OFFB_X5;
      case 6:  return OFFB_X6;
      case 7:  return OFFB_X7;
      case 8:  return OFFB_X8;
      case 9:  return OFFB_X9;
      case 10: return OFFB_X10;
      case 11: return OFFB_X11;
      case 12: return OFFB_X12;
      case 13: return OFFB_X13;
      case 14: return OFFB_X14;
      case 15: return OFFB_X15;
      case 16: return OFFB_X16;
      case 17: return OFFB_X17;
      case 18: return OFFB_X18;
      case 19: return OFFB_X19;
      case 20: return OFFB_X20;
      case 21: return OFFB_X21;
      case 22: return OFFB_X22;
      case 23: return OFFB_X23;
      case 24: return OFFB_X24;
      case 25: return OFFB_X25;
      case 26: return OFFB_X26;
      case 27: return OFFB_X27;
      case 28: return OFFB_X28;
      case 29: return OFFB_X29;
      case 30: return OFFB_X30;
      /* but not 31 */
      default: vassert(0);
   }
}

static Int offsetIReg64orSP ( UInt iregNo )
{
   return iregNo == 31  ? OFFB_XSP  : offsetIReg64(iregNo);
}

static const HChar* nameIReg64orZR ( UInt iregNo )
{
   vassert(iregNo < 32);
   static const HChar* names[32]
      = { "x0",  "x1",  "x2",  "x3",  "x4",  "x5",  "x6",  "x7",
          "x8",  "x9",  "x10", "x11", "x12", "x13", "x14", "x15",
          "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
          "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" };
   return names[iregNo];
}

static const HChar* nameIReg64orSP ( UInt iregNo )
{
   if (iregNo == 31) {
      return "sp";
   }
   vassert(iregNo < 31);
   return nameIReg64orZR(iregNo);
}

static IRExpr* getIReg64orSP ( UInt iregNo )
{
   vassert(iregNo < 32);
   return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
}

static IRExpr* getIReg64orZR ( UInt iregNo )
{
   if (iregNo == 31) {
      return mkU64(0);
   }
   vassert(iregNo < 31);
   return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
}

static void putIReg64orSP ( UInt iregNo, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
}

static void putIReg64orZR ( UInt iregNo, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   if (iregNo == 31) {
      return;
   }
   vassert(iregNo < 31);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
}
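
/* A note by way of example: for an instruction whose register field
   is 31, getIReg64orZR(31) folds straight to the constant 0 and
   putIReg64orZR(31, e) discards 'e' altogether, whereas the *orSP
   variants read and write guest_XSP for the very same encoding.  The
   decoder must therefore pick the right variant per instruction,
   since AArch64 treats register number 31 as either XZR or SP
   depending on the instruction. */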

static const HChar* nameIReg32orZR ( UInt iregNo )
{
   vassert(iregNo < 32);
   static const HChar* names[32]
      = { "w0",  "w1",  "w2",  "w3",  "w4",  "w5",  "w6",  "w7",
          "w8",  "w9",  "w10", "w11", "w12", "w13", "w14", "w15",
          "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
          "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" };
   return names[iregNo];
}

static const HChar* nameIReg32orSP ( UInt iregNo )
{
   if (iregNo == 31) {
      return "wsp";
   }
   vassert(iregNo < 31);
   return nameIReg32orZR(iregNo);
}

static IRExpr* getIReg32orSP ( UInt iregNo )
{
   vassert(iregNo < 32);
   return unop(Iop_64to32,
               IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
}

static IRExpr* getIReg32orZR ( UInt iregNo )
{
   if (iregNo == 31) {
      return mkU32(0);
   }
   vassert(iregNo < 31);
   return unop(Iop_64to32,
               IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
}

static void putIReg32orSP ( UInt iregNo, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
}

static void putIReg32orZR ( UInt iregNo, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   if (iregNo == 31) {
      return;
   }
   vassert(iregNo < 31);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
}

static const HChar* nameIRegOrSP ( Bool is64, UInt iregNo )
{
   vassert(is64 == True || is64 == False);
   return is64 ? nameIReg64orSP(iregNo) : nameIReg32orSP(iregNo);
}

static const HChar* nameIRegOrZR ( Bool is64, UInt iregNo )
{
   vassert(is64 == True || is64 == False);
   return is64 ? nameIReg64orZR(iregNo) : nameIReg32orZR(iregNo);
}

static IRExpr* getIRegOrZR ( Bool is64, UInt iregNo )
{
   vassert(is64 == True || is64 == False);
   return is64 ? getIReg64orZR(iregNo) : getIReg32orZR(iregNo);
}

static void putIRegOrZR ( Bool is64, UInt iregNo, IRExpr* e )
{
   vassert(is64 == True || is64 == False);
   if (is64) putIReg64orZR(iregNo, e); else putIReg32orZR(iregNo, e);
}

static void putPC ( IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put(OFFB_PC, e) );
}


/* ---------------- Vector (Q) registers ---------------- */

static Int offsetQReg128 ( UInt qregNo )
{
   /* We don't care about endianness at this point.  It only becomes
      relevant when dealing with sections of these registers. */
   switch (qregNo) {
      case 0:  return OFFB_Q0;
      case 1:  return OFFB_Q1;
      case 2:  return OFFB_Q2;
      case 3:  return OFFB_Q3;
      case 4:  return OFFB_Q4;
      case 5:  return OFFB_Q5;
      case 6:  return OFFB_Q6;
      case 7:  return OFFB_Q7;
      case 8:  return OFFB_Q8;
      case 9:  return OFFB_Q9;
      case 10: return OFFB_Q10;
      case 11: return OFFB_Q11;
      case 12: return OFFB_Q12;
      case 13: return OFFB_Q13;
      case 14: return OFFB_Q14;
      case 15: return OFFB_Q15;
      case 16: return OFFB_Q16;
      case 17: return OFFB_Q17;
      case 18: return OFFB_Q18;
      case 19: return OFFB_Q19;
      case 20: return OFFB_Q20;
      case 21: return OFFB_Q21;
      case 22: return OFFB_Q22;
      case 23: return OFFB_Q23;
      case 24: return OFFB_Q24;
      case 25: return OFFB_Q25;
      case 26: return OFFB_Q26;
      case 27: return OFFB_Q27;
      case 28: return OFFB_Q28;
      case 29: return OFFB_Q29;
      case 30: return OFFB_Q30;
      case 31: return OFFB_Q31;
      default: vassert(0);
   }
}

/* Write to a complete Qreg. */
static void putQReg128 ( UInt qregNo, IRExpr* e )
{
   vassert(qregNo < 32);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
   stmt( IRStmt_Put(offsetQReg128(qregNo), e) );
}

/* Read a complete Qreg. */
static IRExpr* getQReg128 ( UInt qregNo )
{
   vassert(qregNo < 32);
   return IRExpr_Get(offsetQReg128(qregNo), Ity_V128);
}

/* Produce the IR type for some sub-part of a vector.  For 32- and 64-
   bit sub-parts we can choose either integer or float types, and
   choose float on the basis that that is the common use case and so
   will give least interference with Put-to-Get forwarding later
   on. */
static IRType preferredVectorSubTypeFromSize ( UInt szB )
{
   switch (szB) {
      case 1:  return Ity_I8;
      case 2:  return Ity_I16;
      case 4:  return Ity_I32; //Ity_F32;
      case 8:  return Ity_F64;
      case 16: return Ity_V128;
      default: vassert(0);
   }
}

/* Find the offset of the laneNo'th lane of type laneTy in the given
   Qreg.  Since the host is little-endian, the least significant lane
   has the lowest offset. */
static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
{
   vassert(host_endness == VexEndnessLE);
   Int base = offsetQReg128(qregNo);
   /* Since the host is little-endian, the least significant lane
      will be at the lowest address. */
   /* Restrict this to known types, so as to avoid silently accepting
      stupid types. */
   UInt laneSzB = 0;
   switch (laneTy) {
      case Ity_I8:                 laneSzB = 1;  break;
      case Ity_F16: case Ity_I16:  laneSzB = 2;  break;
      case Ity_F32: case Ity_I32:  laneSzB = 4;  break;
      case Ity_F64: case Ity_I64:  laneSzB = 8;  break;
      case Ity_V128:               laneSzB = 16; break;
      default: break;
   }
   vassert(laneSzB > 0);
   UInt minOff = laneNo * laneSzB;
   UInt maxOff = minOff + laneSzB - 1;
   vassert(maxOff < 16);
   return base + minOff;
}
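
/* Worked example (sketch): offsetQRegLane(7, Ity_I16, 3) is
   offsetQReg128(7) + 3*2, i.e. 6 bytes above the base of q7, and
   offsetQRegLane(q, Ity_I64, 1) names the most significant 8 bytes
   of q, which is exactly what offsetQRegHI64 below computes. */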

/* Put to the least significant lane of a Qreg. */
static void putQRegLO ( UInt qregNo, IRExpr* e )
{
   IRType ty  = typeOfIRExpr(irsb->tyenv, e);
   Int    off = offsetQRegLane(qregNo, ty, 0);
   switch (ty) {
      case Ity_I8:  case Ity_I16: case Ity_I32: case Ity_I64:
      case Ity_F16: case Ity_F32: case Ity_F64: case Ity_V128:
         break;
      default:
         vassert(0); // Other cases are probably invalid
   }
   stmt(IRStmt_Put(off, e));
}

/* Get from the least significant lane of a Qreg. */
static IRExpr* getQRegLO ( UInt qregNo, IRType ty )
{
   Int off = offsetQRegLane(qregNo, ty, 0);
   switch (ty) {
      case Ity_I8:
      case Ity_F16: case Ity_I16:
      case Ity_I32: case Ity_I64:
      case Ity_F32: case Ity_F64: case Ity_V128:
         break;
      default:
         vassert(0); // Other cases are ATC
   }
   return IRExpr_Get(off, ty);
}

static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy )
{
   static const HChar* namesQ[32]
      = { "q0",  "q1",  "q2",  "q3",  "q4",  "q5",  "q6",  "q7",
          "q8",  "q9",  "q10", "q11", "q12", "q13", "q14", "q15",
          "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23",
          "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" };
   static const HChar* namesD[32]
      = { "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
          "d8",  "d9",  "d10", "d11", "d12", "d13", "d14", "d15",
          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
          "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
   static const HChar* namesS[32]
      = { "s0",  "s1",  "s2",  "s3",  "s4",  "s5",  "s6",  "s7",
          "s8",  "s9",  "s10", "s11", "s12", "s13", "s14", "s15",
          "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
          "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" };
   static const HChar* namesH[32]
      = { "h0",  "h1",  "h2",  "h3",  "h4",  "h5",  "h6",  "h7",
          "h8",  "h9",  "h10", "h11", "h12", "h13", "h14", "h15",
          "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23",
          "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" };
   static const HChar* namesB[32]
      = { "b0",  "b1",  "b2",  "b3",  "b4",  "b5",  "b6",  "b7",
          "b8",  "b9",  "b10", "b11", "b12", "b13", "b14", "b15",
          "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23",
          "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" };
   vassert(qregNo < 32);
   switch (sizeofIRType(laneTy)) {
      case 1:  return namesB[qregNo];
      case 2:  return namesH[qregNo];
      case 4:  return namesS[qregNo];
      case 8:  return namesD[qregNo];
      case 16: return namesQ[qregNo];
      default: vassert(0);
   }
   /*NOTREACHED*/
}

static const HChar* nameQReg128 ( UInt qregNo )
{
   return nameQRegLO(qregNo, Ity_V128);
}

/* Find the offset of the most significant half (8 bytes) of the given
   Qreg.  This requires knowing the endianness of the host. */
static Int offsetQRegHI64 ( UInt qregNo )
{
   return offsetQRegLane(qregNo, Ity_I64, 1);
}

static IRExpr* getQRegHI64 ( UInt qregNo )
{
   return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64);
}

static void putQRegHI64 ( UInt qregNo, IRExpr* e )
{
   IRType ty  = typeOfIRExpr(irsb->tyenv, e);
   Int    off = offsetQRegHI64(qregNo);
   switch (ty) {
      case Ity_I64: case Ity_F64:
         break;
      default:
         vassert(0); // Other cases are plain wrong
   }
   stmt(IRStmt_Put(off, e));
}

/* Put to a specified lane of a Qreg. */
static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e )
{
   IRType laneTy  = typeOfIRExpr(irsb->tyenv, e);
   Int    off     = offsetQRegLane(qregNo, laneTy, laneNo);
   switch (laneTy) {
      case Ity_F64: case Ity_I64:
      case Ity_I32: case Ity_F32:
      case Ity_I16: case Ity_F16:
      case Ity_I8:
         break;
      default:
         vassert(0); // Other cases are ATC
   }
   stmt(IRStmt_Put(off, e));
}

/* Get from a specified lane of a Qreg. */
static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy )
{
   Int off = offsetQRegLane(qregNo, laneTy, laneNo);
   switch (laneTy) {
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
      case Ity_F64: case Ity_F32: case Ity_F16:
         break;
      default:
         vassert(0); // Other cases are ATC
   }
   return IRExpr_Get(off, laneTy);
}


//ZZ /* ---------------- Misc registers ---------------- */
//ZZ
//ZZ static void putMiscReg32 ( UInt    gsoffset,
//ZZ                            IRExpr* e, /* :: Ity_I32 */
//ZZ                            IRTemp  guardT /* :: Ity_I32, 0 or 1 */)
//ZZ {
//ZZ    switch (gsoffset) {
//ZZ       case OFFB_FPSCR:   break;
//ZZ       case OFFB_QFLAG32: break;
//ZZ       case OFFB_GEFLAG0: break;
//ZZ       case OFFB_GEFLAG1: break;
//ZZ       case OFFB_GEFLAG2: break;
//ZZ       case OFFB_GEFLAG3: break;
//ZZ       default: vassert(0); /* awaiting more cases */
//ZZ    }
//ZZ    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
//ZZ
//ZZ    if (guardT == IRTemp_INVALID) {
//ZZ       /* unconditional write */
//ZZ       stmt(IRStmt_Put(gsoffset, e));
//ZZ    } else {
//ZZ       stmt(IRStmt_Put(
//ZZ          gsoffset,
//ZZ          IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)),
//ZZ                      e, IRExpr_Get(gsoffset, Ity_I32) )
//ZZ       ));
//ZZ    }
//ZZ }
//ZZ
//ZZ static IRTemp get_ITSTATE ( void )
//ZZ {
//ZZ    ASSERT_IS_THUMB;
//ZZ    IRTemp t = newTemp(Ity_I32);
//ZZ    assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
//ZZ    return t;
//ZZ }
//ZZ
//ZZ static void put_ITSTATE ( IRTemp t )
//ZZ {
//ZZ    ASSERT_IS_THUMB;
//ZZ    stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
//ZZ }
//ZZ
//ZZ static IRTemp get_QFLAG32 ( void )
//ZZ {
//ZZ    IRTemp t = newTemp(Ity_I32);
//ZZ    assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
//ZZ    return t;
//ZZ }
//ZZ
//ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
//ZZ {
//ZZ    putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
//ZZ }
//ZZ
//ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
//ZZ    Status Register) to indicate that overflow or saturation occurred.
//ZZ    Nb: t must be zero to denote no saturation, and any nonzero
//ZZ    value to indicate saturation. */
//ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
//ZZ {
//ZZ    IRTemp old = get_QFLAG32();
//ZZ    IRTemp nyu = newTemp(Ity_I32);
//ZZ    assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
//ZZ    put_QFLAG32(nyu, condT);
//ZZ }


/* ---------------- FPCR stuff ---------------- */

/* Generate IR to get hold of the rounding mode bits in FPCR, and
   convert them to IR format.  Bind the final result to the
   returned temp. */
static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
{
   /* The ARMvfp encoding for rounding mode bits is:
         00  to nearest
         01  to +infinity
         10  to -infinity
         11  to zero
      We need to convert that to the IR encoding:
         00  to nearest (the default)
         10  to +infinity
         01  to -infinity
         11  to zero
      Which can be done by swapping bits 0 and 1.
      The rmode bits are at 23:22 in FPCR.
   */
   IRTemp armEncd = newTemp(Ity_I32);
   IRTemp swapped = newTemp(Ity_I32);
   /* Fish FPCR[23:22] out, and slide to bottom.  Doesn't matter that
      we don't zero out bits 24 and above, since the assignment to
      'swapped' will mask them out anyway. */
   assign(armEncd,
          binop(Iop_Shr32, IRExpr_Get(OFFB_FPCR, Ity_I32), mkU8(22)));
   /* Now swap them. */
   assign(swapped,
          binop(Iop_Or32,
                binop(Iop_And32,
                      binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)),
                      mkU32(2)),
                binop(Iop_And32,
                      binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)),
                      mkU32(1))
         ));
   return swapped;
}
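
/* Concretely, the swap maps the four FPCR.RMode values like so:
      FPCR 00 (nearest)    -> IR 00
      FPCR 01 (+infinity)  -> IR 10
      FPCR 10 (-infinity)  -> IR 01
      FPCR 11 (zero)       -> IR 11
   e.g. for armEncd == 01b: (01b << 1) & 2 == 10b and
   (01b >> 1) & 1 == 0, so swapped == 10b, as required. */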
1679
1680
1681/*------------------------------------------------------------*/
1682/*--- Helpers for flag handling and conditional insns      ---*/
1683/*------------------------------------------------------------*/
1684
1685static const HChar* nameARM64Condcode ( ARM64Condcode cond )
1686{
1687   switch (cond) {
1688      case ARM64CondEQ:  return "eq";
1689      case ARM64CondNE:  return "ne";
1690      case ARM64CondCS:  return "cs";  // or 'hs'
1691      case ARM64CondCC:  return "cc";  // or 'lo'
1692      case ARM64CondMI:  return "mi";
1693      case ARM64CondPL:  return "pl";
1694      case ARM64CondVS:  return "vs";
1695      case ARM64CondVC:  return "vc";
1696      case ARM64CondHI:  return "hi";
1697      case ARM64CondLS:  return "ls";
1698      case ARM64CondGE:  return "ge";
1699      case ARM64CondLT:  return "lt";
1700      case ARM64CondGT:  return "gt";
1701      case ARM64CondLE:  return "le";
1702      case ARM64CondAL:  return "al";
1703      case ARM64CondNV:  return "nv";
1704      default: vpanic("name_ARM64Condcode");
1705   }
1706}
1707
1708/* and a handy shorthand for it */
1709static const HChar* nameCC ( ARM64Condcode cond ) {
1710   return nameARM64Condcode(cond);
1711}
1712
1713
1714/* Build IR to calculate some particular condition from stored
1715   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression of type
1716   Ity_I64, suitable for narrowing.  Although the return type is
1717   Ity_I64, the returned value is either 0 or 1.  'cond' must be
1718   :: Ity_I64 and must denote the condition to compute in
1719   bits 7:4, and be zero everywhere else.
1720*/
1721static IRExpr* mk_arm64g_calculate_condition_dyn ( IRExpr* cond )
1722{
1723   vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I64);
1724   /* And 'cond' had better produce a value in which only bits 7:4 are
1725      nonzero.  However, obviously we can't assert for that. */
1726
1727   /* So what we're constructing for the first argument is
1728      "(cond << 4) | stored-operation".
1729      However, as per comments above, 'cond' must be supplied
1730      pre-shifted to this function.
1731
1732      This pairing scheme requires that the ARM64_CC_OP_ values all fit
1733      in 4 bits.  Hence we are passing a (COND, OP) pair in the lowest
1734      8 bits of the first argument. */
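   /* Concretely: to ask for condition NE (encoding 1) while the
      thunk holds, say, ARM64G_CC_OP_SUB64, the caller supplies
      cond = 0x10 and the helper sees (1 << 4) | ARM64G_CC_OP_SUB64
      in the low 8 bits of its first argument. */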
1735   IRExpr** args
1736      = mkIRExprVec_4(
1737           binop(Iop_Or64, IRExpr_Get(OFFB_CC_OP, Ity_I64), cond),
1738           IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1739           IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1740           IRExpr_Get(OFFB_CC_NDEP, Ity_I64)
1741        );
1742   IRExpr* call
1743      = mkIRExprCCall(
1744           Ity_I64,
1745           0/*regparm*/,
1746           "arm64g_calculate_condition", &arm64g_calculate_condition,
1747           args
1748        );
1749
1750   /* Exclude the requested condition, OP and NDEP from definedness
1751      checking.  We're only interested in DEP1 and DEP2. */
1752   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1753   return call;
1754}
1755
1756
1757/* Build IR to calculate some particular condition from stored
1758   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression of type
1759   Ity_I64, suitable for narrowing.  Although the return type is
1760   Ity_I64, the returned value is either 0 or 1.
1761*/
1762static IRExpr* mk_arm64g_calculate_condition ( ARM64Condcode cond )
1763{
   /* First arg is "(cond << 4) | stored-operation".  This requires
      that the ARM64_CC_OP_ values all fit in 4 bits.  Hence we are
      passing a (COND, OP) pair in the lowest 8 bits of the first
      argument. */
1767   vassert(cond >= 0 && cond <= 15);
1768   return mk_arm64g_calculate_condition_dyn( mkU64(cond << 4) );
1769}
1770
1771
1772/* Build IR to calculate just the carry flag from stored
1773   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
1774   Ity_I64. */
1775static IRExpr* mk_arm64g_calculate_flag_c ( void )
1776{
1777   IRExpr** args
1778      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
1779                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1780                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1781                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1782   IRExpr* call
1783      = mkIRExprCCall(
1784           Ity_I64,
1785           0/*regparm*/,
1786           "arm64g_calculate_flag_c", &arm64g_calculate_flag_c,
1787           args
1788        );
1789   /* Exclude OP and NDEP from definedness checking.  We're only
1790      interested in DEP1 and DEP2. */
1791   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1792   return call;
1793}
1794
1795
1796//ZZ /* Build IR to calculate just the overflow flag from stored
1797//ZZ    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
1798//ZZ    Ity_I32. */
1799//ZZ static IRExpr* mk_armg_calculate_flag_v ( void )
1800//ZZ {
1801//ZZ    IRExpr** args
1802//ZZ       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
1803//ZZ                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
1804//ZZ                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
1805//ZZ                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
1806//ZZ    IRExpr* call
1807//ZZ       = mkIRExprCCall(
1808//ZZ            Ity_I32,
1809//ZZ            0/*regparm*/,
1810//ZZ            "armg_calculate_flag_v", &armg_calculate_flag_v,
1811//ZZ            args
1812//ZZ         );
1813//ZZ    /* Exclude OP and NDEP from definedness checking.  We're only
1814//ZZ       interested in DEP1 and DEP2. */
1815//ZZ    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1816//ZZ    return call;
1817//ZZ }
1818
1819
1820/* Build IR to calculate N Z C V in bits 31:28 of the
1821   returned word. */
1822static IRExpr* mk_arm64g_calculate_flags_nzcv ( void )
1823{
1824   IRExpr** args
1825      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
1826                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1827                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1828                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1829   IRExpr* call
1830      = mkIRExprCCall(
1831           Ity_I64,
1832           0/*regparm*/,
1833           "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv,
1834           args
1835        );
1836   /* Exclude OP and NDEP from definedness checking.  We're only
1837      interested in DEP1 and DEP2. */
1838   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1839   return call;
1840}
1841
1842
1843/* Build IR to set the flags thunk, in the most general case. */
1844static
1845void setFlags_D1_D2_ND ( UInt cc_op,
1846                         IRTemp t_dep1, IRTemp t_dep2, IRTemp t_ndep )
1847{
   vassert(typeOfIRTemp(irsb->tyenv, t_dep1) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, t_dep2) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, t_ndep) == Ity_I64);
1851   vassert(cc_op >= ARM64G_CC_OP_COPY && cc_op < ARM64G_CC_OP_NUMBER);
1852   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(cc_op) ));
1853   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) ));
1854   stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) ));
1855   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) ));
1856}
1857
1858/* Build IR to set the flags thunk after ADD or SUB. */
1859static
1860void setFlags_ADD_SUB ( Bool is64, Bool isSUB, IRTemp argL, IRTemp argR )
1861{
1862   IRTemp argL64 = IRTemp_INVALID;
1863   IRTemp argR64 = IRTemp_INVALID;
1864   IRTemp z64    = newTemp(Ity_I64);
1865   if (is64) {
1866      argL64 = argL;
1867      argR64 = argR;
1868   } else {
1869      argL64 = newTemp(Ity_I64);
1870      argR64 = newTemp(Ity_I64);
1871      assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1872      assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1873   }
1874   assign(z64, mkU64(0));
1875   UInt cc_op = ARM64G_CC_OP_NUMBER;
1876   /**/ if ( isSUB &&  is64) { cc_op = ARM64G_CC_OP_SUB64; }
1877   else if ( isSUB && !is64) { cc_op = ARM64G_CC_OP_SUB32; }
1878   else if (!isSUB &&  is64) { cc_op = ARM64G_CC_OP_ADD64; }
1879   else if (!isSUB && !is64) { cc_op = ARM64G_CC_OP_ADD32; }
1880   else                      { vassert(0); }
1881   setFlags_D1_D2_ND(cc_op, argL64, argR64, z64);
1882}
1883
1884/* Build IR to set the flags thunk after ADC or SBC. */
1885static
1886void setFlags_ADC_SBC ( Bool is64, Bool isSBC,
1887                        IRTemp argL, IRTemp argR, IRTemp oldC )
1888{
1889   IRTemp argL64 = IRTemp_INVALID;
1890   IRTemp argR64 = IRTemp_INVALID;
1891   IRTemp oldC64 = IRTemp_INVALID;
1892   if (is64) {
1893      argL64 = argL;
1894      argR64 = argR;
1895      oldC64 = oldC;
1896   } else {
1897      argL64 = newTemp(Ity_I64);
1898      argR64 = newTemp(Ity_I64);
1899      oldC64 = newTemp(Ity_I64);
1900      assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1901      assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1902      assign(oldC64, unop(Iop_32Uto64, mkexpr(oldC)));
1903   }
1904   UInt cc_op = ARM64G_CC_OP_NUMBER;
1905   /**/ if ( isSBC &&  is64) { cc_op = ARM64G_CC_OP_SBC64; }
1906   else if ( isSBC && !is64) { cc_op = ARM64G_CC_OP_SBC32; }
1907   else if (!isSBC &&  is64) { cc_op = ARM64G_CC_OP_ADC64; }
1908   else if (!isSBC && !is64) { cc_op = ARM64G_CC_OP_ADC32; }
1909   else                      { vassert(0); }
1910   setFlags_D1_D2_ND(cc_op, argL64, argR64, oldC64);
1911}
1912
1913/* Build IR to set the flags thunk after ADD or SUB, if the given
1914   condition evaluates to True at run time.  If not, the flags are set
1915   to the specified NZCV value. */
1916static
1917void setFlags_ADD_SUB_conditionally (
1918        Bool is64, Bool isSUB,
1919        IRTemp cond, IRTemp argL, IRTemp argR, UInt nzcv
1920     )
1921{
1922   /* Generate IR as follows:
1923        CC_OP   = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY)
1924        CC_DEP1 = ITE(cond, argL64, nzcv << 28)
1925        CC_DEP2 = ITE(cond, argR64, 0)
1926        CC_NDEP = 0
1927   */
1928
1929   IRTemp z64 = newTemp(Ity_I64);
1930   assign(z64, mkU64(0));
1931
1932   /* Establish the operation and operands for the True case. */
1933   IRTemp t_dep1 = IRTemp_INVALID;
1934   IRTemp t_dep2 = IRTemp_INVALID;
1935   UInt   t_op   = ARM64G_CC_OP_NUMBER;
1936   /**/ if ( isSUB &&  is64) { t_op = ARM64G_CC_OP_SUB64; }
1937   else if ( isSUB && !is64) { t_op = ARM64G_CC_OP_SUB32; }
1938   else if (!isSUB &&  is64) { t_op = ARM64G_CC_OP_ADD64; }
1939   else if (!isSUB && !is64) { t_op = ARM64G_CC_OP_ADD32; }
1940   else                      { vassert(0); }
1941   /* */
1942   if (is64) {
1943      t_dep1 = argL;
1944      t_dep2 = argR;
1945   } else {
1946      t_dep1 = newTemp(Ity_I64);
1947      t_dep2 = newTemp(Ity_I64);
1948      assign(t_dep1, unop(Iop_32Uto64, mkexpr(argL)));
1949      assign(t_dep2, unop(Iop_32Uto64, mkexpr(argR)));
1950   }
1951
1952   /* Establish the operation and operands for the False case. */
1953   IRTemp f_dep1 = newTemp(Ity_I64);
1954   IRTemp f_dep2 = z64;
1955   UInt   f_op   = ARM64G_CC_OP_COPY;
1956   assign(f_dep1, mkU64(nzcv << 28));
1957
1958   /* Final thunk values */
1959   IRTemp dep1 = newTemp(Ity_I64);
1960   IRTemp dep2 = newTemp(Ity_I64);
1961   IRTemp op   = newTemp(Ity_I64);
1962
1963   assign(op,   IRExpr_ITE(mkexpr(cond), mkU64(t_op), mkU64(f_op)));
1964   assign(dep1, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep1), mkexpr(f_dep1)));
1965   assign(dep2, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep2), mkexpr(f_dep2)));
1966
1967   /* finally .. */
1968   stmt( IRStmt_Put( OFFB_CC_OP,   mkexpr(op) ));
1969   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(dep1) ));
1970   stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(dep2) ));
1971   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(z64) ));
1972}
1973
1974/* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */
1975static
1976void setFlags_LOGIC ( Bool is64, IRTemp res )
1977{
1978   IRTemp res64 = IRTemp_INVALID;
1979   IRTemp z64   = newTemp(Ity_I64);
1980   UInt   cc_op = ARM64G_CC_OP_NUMBER;
1981   if (is64) {
1982      res64 = res;
1983      cc_op = ARM64G_CC_OP_LOGIC64;
1984   } else {
1985      res64 = newTemp(Ity_I64);
1986      assign(res64, unop(Iop_32Uto64, mkexpr(res)));
1987      cc_op = ARM64G_CC_OP_LOGIC32;
1988   }
1989   assign(z64, mkU64(0));
1990   setFlags_D1_D2_ND(cc_op, res64, z64, z64);
1991}
1992
1993/* Build IR to set the flags thunk to a given NZCV value.  NZCV is
1994   located in bits 31:28 of the supplied value. */
1995static
1996void setFlags_COPY ( IRTemp nzcv_28x0 )
1997{
1998   IRTemp z64 = newTemp(Ity_I64);
1999   assign(z64, mkU64(0));
2000   setFlags_D1_D2_ND(ARM64G_CC_OP_COPY, nzcv_28x0, z64, z64);
2001}
2002
2003
2004//ZZ /* Minor variant of the above that sets NDEP to zero (if it
2005//ZZ    sets it at all) */
2006//ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
2007//ZZ                              IRTemp t_dep2,
2008//ZZ                              IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2009//ZZ {
2010//ZZ    IRTemp z32 = newTemp(Ity_I32);
2011//ZZ    assign( z32, mkU32(0) );
2012//ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
2013//ZZ }
2014//ZZ
2015//ZZ
2016//ZZ /* Minor variant of the above that sets DEP2 to zero (if it
2017//ZZ    sets it at all) */
2018//ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
2019//ZZ                              IRTemp t_ndep,
2020//ZZ                              IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2021//ZZ {
2022//ZZ    IRTemp z32 = newTemp(Ity_I32);
2023//ZZ    assign( z32, mkU32(0) );
2024//ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
2025//ZZ }
2026//ZZ
2027//ZZ
2028//ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it
2029//ZZ    sets them at all) */
2030//ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
2031//ZZ                           IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2032//ZZ {
2033//ZZ    IRTemp z32 = newTemp(Ity_I32);
2034//ZZ    assign( z32, mkU32(0) );
2035//ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
2036//ZZ }
2037
2038
2039/*------------------------------------------------------------*/
2040/*--- Misc math helpers                                    ---*/
2041/*------------------------------------------------------------*/
2042
2043/* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */
2044static IRTemp math_SWAPHELPER ( IRTemp x, ULong mask, Int sh )
2045{
2046   IRTemp maskT = newTemp(Ity_I64);
2047   IRTemp res   = newTemp(Ity_I64);
2048   vassert(sh >= 1 && sh <= 63);
2049   assign(maskT, mkU64(mask));
2050   assign( res,
2051           binop(Iop_Or64,
2052                 binop(Iop_Shr64,
2053                       binop(Iop_And64,mkexpr(x),mkexpr(maskT)),
2054                       mkU8(sh)),
2055                 binop(Iop_And64,
2056                       binop(Iop_Shl64,mkexpr(x),mkU8(sh)),
2057                       mkexpr(maskT))
2058                 )
2059           );
2060   return res;
2061}
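
/* For instance, math_SWAPHELPER(x, 0xFF00FF00FF00FF00ULL, 8) computes
   ((x & 0xFF00FF00FF00FF00) >>u 8) | ((x << 8) & 0xFF00FF00FF00FF00),
   which swaps each adjacent pair of bytes in x.  The swap generators
   below compose such steps at increasing widths. */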
2062
2063/* Generates byte swaps within 32-bit lanes. */
2064static IRTemp math_UINTSWAP64 ( IRTemp src )
2065{
2066   IRTemp res;
2067   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2068   res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2069   return res;
2070}
2071
2072/* Generates byte swaps within 16-bit lanes. */
2073static IRTemp math_USHORTSWAP64 ( IRTemp src )
2074{
2075   IRTemp res;
2076   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2077   return res;
2078}
2079
2080/* Generates a 64-bit byte swap. */
2081static IRTemp math_BYTESWAP64 ( IRTemp src )
2082{
2083   IRTemp res;
2084   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2085   res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2086   res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32);
2087   return res;
2088}
2089
/* Generates a full 64-bit bit reversal. */
2091static IRTemp math_BITSWAP64 ( IRTemp src )
2092{
2093   IRTemp res;
2094   res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1);
2095   res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2);
2096   res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4);
2097   return math_BYTESWAP64(res);
2098}
2099
2100/* Duplicates the bits at the bottom of the given word to fill the
2101   whole word.  src :: Ity_I64 is assumed to have zeroes everywhere
2102   except for the bottom bits. */
2103static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy )
2104{
2105   if (srcTy == Ity_I8) {
2106      IRTemp t16 = newTemp(Ity_I64);
2107      assign(t16, binop(Iop_Or64, mkexpr(src),
2108                                  binop(Iop_Shl64, mkexpr(src), mkU8(8))));
2109      IRTemp t32 = newTemp(Ity_I64);
2110      assign(t32, binop(Iop_Or64, mkexpr(t16),
2111                                  binop(Iop_Shl64, mkexpr(t16), mkU8(16))));
2112      IRTemp t64 = newTemp(Ity_I64);
2113      assign(t64, binop(Iop_Or64, mkexpr(t32),
2114                                  binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2115      return t64;
2116   }
2117   if (srcTy == Ity_I16) {
2118      IRTemp t32 = newTemp(Ity_I64);
2119      assign(t32, binop(Iop_Or64, mkexpr(src),
2120                                  binop(Iop_Shl64, mkexpr(src), mkU8(16))));
2121      IRTemp t64 = newTemp(Ity_I64);
2122      assign(t64, binop(Iop_Or64, mkexpr(t32),
2123                                  binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2124      return t64;
2125   }
2126   if (srcTy == Ity_I32) {
2127      IRTemp t64 = newTemp(Ity_I64);
2128      assign(t64, binop(Iop_Or64, mkexpr(src),
2129                                  binop(Iop_Shl64, mkexpr(src), mkU8(32))));
2130      return t64;
2131   }
2132   if (srcTy == Ity_I64) {
2133      return src;
2134   }
2135   vassert(0);
2136}
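
/* E.g. for an Ity_I8 source holding 0xAB, the Or/Shl ladder above
   produces 0xABAB, then 0xABABABAB, then 0xABABABABABABABAB. */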
2137
2138
2139/* Duplicates the src element exactly so as to fill a V128 value. */
2140static IRTemp math_DUP_TO_V128 ( IRTemp src, IRType srcTy )
2141{
2142   IRTemp res = newTempV128();
2143   if (srcTy == Ity_F64) {
2144      IRTemp i64 = newTemp(Ity_I64);
2145      assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(src)));
2146      assign(res, binop(Iop_64HLtoV128, mkexpr(i64), mkexpr(i64)));
2147      return res;
2148   }
2149   if (srcTy == Ity_F32) {
2150      IRTemp i64a = newTemp(Ity_I64);
2151      assign(i64a, unop(Iop_32Uto64, unop(Iop_ReinterpF32asI32, mkexpr(src))));
2152      IRTemp i64b = newTemp(Ity_I64);
2153      assign(i64b, binop(Iop_Or64, binop(Iop_Shl64, mkexpr(i64a), mkU8(32)),
2154                                   mkexpr(i64a)));
2155      assign(res, binop(Iop_64HLtoV128, mkexpr(i64b), mkexpr(i64b)));
2156      return res;
2157   }
2158   if (srcTy == Ity_I64) {
2159      assign(res, binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)));
2160      return res;
2161   }
2162   if (srcTy == Ity_I32 || srcTy == Ity_I16 || srcTy == Ity_I8) {
2163      IRTemp t1 = newTemp(Ity_I64);
2164      assign(t1, widenUto64(srcTy, mkexpr(src)));
2165      IRTemp t2 = math_DUP_TO_64(t1, srcTy);
2166      assign(res, binop(Iop_64HLtoV128, mkexpr(t2), mkexpr(t2)));
2167      return res;
2168   }
2169   vassert(0);
2170}
2171
2172
/* |fullWidth| is a full V128 width result.  Return it unchanged if
   bitQ is 1; if bitQ is 0, zero out the upper 64 bits. */
2175static IRExpr* math_MAYBE_ZERO_HI64 ( UInt bitQ, IRTemp fullWidth )
2176{
2177   if (bitQ == 1) return mkexpr(fullWidth);
2178   if (bitQ == 0) return unop(Iop_ZeroHI64ofV128, mkexpr(fullWidth));
2179   vassert(0);
2180}
2181
2182/* The same, but from an expression instead. */
2183static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
2184{
2185   IRTemp fullWidthT = newTempV128();
2186   assign(fullWidthT, fullWidth);
2187   return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
2188}
2189
2190
2191/*------------------------------------------------------------*/
2192/*--- FP comparison helpers                                ---*/
2193/*------------------------------------------------------------*/
2194
2195/* irRes :: Ity_I32 holds a floating point comparison result encoded
2196   as an IRCmpF64Result.  Generate code to convert it to an
2197   ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value.
2198   Assign a new temp to hold that value, and return the temp. */
2199static
2200IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 )
2201{
2202   IRTemp ix       = newTemp(Ity_I64);
2203   IRTemp termL    = newTemp(Ity_I64);
2204   IRTemp termR    = newTemp(Ity_I64);
2205   IRTemp nzcv     = newTemp(Ity_I64);
2206   IRTemp irRes    = newTemp(Ity_I64);
2207
2208   /* This is where the fun starts.  We have to convert 'irRes' from
2209      an IR-convention return result (IRCmpF64Result) to an
2210      ARM-encoded (N,Z,C,V) group.  The final result is in the bottom
2211      4 bits of 'nzcv'. */
2212   /* Map compare result from IR to ARM(nzcv) */
2213   /*
2214      FP cmp result | IR   | ARM(nzcv)
2215      --------------------------------
2216      UN              0x45   0011
2217      LT              0x01   1000
2218      GT              0x00   0010
2219      EQ              0x40   0110
2220   */
2221   /* Now since you're probably wondering WTF ..
2222
2223      ix fishes the useful bits out of the IR value, bits 6 and 0, and
2224      places them side by side, giving a number which is 0, 1, 2 or 3.
2225
2226      termL is a sequence cooked up by GNU superopt.  It converts ix
2227         into an almost correct value NZCV value (incredibly), except
2228         for the case of UN, where it produces 0100 instead of the
2229         required 0011.
2230
2231      termR is therefore a correction term, also computed from ix.  It
         is 1 in the UN case and 0 for LT, GT and EQ.  Hence, to get
2233         the final correct value, we subtract termR from termL.
2234
2235      Don't take my word for it.  There's a test program at the bottom
2236      of guest_arm_toIR.c, to try this out with.
2237   */
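   /* Checking one case by hand: for UN, irRes is 0x45, so
      ix = ((0x45 >> 5) & 3) | (0x45 & 1) = 3.  Then
      termL = ((((3 ^ 1) << 62) - 1) >>u 61) + 1 = 4 (binary 0100),
      termR = (3 & (3 >> 1)) & 1 = 1, and nzcv = 4 - 1 = 3 (binary
      0011), as the table requires. */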
2238   assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32)));
2239
2240   assign(
2241      ix,
2242      binop(Iop_Or64,
2243            binop(Iop_And64,
2244                  binop(Iop_Shr64, mkexpr(irRes), mkU8(5)),
2245                  mkU64(3)),
2246            binop(Iop_And64, mkexpr(irRes), mkU64(1))));
2247
2248   assign(
2249      termL,
2250      binop(Iop_Add64,
2251            binop(Iop_Shr64,
2252                  binop(Iop_Sub64,
2253                        binop(Iop_Shl64,
2254                              binop(Iop_Xor64, mkexpr(ix), mkU64(1)),
2255                              mkU8(62)),
2256                        mkU64(1)),
2257                  mkU8(61)),
2258            mkU64(1)));
2259
2260   assign(
2261      termR,
2262      binop(Iop_And64,
2263            binop(Iop_And64,
2264                  mkexpr(ix),
2265                  binop(Iop_Shr64, mkexpr(ix), mkU8(1))),
2266            mkU64(1)));
2267
2268   assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR)));
2269   return nzcv;
2270}
2271
2272
2273/*------------------------------------------------------------*/
2274/*--- Data processing (immediate)                          ---*/
2275/*------------------------------------------------------------*/
2276
2277/* Helper functions for supporting "DecodeBitMasks" */
2278
2279static ULong dbm_ROR ( Int width, ULong x, Int rot )
2280{
2281   vassert(width > 0 && width <= 64);
2282   vassert(rot >= 0 && rot < width);
2283   if (rot == 0) return x;
2284   ULong res = x >> rot;
2285   res |= (x << (width - rot));
2286   if (width < 64)
2287     res &= ((1ULL << width) - 1);
2288   return res;
2289}
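
/* For example, dbm_ROR(8, 0x0F, 4) = 0xF0: within the 8-bit lane, the
   low nibble rotates out at the bottom and reappears at bits 7:4. */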
2290
static ULong dbm_RepTo64 ( Int esize, ULong x )
2292{
2293   switch (esize) {
2294      case 64:
2295         return x;
2296      case 32:
2297         x &= 0xFFFFFFFF; x |= (x << 32);
2298         return x;
2299      case 16:
2300         x &= 0xFFFF; x |= (x << 16); x |= (x << 32);
2301         return x;
2302      case 8:
2303         x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32);
2304         return x;
2305      case 4:
2306         x &= 0xF; x |= (x << 4); x |= (x << 8);
2307         x |= (x << 16); x |= (x << 32);
2308         return x;
2309      case 2:
2310         x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8);
2311         x |= (x << 16); x |= (x << 32);
2312         return x;
2313      default:
2314         break;
2315   }
2316   vpanic("dbm_RepTo64");
2317   /*NOTREACHED*/
2318   return 0;
2319}
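
/* E.g. dbm_RepTo64(16, 0x1234) = 0x1234123412341234 and
   dbm_RepTo64(8, 0xAB) = 0xABABABABABABABAB. */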
2320
2321static Int dbm_highestSetBit ( ULong x )
2322{
2323   Int i;
2324   for (i = 63; i >= 0; i--) {
2325      if (x & (1ULL << i))
2326         return i;
2327   }
2328   vassert(x == 0);
2329   return -1;
2330}
2331
2332static
2333Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask,
2334                          ULong immN, ULong imms, ULong immr, Bool immediate,
2335                          UInt M /*32 or 64*/)
2336{
2337   vassert(immN < (1ULL << 1));
2338   vassert(imms < (1ULL << 6));
2339   vassert(immr < (1ULL << 6));
2340   vassert(immediate == False || immediate == True);
2341   vassert(M == 32 || M == 64);
2342
2343   Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) );
2344   if (len < 1) { /* printf("fail1\n"); */ return False; }
2345   vassert(len <= 6);
2346   vassert(M >= (1 << len));
2347
2348   vassert(len >= 1 && len <= 6);
   ULong levels = // Zeroes(6 - len) : Ones(len)
                  (1 << len) - 1;
2351   vassert(levels >= 1 && levels <= 63);
2352
2353   if (immediate && ((imms & levels) == levels)) {
2354      /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */
2355      return False;
2356   }
2357
2358   ULong S = imms & levels;
2359   ULong R = immr & levels;
2360   Int   diff = S - R;
2361   diff &= 63;
2362   Int esize = 1 << len;
2363   vassert(2 <= esize && esize <= 64);
2364
2365   /* Be careful of these (1ULL << (S+1)) - 1 expressions, and the
2366      same below with d.  S can be 63 in which case we have an out of
2367      range and hence undefined shift. */
2368   vassert(S >= 0 && S <= 63);
2369   vassert(esize >= (S+1));
2370   ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1)
2371                  //(1ULL << (S+1)) - 1;
2372                  ((1ULL << S) - 1) + (1ULL << S);
2373
2374   Int d = // diff<len-1:0>
2375           diff & ((1 << len)-1);
2376   vassert(esize >= (d+1));
2377   vassert(d >= 0 && d <= 63);
2378
2379   ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1)
2380                  //(1ULL << (d+1)) - 1;
2381                  ((1ULL << d) - 1) + (1ULL << d);
2382
2383   if (esize != 64) vassert(elem_s < (1ULL << esize));
2384   if (esize != 64) vassert(elem_d < (1ULL << esize));
2385
2386   if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R));
2387   if (tmask) *tmask = dbm_RepTo64(esize, elem_d);
2388
2389   return True;
2390}
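
/* A hand-worked instance of the above: the 64-bit logical immediate
   #0xFF is encoded as N=1, imms=000111, immr=000000.  Then len = 6,
   so esize = 64, S = 7, R = 0, elem_s = 0xFF, and wmask = 0xFF.  A
   nonzero immr would rotate the 0xFF element within its lane before
   replication. */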
2391
2392
2393static
2394Bool dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult* dres,
2395                                         UInt insn)
2396{
2397#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
2398
2399   /* insn[28:23]
2400      10000x PC-rel addressing
2401      10001x Add/subtract (immediate)
2402      100100 Logical (immediate)
2403      100101 Move Wide (immediate)
2404      100110 Bitfield
2405      100111 Extract
2406   */
2407
2408   /* ------------------ ADD/SUB{,S} imm12 ------------------ */
2409   if (INSN(28,24) == BITS5(1,0,0,0,1)) {
2410      Bool is64   = INSN(31,31) == 1;
2411      Bool isSub  = INSN(30,30) == 1;
2412      Bool setCC  = INSN(29,29) == 1;
2413      UInt sh     = INSN(23,22);
2414      UInt uimm12 = INSN(21,10);
2415      UInt nn     = INSN(9,5);
2416      UInt dd     = INSN(4,0);
2417      const HChar* nm = isSub ? "sub" : "add";
2418      if (sh >= 2) {
2419         /* Invalid; fall through */
2420      } else {
2421         vassert(sh <= 1);
2422         uimm12 <<= (12 * sh);
2423         if (is64) {
2424            IRTemp argL  = newTemp(Ity_I64);
2425            IRTemp argR  = newTemp(Ity_I64);
2426            IRTemp res   = newTemp(Ity_I64);
2427            assign(argL, getIReg64orSP(nn));
2428            assign(argR, mkU64(uimm12));
2429            assign(res,  binop(isSub ? Iop_Sub64 : Iop_Add64,
2430                               mkexpr(argL), mkexpr(argR)));
2431            if (setCC) {
2432               putIReg64orZR(dd, mkexpr(res));
2433               setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
2434               DIP("%ss %s, %s, 0x%x\n",
2435                   nm, nameIReg64orZR(dd), nameIReg64orSP(nn), uimm12);
2436            } else {
2437               putIReg64orSP(dd, mkexpr(res));
2438               DIP("%s %s, %s, 0x%x\n",
2439                   nm, nameIReg64orSP(dd), nameIReg64orSP(nn), uimm12);
2440            }
2441         } else {
2442            IRTemp argL  = newTemp(Ity_I32);
2443            IRTemp argR  = newTemp(Ity_I32);
2444            IRTemp res   = newTemp(Ity_I32);
2445            assign(argL, getIReg32orSP(nn));
2446            assign(argR, mkU32(uimm12));
2447            assign(res,  binop(isSub ? Iop_Sub32 : Iop_Add32,
2448                               mkexpr(argL), mkexpr(argR)));
2449            if (setCC) {
2450               putIReg32orZR(dd, mkexpr(res));
2451               setFlags_ADD_SUB(False/*!is64*/, isSub, argL, argR);
2452               DIP("%ss %s, %s, 0x%x\n",
2453                   nm, nameIReg32orZR(dd), nameIReg32orSP(nn), uimm12);
2454            } else {
2455               putIReg32orSP(dd, mkexpr(res));
2456               DIP("%s %s, %s, 0x%x\n",
2457                   nm, nameIReg32orSP(dd), nameIReg32orSP(nn), uimm12);
2458            }
2459         }
2460         return True;
2461      }
2462   }
2463
2464   /* -------------------- ADR/ADRP -------------------- */
2465   if (INSN(28,24) == BITS5(1,0,0,0,0)) {
2466      UInt  bP    = INSN(31,31);
2467      UInt  immLo = INSN(30,29);
2468      UInt  immHi = INSN(23,5);
2469      UInt  rD    = INSN(4,0);
2470      ULong uimm  = (immHi << 2) | immLo;
2471      ULong simm  = sx_to_64(uimm, 21);
2472      ULong val;
2473      if (bP) {
2474         val = (guest_PC_curr_instr & 0xFFFFFFFFFFFFF000ULL) + (simm << 12);
2475      } else {
2476         val = guest_PC_curr_instr + simm;
2477      }
2478      putIReg64orZR(rD, mkU64(val));
2479      DIP("adr%s %s, 0x%llx\n", bP ? "p" : "", nameIReg64orZR(rD), val);
2480      return True;
2481   }
2482
2483   /* -------------------- LOGIC(imm) -------------------- */
2484   if (INSN(28,23) == BITS6(1,0,0,1,0,0)) {
2485      /* 31 30 28     22 21   15   9  4
2486         sf op 100100 N  immr imms Rn Rd
2487           op=00: AND  Rd|SP, Rn, #imm
2488           op=01: ORR  Rd|SP, Rn, #imm
2489           op=10: EOR  Rd|SP, Rn, #imm
2490           op=11: ANDS Rd|ZR, Rn, #imm
2491      */
2492      Bool  is64 = INSN(31,31) == 1;
2493      UInt  op   = INSN(30,29);
2494      UInt  N    = INSN(22,22);
2495      UInt  immR = INSN(21,16);
2496      UInt  immS = INSN(15,10);
2497      UInt  nn   = INSN(9,5);
2498      UInt  dd   = INSN(4,0);
2499      ULong imm  = 0;
2500      Bool  ok;
2501      if (N == 1 && !is64)
2502         goto after_logic_imm; /* not allowed; fall through */
2503      ok = dbm_DecodeBitMasks(&imm, NULL,
2504                              N, immS, immR, True, is64 ? 64 : 32);
2505      if (!ok)
2506         goto after_logic_imm;
2507
2508      const HChar* names[4] = { "and", "orr", "eor", "ands" };
2509      const IROp   ops64[4] = { Iop_And64, Iop_Or64, Iop_Xor64, Iop_And64 };
2510      const IROp   ops32[4] = { Iop_And32, Iop_Or32, Iop_Xor32, Iop_And32 };
2511
2512      vassert(op < 4);
2513      if (is64) {
2514         IRExpr* argL = getIReg64orZR(nn);
2515         IRExpr* argR = mkU64(imm);
2516         IRTemp  res  = newTemp(Ity_I64);
2517         assign(res, binop(ops64[op], argL, argR));
2518         if (op < 3) {
2519            putIReg64orSP(dd, mkexpr(res));
2520            DIP("%s %s, %s, 0x%llx\n", names[op],
2521                nameIReg64orSP(dd), nameIReg64orZR(nn), imm);
2522         } else {
2523            putIReg64orZR(dd, mkexpr(res));
2524            setFlags_LOGIC(True/*is64*/, res);
2525            DIP("%s %s, %s, 0x%llx\n", names[op],
2526                nameIReg64orZR(dd), nameIReg64orZR(nn), imm);
2527         }
2528      } else {
2529         IRExpr* argL = getIReg32orZR(nn);
2530         IRExpr* argR = mkU32((UInt)imm);
2531         IRTemp  res  = newTemp(Ity_I32);
2532         assign(res, binop(ops32[op], argL, argR));
2533         if (op < 3) {
2534            putIReg32orSP(dd, mkexpr(res));
2535            DIP("%s %s, %s, 0x%x\n", names[op],
2536                nameIReg32orSP(dd), nameIReg32orZR(nn), (UInt)imm);
2537         } else {
2538            putIReg32orZR(dd, mkexpr(res));
2539            setFlags_LOGIC(False/*!is64*/, res);
2540            DIP("%s %s, %s, 0x%x\n", names[op],
2541                nameIReg32orZR(dd), nameIReg32orZR(nn), (UInt)imm);
2542         }
2543      }
2544      return True;
2545   }
2546   after_logic_imm:
2547
2548   /* -------------------- MOV{Z,N,K} -------------------- */
2549   if (INSN(28,23) == BITS6(1,0,0,1,0,1)) {
2550      /* 31 30 28      22 20    4
2551         |  |  |       |  |     |
2552         sf 10 100 101 hw imm16 Rd   MOV(Z) Rd, (imm16 << (16*hw))
2553         sf 00 100 101 hw imm16 Rd   MOV(N) Rd, ~(imm16 << (16*hw))
         sf 11 100 101 hw imm16 Rd   MOV(K) Rd<16*hw+15 : 16*hw> = imm16
2555      */
2556      Bool is64   = INSN(31,31) == 1;
2557      UInt subopc = INSN(30,29);
2558      UInt hw     = INSN(22,21);
2559      UInt imm16  = INSN(20,5);
2560      UInt dd     = INSN(4,0);
2561      if (subopc == BITS2(0,1) || (!is64 && hw >= 2)) {
2562         /* invalid; fall through */
2563      } else {
2564         ULong imm64 = ((ULong)imm16) << (16 * hw);
2565         if (!is64)
2566            vassert(imm64 < 0x100000000ULL);
2567         switch (subopc) {
2568            case BITS2(1,0): // MOVZ
2569               putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2570               DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2571               break;
2572            case BITS2(0,0): // MOVN
2573               imm64 = ~imm64;
2574               if (!is64)
2575                  imm64 &= 0xFFFFFFFFULL;
2576               putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2577               DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2578               break;
2579            case BITS2(1,1): // MOVK
2580               /* This is more complex.  We are inserting a slice into
2581                  the destination register, so we need to have the old
2582                  value of it. */
2583               if (is64) {
2584                  IRTemp old = newTemp(Ity_I64);
2585                  assign(old, getIReg64orZR(dd));
2586                  ULong mask = 0xFFFFULL << (16 * hw);
2587                  IRExpr* res
2588                     = binop(Iop_Or64,
2589                             binop(Iop_And64, mkexpr(old), mkU64(~mask)),
2590                             mkU64(imm64));
2591                  putIReg64orZR(dd, res);
2592                  DIP("movk %s, 0x%x, lsl %u\n",
2593                      nameIReg64orZR(dd), imm16, 16*hw);
2594               } else {
2595                  IRTemp old = newTemp(Ity_I32);
2596                  assign(old, getIReg32orZR(dd));
2597                  vassert(hw <= 1);
2598                  UInt mask = ((UInt)0xFFFF) << (16 * hw);
2599                  IRExpr* res
2600                     = binop(Iop_Or32,
2601                             binop(Iop_And32, mkexpr(old), mkU32(~mask)),
2602                             mkU32((UInt)imm64));
2603                  putIReg32orZR(dd, res);
2604                  DIP("movk %s, 0x%x, lsl %u\n",
2605                      nameIReg32orZR(dd), imm16, 16*hw);
2606               }
2607               break;
2608            default:
2609               vassert(0);
2610         }
2611         return True;
2612      }
2613   }
2614
2615   /* -------------------- {U,S,}BFM -------------------- */
2616   /*    30 28     22 21   15   9  4
2617
2618      sf 10 100110 N  immr imms nn dd
2619         UBFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
2620         UBFM Xd, Xn, #immr, #imms   when sf=1, N=1
2621
2622      sf 00 100110 N  immr imms nn dd
2623         SBFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
2624         SBFM Xd, Xn, #immr, #imms   when sf=1, N=1
2625
2626      sf 01 100110 N  immr imms nn dd
2627         BFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
2628         BFM Xd, Xn, #immr, #imms   when sf=1, N=1
2629   */
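   /* A concrete instance: UBFM Xd, Xn, #8, #15 decodes to
      wmask = 0xFF000000000000FF (0xFFFF rotated right by 8) and
      tmask = 0xFF; with inZero=True and extend=False as set below,
      the net effect is Xd = (Xn >> 8) & 0xFF, i.e. the
      UBFX Xd, Xn, #8, #8 alias. */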
2630   if (INSN(28,23) == BITS6(1,0,0,1,1,0)) {
2631      UInt sf     = INSN(31,31);
2632      UInt opc    = INSN(30,29);
2633      UInt N      = INSN(22,22);
2634      UInt immR   = INSN(21,16);
2635      UInt immS   = INSN(15,10);
2636      UInt nn     = INSN(9,5);
2637      UInt dd     = INSN(4,0);
2638      Bool inZero = False;
2639      Bool extend = False;
2640      const HChar* nm = "???";
2641      /* skip invalid combinations */
2642      switch (opc) {
2643         case BITS2(0,0):
2644            inZero = True; extend = True; nm = "sbfm"; break;
2645         case BITS2(0,1):
2646            inZero = False; extend = False; nm = "bfm"; break;
2647         case BITS2(1,0):
2648            inZero = True; extend = False; nm = "ubfm"; break;
2649         case BITS2(1,1):
2650            goto after_bfm; /* invalid */
2651         default:
2652            vassert(0);
2653      }
2654      if (sf == 1 && N != 1) goto after_bfm;
2655      if (sf == 0 && (N != 0 || ((immR >> 5) & 1) != 0
2656                             || ((immS >> 5) & 1) != 0)) goto after_bfm;
2657      ULong wmask = 0, tmask = 0;
2658      Bool ok = dbm_DecodeBitMasks(&wmask, &tmask,
2659                                   N, immS, immR, False, sf == 1 ? 64 : 32);
2660      if (!ok) goto after_bfm; /* hmmm */
2661
2662      Bool   is64 = sf == 1;
2663      IRType ty   = is64 ? Ity_I64 : Ity_I32;
2664
2665      IRTemp dst = newTemp(ty);
2666      IRTemp src = newTemp(ty);
2667      IRTemp bot = newTemp(ty);
2668      IRTemp top = newTemp(ty);
2669      IRTemp res = newTemp(ty);
2670      assign(dst, inZero ? mkU(ty,0) : getIRegOrZR(is64, dd));
2671      assign(src, getIRegOrZR(is64, nn));
2672      /* perform bitfield move on low bits */
2673      assign(bot, binop(mkOR(ty),
2674                        binop(mkAND(ty), mkexpr(dst), mkU(ty, ~wmask)),
2675                        binop(mkAND(ty), mkexpr(mathROR(ty, src, immR)),
2676                                         mkU(ty, wmask))));
2677      /* determine extension bits (sign, zero or dest register) */
2678      assign(top, mkexpr(extend ? mathREPLICATE(ty, src, immS) : dst));
2679      /* combine extension bits and result bits */
2680      assign(res, binop(mkOR(ty),
2681                        binop(mkAND(ty), mkexpr(top), mkU(ty, ~tmask)),
2682                        binop(mkAND(ty), mkexpr(bot), mkU(ty, tmask))));
2683      putIRegOrZR(is64, dd, mkexpr(res));
2684      DIP("%s %s, %s, immR=%u, immS=%u\n",
2685          nm, nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR, immS);
2686      return True;
2687   }
2688   after_bfm:
2689
2690   /* ---------------------- EXTR ---------------------- */
2691   /*   30 28     22 20 15   9 4
2692      1 00 100111 10 m  imm6 n d  EXTR Xd, Xn, Xm, #imm6
2693      0 00 100111 00 m  imm6 n d  EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32
2694   */
2695   if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) {
2696      Bool is64  = INSN(31,31) == 1;
2697      UInt mm    = INSN(20,16);
2698      UInt imm6  = INSN(15,10);
2699      UInt nn    = INSN(9,5);
2700      UInt dd    = INSN(4,0);
2701      Bool valid = True;
2702      if (INSN(31,31) != INSN(22,22))
2703        valid = False;
2704      if (!is64 && imm6 >= 32)
2705        valid = False;
2706      if (!valid) goto after_extr;
2707      IRType ty    = is64 ? Ity_I64 : Ity_I32;
2708      IRTemp srcHi = newTemp(ty);
2709      IRTemp srcLo = newTemp(ty);
2710      IRTemp res   = newTemp(ty);
2711      assign(srcHi, getIRegOrZR(is64, nn));
2712      assign(srcLo, getIRegOrZR(is64, mm));
2713      if (imm6 == 0) {
2714        assign(res, mkexpr(srcLo));
2715      } else {
2716        UInt szBits = 8 * sizeofIRType(ty);
2717        vassert(imm6 > 0 && imm6 < szBits);
2718        assign(res, binop(mkOR(ty),
2719                          binop(mkSHL(ty), mkexpr(srcHi), mkU8(szBits-imm6)),
2720                          binop(mkSHR(ty), mkexpr(srcLo), mkU8(imm6))));
2721      }
2722      putIRegOrZR(is64, dd, mkexpr(res));
2723      DIP("extr %s, %s, %s, #%u\n",
2724          nameIRegOrZR(is64,dd),
2725          nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm), imm6);
2726      return True;
2727   }
   after_extr:
2729
2730   vex_printf("ARM64 front end: data_processing_immediate\n");
2731   return False;
2732#  undef INSN
2733}
2734
2735
2736/*------------------------------------------------------------*/
2737/*--- Data processing (register) instructions              ---*/
2738/*------------------------------------------------------------*/
2739
2740static const HChar* nameSH ( UInt sh ) {
2741   switch (sh) {
2742      case 0: return "lsl";
2743      case 1: return "lsr";
2744      case 2: return "asr";
2745      case 3: return "ror";
2746      default: vassert(0);
2747   }
2748}
2749
2750/* Generate IR to get a register value, possibly shifted by an
2751   immediate.  Returns either a 32- or 64-bit temporary holding the
2752   result.  After the shift, the value can optionally be NOT-ed
2753   too.
2754
2755   sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR.  sh_amt may only be
2756   in the range 0 to (is64 ? 64 : 32)-1.  For some instructions, ROR
2757   isn't allowed, but it's the job of the caller to check that.
2758*/
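/* For instance, getShiftedIRegOrZR(True, BITS2(1,1), 8, rM, True)
   delivers NOT(ROR64(Xm, 8)), the form needed by the inverting
   logical ops (BIC/ORN/EON) handled below. */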
2759static IRTemp getShiftedIRegOrZR ( Bool is64,
2760                                   UInt sh_how, UInt sh_amt, UInt regNo,
2761                                   Bool invert )
2762{
2763   vassert(sh_how < 4);
2764   vassert(sh_amt < (is64 ? 64 : 32));
2765   IRType ty = is64 ? Ity_I64 : Ity_I32;
2766   IRTemp t0 = newTemp(ty);
2767   assign(t0, getIRegOrZR(is64, regNo));
2768   IRTemp t1 = newTemp(ty);
2769   switch (sh_how) {
2770      case BITS2(0,0):
2771         assign(t1, binop(mkSHL(ty), mkexpr(t0), mkU8(sh_amt)));
2772         break;
2773      case BITS2(0,1):
2774         assign(t1, binop(mkSHR(ty), mkexpr(t0), mkU8(sh_amt)));
2775         break;
2776      case BITS2(1,0):
2777         assign(t1, binop(mkSAR(ty), mkexpr(t0), mkU8(sh_amt)));
2778         break;
2779      case BITS2(1,1):
2780         assign(t1, mkexpr(mathROR(ty, t0, sh_amt)));
2781         break;
2782      default:
2783         vassert(0);
2784   }
2785   if (invert) {
2786      IRTemp t2 = newTemp(ty);
2787      assign(t2, unop(mkNOT(ty), mkexpr(t1)));
2788      return t2;
2789   } else {
2790      return t1;
2791   }
2792}
2793
2794
2795static
2796Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres,
2797                                        UInt insn)
2798{
2799#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
2800
2801   /* ------------------- ADD/SUB(reg) ------------------- */
2802   /* x==0 => 32 bit op      x==1 => 64 bit op
2803      sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED)
2804
2805      31 30 29 28    23 21 20 15   9  4
2806      |  |  |  |     |  |  |  |    |  |
2807      x  0  0  01011 sh 0  Rm imm6 Rn Rd   ADD  Rd,Rn, sh(Rm,imm6)
2808      x  0  1  01011 sh 0  Rm imm6 Rn Rd   ADDS Rd,Rn, sh(Rm,imm6)
2809      x  1  0  01011 sh 0  Rm imm6 Rn Rd   SUB  Rd,Rn, sh(Rm,imm6)
2810      x  1  1  01011 sh 0  Rm imm6 Rn Rd   SUBS Rd,Rn, sh(Rm,imm6)
2811   */
2812   if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) {
2813      UInt   bX    = INSN(31,31);
2814      UInt   bOP   = INSN(30,30); /* 0: ADD, 1: SUB */
      UInt   bS    = INSN(29,29); /* set flags? */
2816      UInt   sh    = INSN(23,22);
2817      UInt   rM    = INSN(20,16);
2818      UInt   imm6  = INSN(15,10);
2819      UInt   rN    = INSN(9,5);
2820      UInt   rD    = INSN(4,0);
2821      Bool   isSUB = bOP == 1;
2822      Bool   is64  = bX == 1;
2823      IRType ty    = is64 ? Ity_I64 : Ity_I32;
2824      if ((!is64 && imm6 > 31) || sh == BITS2(1,1)) {
2825         /* invalid; fall through */
2826      } else {
2827         IRTemp argL = newTemp(ty);
2828         assign(argL, getIRegOrZR(is64, rN));
2829         IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, False);
2830         IROp   op   = isSUB ? mkSUB(ty) : mkADD(ty);
2831         IRTemp res  = newTemp(ty);
2832         assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
2833         if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
2834         if (bS) {
2835            setFlags_ADD_SUB(is64, isSUB, argL, argR);
2836         }
2837         DIP("%s%s %s, %s, %s, %s #%u\n",
2838             bOP ? "sub" : "add", bS ? "s" : "",
2839             nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2840             nameIRegOrZR(is64, rM), nameSH(sh), imm6);
2841         return True;
2842      }
2843   }
2844
2845   /* ------------------- ADC/SBC(reg) ------------------- */
2846   /* x==0 => 32 bit op      x==1 => 64 bit op
2847
2848      31 30 29 28    23 21 20 15     9  4
2849      |  |  |  |     |  |  |  |      |  |
2850      x  0  0  11010 00 0  Rm 000000 Rn Rd   ADC  Rd,Rn,Rm
2851      x  0  1  11010 00 0  Rm 000000 Rn Rd   ADCS Rd,Rn,Rm
2852      x  1  0  11010 00 0  Rm 000000 Rn Rd   SBC  Rd,Rn,Rm
2853      x  1  1  11010 00 0  Rm 000000 Rn Rd   SBCS Rd,Rn,Rm
2854   */
2855
2856   if (INSN(28,21) == BITS8(1,1,0,1,0,0,0,0) && INSN(15,10) == 0 ) {
2857      UInt   bX    = INSN(31,31);
2858      UInt   bOP   = INSN(30,30); /* 0: ADC, 1: SBC */
2859      UInt   bS    = INSN(29,29); /* set flags */
2860      UInt   rM    = INSN(20,16);
2861      UInt   rN    = INSN(9,5);
2862      UInt   rD    = INSN(4,0);
2863
2864      Bool   isSUB = bOP == 1;
2865      Bool   is64  = bX == 1;
2866      IRType ty    = is64 ? Ity_I64 : Ity_I32;
2867
2868      IRTemp oldC = newTemp(ty);
2869      assign(oldC,
2870             is64 ? mk_arm64g_calculate_flag_c()
2871                  : unop(Iop_64to32, mk_arm64g_calculate_flag_c()) );
2872
2873      IRTemp argL = newTemp(ty);
2874      assign(argL, getIRegOrZR(is64, rN));
2875      IRTemp argR = newTemp(ty);
2876      assign(argR, getIRegOrZR(is64, rM));
2877
2878      IROp   op   = isSUB ? mkSUB(ty) : mkADD(ty);
2879      IRTemp res  = newTemp(ty);
2880      if (isSUB) {
2881         IRExpr* one = is64 ? mkU64(1) : mkU32(1);
2882         IROp xorOp = is64 ? Iop_Xor64 : Iop_Xor32;
2883         assign(res,
2884                binop(op,
2885                      binop(op, mkexpr(argL), mkexpr(argR)),
2886                      binop(xorOp, mkexpr(oldC), one)));
2887      } else {
2888         assign(res,
2889                binop(op,
2890                      binop(op, mkexpr(argL), mkexpr(argR)),
2891                      mkexpr(oldC)));
2892      }
2893
2894      if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
2895
2896      if (bS) {
2897         setFlags_ADC_SBC(is64, isSUB, argL, argR, oldC);
2898      }
2899
2900      DIP("%s%s %s, %s, %s\n",
2901          bOP ? "sbc" : "adc", bS ? "s" : "",
2902          nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2903          nameIRegOrZR(is64, rM));
2904      return True;
2905   }
2906
2907   /* -------------------- LOGIC(reg) -------------------- */
2908   /* x==0 => 32 bit op      x==1 => 64 bit op
2909      N==0 => inv? is no-op (no inversion)
2910      N==1 => inv? is NOT
2911      sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR
2912
2913      31 30 28    23 21 20 15   9  4
2914      |  |  |     |  |  |  |    |  |
2915      x  00 01010 sh N  Rm imm6 Rn Rd  AND  Rd,Rn, inv?(sh(Rm,imm6))
2916      x  01 01010 sh N  Rm imm6 Rn Rd  ORR  Rd,Rn, inv?(sh(Rm,imm6))
2917      x  10 01010 sh N  Rm imm6 Rn Rd  EOR  Rd,Rn, inv?(sh(Rm,imm6))
2918      x  11 01010 sh N  Rm imm6 Rn Rd  ANDS Rd,Rn, inv?(sh(Rm,imm6))
2919      With N=1, the names are: BIC ORN EON BICS
2920   */
2921   if (INSN(28,24) == BITS5(0,1,0,1,0)) {
2922      UInt   bX   = INSN(31,31);
2923      UInt   sh   = INSN(23,22);
2924      UInt   bN   = INSN(21,21);
2925      UInt   rM   = INSN(20,16);
2926      UInt   imm6 = INSN(15,10);
2927      UInt   rN   = INSN(9,5);
2928      UInt   rD   = INSN(4,0);
2929      Bool   is64 = bX == 1;
2930      IRType ty   = is64 ? Ity_I64 : Ity_I32;
2931      if (!is64 && imm6 > 31) {
         /* invalid; fall through */
2933      } else {
2934         IRTemp argL = newTemp(ty);
2935         assign(argL, getIRegOrZR(is64, rN));
2936         IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, bN == 1);
2937         IROp   op   = Iop_INVALID;
2938         switch (INSN(30,29)) {
2939            case BITS2(0,0): case BITS2(1,1): op = mkAND(ty); break;
2940            case BITS2(0,1):                  op = mkOR(ty);  break;
2941            case BITS2(1,0):                  op = mkXOR(ty); break;
2942            default: vassert(0);
2943         }
2944         IRTemp res = newTemp(ty);
2945         assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
2946         if (INSN(30,29) == BITS2(1,1)) {
2947            setFlags_LOGIC(is64, res);
2948         }
2949         putIRegOrZR(is64, rD, mkexpr(res));
2950
2951         static const HChar* names_op[8]
2952            = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" };
2953         vassert(((bN << 2) | INSN(30,29)) < 8);
2954         const HChar* nm_op = names_op[(bN << 2) | INSN(30,29)];
2955         /* Special-case the printing of "MOV" */
2956         if (rN == 31/*zr*/ && sh == 0/*LSL*/ && imm6 == 0 && bN == 0) {
2957            DIP("mov %s, %s\n", nameIRegOrZR(is64, rD),
2958                                nameIRegOrZR(is64, rM));
2959         } else {
2960            DIP("%s %s, %s, %s, %s #%u\n", nm_op,
2961                nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2962                nameIRegOrZR(is64, rM), nameSH(sh), imm6);
2963         }
2964         return True;
2965      }
2966   }
2967
2968   /* -------------------- {U,S}MULH -------------------- */
2969   /* 31       23 22 20 15     9   4
2970      10011011 1  10 Rm 011111 Rn Rd   UMULH Xd,Xn,Xm
2971      10011011 0  10 Rm 011111 Rn Rd   SMULH Xd,Xn,Xm
2972   */
2973   if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1)
2974       && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) {
2975      Bool isU = INSN(23,23) == 1;
2976      UInt mm  = INSN(20,16);
2977      UInt nn  = INSN(9,5);
2978      UInt dd  = INSN(4,0);
2979      putIReg64orZR(dd, unop(Iop_128HIto64,
2980                             binop(isU ? Iop_MullU64 : Iop_MullS64,
2981                                   getIReg64orZR(nn), getIReg64orZR(mm))));
2982      DIP("%cmulh %s, %s, %s\n",
2983          isU ? 'u' : 's',
2984          nameIReg64orZR(dd), nameIReg64orZR(nn), nameIReg64orZR(mm));
2985      return True;
2986   }
2987
2988   /* -------------------- M{ADD,SUB} -------------------- */
2989   /* 31 30           20 15 14 9 4
2990      sf 00 11011 000 m  0  a  n r   MADD Rd,Rn,Rm,Ra  d = a+m*n
      sf 00 11011 000 m  1  a  n r   MSUB Rd,Rn,Rm,Ra  d = a-m*n
2992   */
2993   if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) {
2994      Bool is64  = INSN(31,31) == 1;
2995      UInt mm    = INSN(20,16);
2996      Bool isAdd = INSN(15,15) == 0;
2997      UInt aa    = INSN(14,10);
2998      UInt nn    = INSN(9,5);
2999      UInt dd    = INSN(4,0);
3000      if (is64) {
3001         putIReg64orZR(
3002            dd,
3003            binop(isAdd ? Iop_Add64 : Iop_Sub64,
3004                  getIReg64orZR(aa),
3005                  binop(Iop_Mul64, getIReg64orZR(mm), getIReg64orZR(nn))));
3006      } else {
3007         putIReg32orZR(
3008            dd,
3009            binop(isAdd ? Iop_Add32 : Iop_Sub32,
3010                  getIReg32orZR(aa),
3011                  binop(Iop_Mul32, getIReg32orZR(mm), getIReg32orZR(nn))));
3012      }
3013      DIP("%s %s, %s, %s, %s\n",
3014          isAdd ? "madd" : "msub",
3015          nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3016          nameIRegOrZR(is64, mm), nameIRegOrZR(is64, aa));
3017      return True;
3018   }
3019
3020   /* ---------------- CS{EL,INC,INV,NEG} ---------------- */
3021   /* 31 30 28        20 15   11 9  4
3022      sf 00 1101 0100 mm cond 00 nn dd   CSEL  Rd,Rn,Rm
3023      sf 00 1101 0100 mm cond 01 nn dd   CSINC Rd,Rn,Rm
3024      sf 10 1101 0100 mm cond 00 nn dd   CSINV Rd,Rn,Rm
3025      sf 10 1101 0100 mm cond 01 nn dd   CSNEG Rd,Rn,Rm
3026      In all cases, the operation is: Rd = if cond then Rn else OP(Rm)
3027   */
3028   if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) {
3029      Bool    is64 = INSN(31,31) == 1;
3030      UInt    b30  = INSN(30,30);
3031      UInt    mm   = INSN(20,16);
3032      UInt    cond = INSN(15,12);
3033      UInt    b10  = INSN(10,10);
3034      UInt    nn   = INSN(9,5);
3035      UInt    dd   = INSN(4,0);
3036      UInt    op   = (b30 << 1) | b10; /* 00=id 01=inc 10=inv 11=neg */
3037      IRType  ty   = is64 ? Ity_I64 : Ity_I32;
3038      IRExpr* argL = getIRegOrZR(is64, nn);
3039      IRExpr* argR = getIRegOrZR(is64, mm);
3040      switch (op) {
3041         case BITS2(0,0):
3042            break;
3043         case BITS2(0,1):
3044            argR = binop(mkADD(ty), argR, mkU(ty,1));
3045            break;
3046         case BITS2(1,0):
3047            argR = unop(mkNOT(ty), argR);
3048            break;
3049         case BITS2(1,1):
3050            argR = binop(mkSUB(ty), mkU(ty,0), argR);
3051            break;
3052         default:
3053            vassert(0);
3054      }
3055      putIRegOrZR(
3056         is64, dd,
3057         IRExpr_ITE(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
3058                    argL, argR)
3059      );
3060      const HChar* op_nm[4] = { "csel", "csinc", "csinv", "csneg" };
3061      DIP("%s %s, %s, %s, %s\n", op_nm[op],
3062          nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3063          nameIRegOrZR(is64, mm), nameCC(cond));
3064      return True;
3065   }
3066
3067   /* -------------- ADD/SUB(extended reg) -------------- */
3068   /*     28         20 15  12   9 4
3069      000 01011 00 1 m  opt imm3 n d   ADD  Wd|SP, Wn|SP, Wm ext&lsld
3070      100 01011 00 1 m  opt imm3 n d   ADD  Xd|SP, Xn|SP, Rm ext&lsld
3071
3072      001 01011 00 1 m  opt imm3 n d   ADDS Wd,    Wn|SP, Wm ext&lsld
3073      101 01011 00 1 m  opt imm3 n d   ADDS Xd,    Xn|SP, Rm ext&lsld
3074
3075      010 01011 00 1 m  opt imm3 n d   SUB  Wd|SP, Wn|SP, Wm ext&lsld
3076      110 01011 00 1 m  opt imm3 n d   SUB  Xd|SP, Xn|SP, Rm ext&lsld
3077
3078      011 01011 00 1 m  opt imm3 n d   SUBS Wd,    Wn|SP, Wm ext&lsld
3079      111 01011 00 1 m  opt imm3 n d   SUBS Xd,    Xn|SP, Rm ext&lsld
3080
3081      The 'm' operand is extended per opt, thusly:
3082
3083        000   Xm & 0xFF           UXTB
3084        001   Xm & 0xFFFF         UXTH
3085        010   Xm & (2^32)-1       UXTW
3086        011   Xm                  UXTX
3087
3088        100   Xm sx from bit 7    SXTB
3089        101   Xm sx from bit 15   SXTH
3090        110   Xm sx from bit 31   SXTW
3091        111   Xm                  SXTX
3092
3093      In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity
3094      operation on Xm.  In the 32 bit case, UXTW, UXTX, SXTW and SXTX
3095      are the identity operation on Wm.
3096
3097      After extension, the value is shifted left by imm3 bits, which
3098      may only be in the range 0 .. 4 inclusive.
3099   */
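   /* For example, ADD X0, X1, W2, UXTB #2 computes
      X0 = X1 + ((X2 & 0xFF) << 2), while the SXTB variant would
      instead sign-extend X2 from bit 7 before the shift. */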
3100   if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) {
3101      Bool is64  = INSN(31,31) == 1;
3102      Bool isSub = INSN(30,30) == 1;
3103      Bool setCC = INSN(29,29) == 1;
3104      UInt mm    = INSN(20,16);
3105      UInt opt   = INSN(15,13);
3106      UInt imm3  = INSN(12,10);
3107      UInt nn    = INSN(9,5);
3108      UInt dd    = INSN(4,0);
3109      const HChar* nameExt[8] = { "uxtb", "uxth", "uxtw", "uxtx",
3110                                  "sxtb", "sxth", "sxtw", "sxtx" };
3111      /* Do almost the same thing in the 32- and 64-bit cases. */
3112      IRTemp xN = newTemp(Ity_I64);
3113      IRTemp xM = newTemp(Ity_I64);
3114      assign(xN, getIReg64orSP(nn));
3115      assign(xM, getIReg64orZR(mm));
3116      IRExpr* xMw  = mkexpr(xM); /* "xM widened" */
3117      Int     shSX = 0;
3118      /* widen Xm .. */
3119      switch (opt) {
3120         case BITS3(0,0,0): // UXTB
3121            xMw = binop(Iop_And64, xMw, mkU64(0xFF)); break;
3122         case BITS3(0,0,1): // UXTH
3123            xMw = binop(Iop_And64, xMw, mkU64(0xFFFF)); break;
3124         case BITS3(0,1,0): // UXTW -- noop for the 32bit case
3125            if (is64) {
3126               xMw = unop(Iop_32Uto64, unop(Iop_64to32, xMw));
3127            }
3128            break;
3129         case BITS3(0,1,1): // UXTX -- always a noop
3130            break;
3131         case BITS3(1,0,0): // SXTB
3132            shSX = 56; goto sxTo64;
3133         case BITS3(1,0,1): // SXTH
3134            shSX = 48; goto sxTo64;
3135         case BITS3(1,1,0): // SXTW -- noop for the 32bit case
3136            if (is64) {
3137               shSX = 32; goto sxTo64;
3138            }
3139            break;
3140         case BITS3(1,1,1): // SXTX -- always a noop
3141            break;
3142         sxTo64:
3143            vassert(shSX >= 32);
3144            xMw = binop(Iop_Sar64, binop(Iop_Shl64, xMw, mkU8(shSX)),
3145                        mkU8(shSX));
3146            break;
3147         default:
3148            vassert(0);
3149      }
3150      /* and now shift */
3151      IRTemp argL = xN;
3152      IRTemp argR = newTemp(Ity_I64);
3153      assign(argR, binop(Iop_Shl64, xMw, mkU8(imm3)));
3154      IRTemp res = newTemp(Ity_I64);
3155      assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
3156                        mkexpr(argL), mkexpr(argR)));
3157      if (is64) {
3158         if (setCC) {
3159            putIReg64orZR(dd, mkexpr(res));
3160            setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
3161         } else {
3162            putIReg64orSP(dd, mkexpr(res));
3163         }
3164      } else {
3165         if (setCC) {
3166            IRTemp argL32 = newTemp(Ity_I32);
3167            IRTemp argR32 = newTemp(Ity_I32);
3168            putIReg32orZR(dd, unop(Iop_64to32, mkexpr(res)));
3169            assign(argL32, unop(Iop_64to32, mkexpr(argL)));
3170            assign(argR32, unop(Iop_64to32, mkexpr(argR)));
3171            setFlags_ADD_SUB(False/*!is64*/, isSub, argL32, argR32);
3172         } else {
3173            putIReg32orSP(dd, unop(Iop_64to32, mkexpr(res)));
3174         }
3175      }
      DIP("%s%s %s, %s, %s %s lsl %u\n",
          isSub ? "sub" : "add", setCC ? "s" : "",
          setCC ? nameIRegOrZR(is64, dd) : nameIRegOrSP(is64, dd),
          nameIRegOrSP(is64, nn), nameIRegOrZR(is64, mm),
          nameExt[opt], imm3);
      return True;
   }

   /* ---------------- CCMP/CCMN(imm) ---------------- */
   /* Bizarrely, these appear in the "data processing register"
      category, even though they are operations against an
      immediate. */
   /* 31   29        20   15   11 9    3
      sf 1 111010010 imm5 cond 10 Rn 0 nzcv   CCMP Rn, #imm5, #nzcv, cond
      sf 0 111010010 imm5 cond 10 Rn 0 nzcv   CCMN Rn, #imm5, #nzcv, cond

      Operation is:
         (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv
         (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv
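
      For example, "ccmp x1, #4, #2, ne" sets the flags from the
      subtraction x1 - 4 if NE currently holds, and otherwise sets
      NZCV to 0b0010 (i.e. just C).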
   */
   if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
       && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) {
      Bool is64  = INSN(31,31) == 1;
      Bool isSUB = INSN(30,30) == 1;
      UInt imm5  = INSN(20,16);
      UInt cond  = INSN(15,12);
      UInt nn    = INSN(9,5);
      UInt nzcv  = INSN(3,0);

      IRTemp condT = newTemp(Ity_I1);
      assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));

      IRType ty   = is64 ? Ity_I64 : Ity_I32;
      IRTemp argL = newTemp(ty);
      IRTemp argR = newTemp(ty);

      if (is64) {
         assign(argL, getIReg64orZR(nn));
         assign(argR, mkU64(imm5));
      } else {
         assign(argL, getIReg32orZR(nn));
         assign(argR, mkU32(imm5));
      }
      setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);

      DIP("ccm%c %s, #%u, #%u, %s\n",
          isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
          imm5, nzcv, nameCC(cond));
      return True;
   }

   /* ---------------- CCMP/CCMN(reg) ---------------- */
   /* 31   29        20 15   11 9    3
      sf 1 111010010 Rm cond 00 Rn 0 nzcv   CCMP Rn, Rm, #nzcv, cond
      sf 0 111010010 Rm cond 00 Rn 0 nzcv   CCMN Rn, Rm, #nzcv, cond
      Operation is:
         (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv
         (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv
   */
   if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
       && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) {
      Bool is64  = INSN(31,31) == 1;
      Bool isSUB = INSN(30,30) == 1;
      UInt mm    = INSN(20,16);
      UInt cond  = INSN(15,12);
      UInt nn    = INSN(9,5);
      UInt nzcv  = INSN(3,0);

      IRTemp condT = newTemp(Ity_I1);
      assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));

      IRType ty   = is64 ? Ity_I64 : Ity_I32;
      IRTemp argL = newTemp(ty);
      IRTemp argR = newTemp(ty);

      if (is64) {
         assign(argL, getIReg64orZR(nn));
         assign(argR, getIReg64orZR(mm));
      } else {
         assign(argL, getIReg32orZR(nn));
         assign(argR, getIReg32orZR(mm));
      }
      setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);

      DIP("ccm%c %s, %s, #%u, %s\n",
          isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
          nameIRegOrZR(is64, mm), nzcv, nameCC(cond));
      return True;
   }


   /* -------------- REV/REV16/REV32/RBIT -------------- */
   /* 31 30 28       20    15   11 9 4

      1  10 11010110 00000 0000 11 n d    (1) REV   Xd, Xn
      0  10 11010110 00000 0000 10 n d    (2) REV   Wd, Wn

      1  10 11010110 00000 0000 00 n d    (3) RBIT  Xd, Xn
      0  10 11010110 00000 0000 00 n d    (4) RBIT  Wd, Wn

      1  10 11010110 00000 0000 01 n d    (5) REV16 Xd, Xn
      0  10 11010110 00000 0000 01 n d    (6) REV16 Wd, Wn

      1  10 11010110 00000 0000 10 n d    (7) REV32 Xd, Xn
   */
   if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
       && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) {
      UInt b31 = INSN(31,31);
      UInt opc = INSN(11,10);

      UInt ix = 0;
      /**/ if (b31 == 1 && opc == BITS2(1,1)) ix = 1;
      else if (b31 == 0 && opc == BITS2(1,0)) ix = 2;
      else if (b31 == 1 && opc == BITS2(0,0)) ix = 3;
      else if (b31 == 0 && opc == BITS2(0,0)) ix = 4;
      else if (b31 == 1 && opc == BITS2(0,1)) ix = 5;
      else if (b31 == 0 && opc == BITS2(0,1)) ix = 6;
      else if (b31 == 1 && opc == BITS2(1,0)) ix = 7;
      if (ix >= 1 && ix <= 7) {
         Bool   is64  = ix == 1 || ix == 3 || ix == 5 || ix == 7;
         UInt   nn    = INSN(9,5);
         UInt   dd    = INSN(4,0);
         IRTemp src   = newTemp(Ity_I64);
         IRTemp dst   = IRTemp_INVALID;
         IRTemp (*math)(IRTemp) = NULL;
         switch (ix) {
            case 1: case 2: math = math_BYTESWAP64;   break;
            case 3: case 4: math = math_BITSWAP64;    break;
            case 5: case 6: math = math_USHORTSWAP64; break;
            case 7:         math = math_UINTSWAP64;   break;
            default: vassert(0);
         }
         const HChar* names[7]
           = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" };
         const HChar* nm = names[ix-1];
         vassert(math);
         if (ix == 6) {
            /* This has to be special cased, since the logic below doesn't
               handle it correctly. */
            assign(src, getIReg64orZR(nn));
            dst = math(src);
            putIReg64orZR(dd,
                          unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(dst))));
         } else if (is64) {
            assign(src, getIReg64orZR(nn));
            dst = math(src);
            putIReg64orZR(dd, mkexpr(dst));
         } else {
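            /* Position Wn in the top half first, so that the 64-bit
               swap deposits the swapped result in the bottom 32 bits. */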
            assign(src, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(32)));
            dst = math(src);
            putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
         }
         DIP("%s %s, %s\n", nm,
             nameIRegOrZR(is64,dd), nameIRegOrZR(is64,nn));
         return True;
      }
      /* else fall through */
   }

   /* -------------------- CLZ/CLS -------------------- */
   /*    30 28   24   20    15      9 4
      sf 10 1101 0110 00000 00010 0 n d    CLZ Rd, Rn
      sf 10 1101 0110 00000 00010 1 n d    CLS Rd, Rn
   */
   if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
       && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) {
      Bool   is64  = INSN(31,31) == 1;
      Bool   isCLS = INSN(10,10) == 1;
      UInt   nn    = INSN(9,5);
      UInt   dd    = INSN(4,0);
      IRTemp src   = newTemp(Ity_I64);
      IRTemp srcZ  = newTemp(Ity_I64);
      IRTemp dst   = newTemp(Ity_I64);
      /* Get the argument, widened out to 64 bit */
      if (is64) {
         assign(src, getIReg64orZR(nn));
      } else {
         assign(src, binop(Iop_Shl64,
                           unop(Iop_32Uto64, getIReg32orZR(nn)), mkU8(32)));
      }
      /* If this is CLS, mash the arg around accordingly */
      if (isCLS) {
         IRExpr* one = mkU8(1);
         assign(srcZ,
         binop(Iop_Xor64,
               binop(Iop_Shl64, mkexpr(src), one),
               binop(Iop_Shl64, binop(Iop_Shr64, mkexpr(src), one), one)));
      } else {
         assign(srcZ, mkexpr(src));
      }
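      /* Why this works: bit i of srcZ is src[i] XOR src[i-1], so each
         leading zero of srcZ marks a bit that replicates the one above
         it.  Hence CLZ(srcZ) counts the bits below the sign bit that
         copy it -- exactly CLS.  An illustrative sketch of the 64-bit
         case (not part of the decoder; assumes a GCC-style
         __builtin_clzll):

            UInt cls64 ( ULong x ) {
               ULong y = (x << 1) ^ ((x >> 1) << 1);
               return y == 0 ? 63 : (UInt)__builtin_clzll(y);
            }
      */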
      /* And compute CLZ. */
      if (is64) {
         assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
                                mkU64(isCLS ? 63 : 64),
                                unop(Iop_Clz64, mkexpr(srcZ))));
         putIReg64orZR(dd, mkexpr(dst));
      } else {
         assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
                                mkU64(isCLS ? 31 : 32),
                                unop(Iop_Clz64, mkexpr(srcZ))));
         putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
      }
      DIP("cl%c %s, %s\n", isCLS ? 's' : 'z',
          nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn));
      return True;
   }

   /* ------------------ LSLV/LSRV/ASRV/RORV ------------------ */
   /*    30 28        20 15   11 9 4
      sf 00 1101 0110 m  0010 00 n d   LSLV Rd,Rn,Rm
      sf 00 1101 0110 m  0010 01 n d   LSRV Rd,Rn,Rm
      sf 00 1101 0110 m  0010 10 n d   ASRV Rd,Rn,Rm
      sf 00 1101 0110 m  0010 11 n d   RORV Rd,Rn,Rm
   */
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
       && INSN(15,12) == BITS4(0,0,1,0)) {
      Bool   is64 = INSN(31,31) == 1;
      UInt   mm   = INSN(20,16);
      UInt   op   = INSN(11,10);
      UInt   nn   = INSN(9,5);
      UInt   dd   = INSN(4,0);
      IRType ty   = is64 ? Ity_I64 : Ity_I32;
      IRTemp srcL = newTemp(ty);
      IRTemp srcR = newTemp(Ity_I64);
      IRTemp res  = newTemp(ty);
      IROp   iop  = Iop_INVALID;
      assign(srcL, getIRegOrZR(is64, nn));
      assign(srcR, binop(Iop_And64, getIReg64orZR(mm),
                                    mkU64(is64 ? 63 : 31)));
      if (op < 3) {
         // LSLV, LSRV, ASRV
         switch (op) {
            case BITS2(0,0): iop = mkSHL(ty); break;
            case BITS2(0,1): iop = mkSHR(ty); break;
            case BITS2(1,0): iop = mkSAR(ty); break;
            default: vassert(0);
         }
         assign(res, binop(iop, mkexpr(srcL),
                                unop(Iop_64to8, mkexpr(srcR))));
      } else {
         // RORV
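         // ror(x,n) == (x >>u n) | (x << (width - n)); the n == 0 case
         // must be handled separately, since a shift by the full lane
         // width is not a defined IR shift amount.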
         IROp opSHL = mkSHL(ty);
         IROp opSHR = mkSHR(ty);
         IROp opOR  = mkOR(ty);
         IRExpr* width = mkU64(is64 ? 64 : 32);
         assign(
            res,
            IRExpr_ITE(
               binop(Iop_CmpEQ64, mkexpr(srcR), mkU64(0)),
               mkexpr(srcL),
               binop(opOR,
                     binop(opSHL,
                           mkexpr(srcL),
                           unop(Iop_64to8, binop(Iop_Sub64, width,
                                                            mkexpr(srcR)))),
                     binop(opSHR,
                           mkexpr(srcL), unop(Iop_64to8, mkexpr(srcR))))
         ));
      }
      putIRegOrZR(is64, dd, mkexpr(res));
      vassert(op < 4);
      const HChar* names[4] = { "lslv", "lsrv", "asrv", "rorv" };
      DIP("%s %s, %s, %s\n",
          names[op], nameIRegOrZR(is64,dd),
                     nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm));
      return True;
   }

   /* -------------------- SDIV/UDIV -------------------- */
   /*    30 28        20 15    10 9 4
      sf 00 1101 0110 m  00001  1 n d  SDIV Rd,Rn,Rm
      sf 00 1101 0110 m  00001  0 n d  UDIV Rd,Rn,Rm
   */
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
       && INSN(15,11) == BITS5(0,0,0,0,1)) {
      Bool is64 = INSN(31,31) == 1;
      UInt mm   = INSN(20,16);
      Bool isS  = INSN(10,10) == 1;
      UInt nn   = INSN(9,5);
      UInt dd   = INSN(4,0);
      if (isS) {
         putIRegOrZR(is64, dd, binop(is64 ? Iop_DivS64 : Iop_DivS32,
                                     getIRegOrZR(is64, nn),
                                     getIRegOrZR(is64, mm)));
      } else {
         putIRegOrZR(is64, dd, binop(is64 ? Iop_DivU64 : Iop_DivU32,
                                     getIRegOrZR(is64, nn),
                                     getIRegOrZR(is64, mm)));
      }
      DIP("%cdiv %s, %s, %s\n", isS ? 's' : 'u',
          nameIRegOrZR(is64, dd),
          nameIRegOrZR(is64, nn), nameIRegOrZR(is64, mm));
      return True;
   }

   /* ------------------ {S,U}M{ADD,SUB}L ------------------ */
   /* 31        23  20 15 14 9 4
      1001 1011 101 m  0  a  n d   UMADDL Xd,Wn,Wm,Xa
      1001 1011 001 m  0  a  n d   SMADDL Xd,Wn,Wm,Xa
      1001 1011 101 m  1  a  n d   UMSUBL Xd,Wn,Wm,Xa
      1001 1011 001 m  1  a  n d   SMSUBL Xd,Wn,Wm,Xa
      with operation
         Xd = Xa +/- (Wn *u/s Wm)
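
      For example, "umaddl x0, w1, w2, x3" computes
         X0 = X3 + (ZeroExtend64(W1) * ZeroExtend64(W2)).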
   */
   if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) {
      Bool   isU   = INSN(23,23) == 1;
      UInt   mm    = INSN(20,16);
      Bool   isAdd = INSN(15,15) == 0;
      UInt   aa    = INSN(14,10);
      UInt   nn    = INSN(9,5);
      UInt   dd    = INSN(4,0);
      IRTemp wN    = newTemp(Ity_I32);
      IRTemp wM    = newTemp(Ity_I32);
      IRTemp xA    = newTemp(Ity_I64);
      IRTemp muld  = newTemp(Ity_I64);
      IRTemp res   = newTemp(Ity_I64);
      assign(wN, getIReg32orZR(nn));
      assign(wM, getIReg32orZR(mm));
      assign(xA, getIReg64orZR(aa));
      assign(muld, binop(isU ? Iop_MullU32 : Iop_MullS32,
                         mkexpr(wN), mkexpr(wM)));
      assign(res, binop(isAdd ? Iop_Add64 : Iop_Sub64,
                        mkexpr(xA), mkexpr(muld)));
      putIReg64orZR(dd, mkexpr(res));
      DIP("%cm%sl %s, %s, %s, %s\n", isU ? 'u' : 's', isAdd ? "add" : "sub",
          nameIReg64orZR(dd), nameIReg32orZR(nn),
          nameIReg32orZR(mm), nameIReg64orZR(aa));
      return True;
   }

   /* -------------------- CRC32/CRC32C -------------------- */
   /* 31 30           20 15   11 9 4
      sf 00 1101 0110 m  0100 sz n d   CRC32<sz>  Wd, Wn, Wm|Xm
      sf 00 1101 0110 m  0101 sz n d   CRC32C<sz> Wd, Wn, Wm|Xm
   */
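   /* Background (illustrative only): CRC32<sz> uses the IEEE 802.3
      polynomial 0x04C11DB7 and CRC32C<sz> the Castagnoli polynomial
      0x1EDC6F41, both in bit-reflected form.  A bit-at-a-time sketch
      of the per-byte step the helpers are assumed to implement
      (hypothetical helper, not part of VEX):

         UInt crc32_byte ( UInt acc, UChar b, UInt poly ) {
            acc ^= b;
            for (UInt i = 0; i < 8; i++)
               acc = (acc >> 1) ^ ((acc & 1) ? poly : 0);
            return acc;
         }

      with poly == 0xEDB88320 (0x04C11DB7 reflected) for CRC32 and
      0x82F63B78 (0x1EDC6F41 reflected) for CRC32C, applied to the
      data bytes in ascending address order. */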
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
       && INSN(15,13) == BITS3(0,1,0)) {
      UInt bitSF = INSN(31,31);
      UInt mm    = INSN(20,16);
      UInt bitC  = INSN(12,12);
      UInt sz    = INSN(11,10);
      UInt nn    = INSN(9,5);
      UInt dd    = INSN(4,0);
      vassert(sz <= 3);
      if ((bitSF == 0 && sz <= BITS2(1,0))
          || (bitSF == 1 && sz == BITS2(1,1))) {
         UInt ix = (bitC == 1 ? 4 : 0) | sz;
         void* helpers[8]
            = { &arm64g_calc_crc32b,   &arm64g_calc_crc32h,
                &arm64g_calc_crc32w,   &arm64g_calc_crc32x,
                &arm64g_calc_crc32cb,  &arm64g_calc_crc32ch,
                &arm64g_calc_crc32cw,  &arm64g_calc_crc32cx };
         const HChar* hNames[8]
            = { "arm64g_calc_crc32b",  "arm64g_calc_crc32h",
                "arm64g_calc_crc32w",  "arm64g_calc_crc32x",
                "arm64g_calc_crc32cb", "arm64g_calc_crc32ch",
                "arm64g_calc_crc32cw", "arm64g_calc_crc32cx" };
         const HChar* iNames[8]
            = { "crc32b",  "crc32h",  "crc32w",  "crc32x",
                "crc32cb", "crc32ch", "crc32cw", "crc32cx" };

         IRTemp srcN = newTemp(Ity_I64);
         assign(srcN, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));

         IRTemp  srcM = newTemp(Ity_I64);
         IRExpr* at64 = getIReg64orZR(mm);
         switch (sz) {
            case BITS2(0,0):
               assign(srcM, binop(Iop_And64, at64, mkU64(0xFF))); break;
            case BITS2(0,1):
               assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFF))); break;
            case BITS2(1,0):
               assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFFFFFF))); break;
            case BITS2(1,1):
               assign(srcM, at64); break;
            default:
               vassert(0);
         }

         vassert(ix <= 7);

         putIReg64orZR(
            dd,
            unop(Iop_32Uto64,
                 unop(Iop_64to32,
                      mkIRExprCCall(Ity_I64, 0/*regparm*/,
                                    hNames[ix], helpers[ix],
                                    mkIRExprVec_2(mkexpr(srcN),
                                                  mkexpr(srcM))))));

         DIP("%s %s, %s, %s\n", iNames[ix],
             nameIReg32orZR(dd),
             nameIReg32orZR(nn), nameIRegOrZR(bitSF == 1, mm));
         return True;
      }
      /* fall through */
   }

   vex_printf("ARM64 front end: data_processing_register\n");
   return False;
#  undef INSN
}


/*------------------------------------------------------------*/
/*--- Math helpers for vector interleave/deinterleave      ---*/
/*------------------------------------------------------------*/

#define EX(_tmp) \
           mkexpr(_tmp)
#define SL(_hi128,_lo128,_nbytes) \
           ( (_nbytes) == 0 \
                ? (_lo128) \
                : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
#define ROR(_v128,_nbytes) \
           SL((_v128),(_v128),(_nbytes))
#define ROL(_v128,_nbytes) \
           SL((_v128),(_v128),16-(_nbytes))
#define SHR(_v128,_nbytes) \
           binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
#define SHL(_v128,_nbytes) \
           binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
#define ILO64x2(_argL,_argR) \
           binop(Iop_InterleaveLO64x2,(_argL),(_argR))
#define IHI64x2(_argL,_argR) \
           binop(Iop_InterleaveHI64x2,(_argL),(_argR))
#define ILO32x4(_argL,_argR) \
           binop(Iop_InterleaveLO32x4,(_argL),(_argR))
#define IHI32x4(_argL,_argR) \
           binop(Iop_InterleaveHI32x4,(_argL),(_argR))
#define ILO16x8(_argL,_argR) \
           binop(Iop_InterleaveLO16x8,(_argL),(_argR))
#define IHI16x8(_argL,_argR) \
           binop(Iop_InterleaveHI16x8,(_argL),(_argR))
#define ILO8x16(_argL,_argR) \
           binop(Iop_InterleaveLO8x16,(_argL),(_argR))
#define IHI8x16(_argL,_argR) \
           binop(Iop_InterleaveHI8x16,(_argL),(_argR))
#define CEV32x4(_argL,_argR) \
           binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
#define COD32x4(_argL,_argR) \
           binop(Iop_CatOddLanes32x4,(_argL),(_argR))
#define COD16x8(_argL,_argR) \
           binop(Iop_CatOddLanes16x8,(_argL),(_argR))
#define COD8x16(_argL,_argR) \
           binop(Iop_CatOddLanes8x16,(_argL),(_argR))
#define CEV8x16(_argL,_argR) \
           binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
#define AND(_arg1,_arg2) \
           binop(Iop_AndV128,(_arg1),(_arg2))
#define OR2(_arg1,_arg2) \
           binop(Iop_OrV128,(_arg1),(_arg2))
#define OR3(_arg1,_arg2,_arg3) \
           binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
#define OR4(_arg1,_arg2,_arg3,_arg4) \
           binop(Iop_OrV128, \
                 binop(Iop_OrV128,(_arg1),(_arg2)), \
                 binop(Iop_OrV128,(_arg3),(_arg4)))
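
/* For example, if v has byte i equal to i (so v == 0x0F0E...0100),
   then ROR(v,2) == 0x01000F0E0D0C0B0A0908070605040302 -- the vector
   rotated right by two byte positions -- whereas SHR(v,2) would shift
   in zero bytes at the top instead. */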


/* Do interleaving for 1 128 bit vector, for ST1 insns. */
static
void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
                           UInt laneSzBlg2, IRTemp u0 )
{
   assign(*i0, mkexpr(u0));
}


/* Do interleaving for 2 128 bit vectors, for ST2 insns. */
static
void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
                           UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
{
   /* This is pretty easy, since we have primitives directly to
      hand. */
   if (laneSzBlg2 == 3) {
      // 64x2
      // u1 == B1 B0, u0 == A1 A0
      // i1 == B1 A1, i0 == B0 A0
      assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
      // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
      assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // u1 == B{7..0}, u0 == A{7..0}
      // i0 == B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == B7 A7 B6 A6 B5 A5 B4 A4
      assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // u1 == B{f..0}, u0 == A{f..0}
      // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
      assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}


/* Do interleaving for 3 128 bit vectors, for ST3 insns. */
static
void math_INTERLEAVE3_128(
        /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
      // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
      assign(*i2, IHI64x2( EX(u2), EX(u1) ));
      assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
      assign(*i0, ILO64x2( EX(u1), EX(u0) ));
      return;
   }

   if (laneSzBlg2 == 2) {
      // 32x4
      // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
      // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
      // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
      IRTemp p0    = newTempV128();
      IRTemp p1    = newTempV128();
      IRTemp p2    = newTempV128();
      IRTemp c1100 = newTempV128();
      IRTemp c0011 = newTempV128();
      IRTemp c0110 = newTempV128();
      assign(c1100, mkV128(0xFF00));
      assign(c0011, mkV128(0x00FF));
      assign(c0110, mkV128(0x0FF0));
      // First interleave them at 64x2 granularity,
      // generating partial ("p") values.
      math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
      // And more shuffling around for the final answer
      assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
                       AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
      assign(*i1, OR3( SHL(EX(p2),12),
                       AND(EX(p1),EX(c0110)),
                       SHR(EX(p0),12) ));
      assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
                       AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
      return;
   }

   if (laneSzBlg2 == 1) {
      // 16x8
      // u2 == C7 C6 C5 C4 C3 C2 C1 C0
      // u1 == B7 B6 B5 B4 B3 B2 B1 B0
      // u0 == A7 A6 A5 A4 A3 A2 A1 A0
      //
      // p2 == C7 C6 B7 B6 A7 A6 C5 C4
      // p1 == B5 B4 A5 A4 C3 C2 B3 B2
      // p0 == A3 A2 C1 C0 B1 B0 A1 A0
      //
      // i2 == C7 B7 A7 C6 B6 A6 C5 B5
      // i1 == A5 C4 B4 A4 C3 B3 A3 C2
      // i0 == B2 A2 C1 B1 A1 C0 B0 A0
      IRTemp p0    = newTempV128();
      IRTemp p1    = newTempV128();
      IRTemp p2    = newTempV128();
      IRTemp c1000 = newTempV128();
      IRTemp c0100 = newTempV128();
      IRTemp c0010 = newTempV128();
      IRTemp c0001 = newTempV128();
      assign(c1000, mkV128(0xF000));
      assign(c0100, mkV128(0x0F00));
      assign(c0010, mkV128(0x00F0));
      assign(c0001, mkV128(0x000F));
      // First interleave them at 32x4 granularity,
      // generating partial ("p") values.
      math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
      // And more shuffling around for the final answer
      assign(*i2,
             OR4( AND( IHI16x8( EX(p2),        ROL(EX(p2),4) ), EX(c1000) ),
                  AND( IHI16x8( ROL(EX(p2),6), EX(p2)        ), EX(c0100) ),
                  AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
                  AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
      ));
      assign(*i1,
             OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
                  AND( IHI16x8( EX(p1),        ROL(EX(p1),4) ), EX(c0100) ),
                  AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
                  AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
      ));
      assign(*i0,
             OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
                  AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
                  AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
                  AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
      ));
      return;
   }

   if (laneSzBlg2 == 0) {
      // 8x16.  It doesn't seem worth the hassle of first doing a
      // 16x8 interleave, so just generate all 24 partial results
      // directly :-(
      // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
      // i2 == Cf Bf Af Ce .. Bb Ab Ca
      // i1 == Ba Aa C9 B9 .. A6 C5 B5
      // i0 == A5 C4 B4 A4 .. C0 B0 A0

      IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
      IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
      IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
      IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
      IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
      IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
      IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
      IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
      IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();

      // e.g. XXXX(qqq, CC, 0xF, BB, 0xA) sets t_qqq to be a vector
      // of the form 14 bytes junk : CC[0xF] : BB[0xA]
      //
#     define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
         IRTemp t_##_tempName = newTempV128(); \
         assign(t_##_tempName, \
                ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
                         ROR(EX(_srcVec2),(_srcShift2)) ) )

      // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
      IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;

      // The slicing and reassembly are interleaved as much as
      // possible, so as to minimise the demand for registers in the
      // back end, which was observed to be a problem in testing.

      XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
      XXXX(AfCe, AA, 0xf, CC, 0xe);
      assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));

      XXXX(BeAe, BB, 0xe, AA, 0xe);
      XXXX(CdBd, CC, 0xd, BB, 0xd);
      assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
      assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));

      XXXX(AdCc, AA, 0xd, CC, 0xc);
      XXXX(BcAc, BB, 0xc, AA, 0xc);
      assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));

      XXXX(CbBb, CC, 0xb, BB, 0xb);
      XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0]
      assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
      assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
      assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));

      XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
      XXXX(C9B9, CC, 0x9, BB, 0x9);
      assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));

      XXXX(A9C8, AA, 0x9, CC, 0x8);
      XXXX(B8A8, BB, 0x8, AA, 0x8);
      assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
      assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));

      XXXX(C7B7, CC, 0x7, BB, 0x7);
      XXXX(A7C6, AA, 0x7, CC, 0x6);
      assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));

      XXXX(B6A6, BB, 0x6, AA, 0x6);
      XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
      assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
      assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
      assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));

      XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
      XXXX(B4A4, BB, 0x4, AA, 0x4);
      assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));

      XXXX(C3B3, CC, 0x3, BB, 0x3);
      XXXX(A3C2, AA, 0x3, CC, 0x2);
      assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
      assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));

      XXXX(B2A2, BB, 0x2, AA, 0x2);
      XXXX(C1B1, CC, 0x1, BB, 0x1);
      assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));

      XXXX(A1C0, AA, 0x1, CC, 0x0);
      XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
      assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
      assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
      assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));

#     undef XXXX
      return;
   }

   /*NOTREACHED*/
   vassert(0);
}


/* Do interleaving for 4 128 bit vectors, for ST4 insns. */
static
void math_INTERLEAVE4_128(
        /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      assign(*i0, ILO64x2(EX(u1), EX(u0)));
      assign(*i1, ILO64x2(EX(u3), EX(u2)));
      assign(*i2, IHI64x2(EX(u1), EX(u0)));
      assign(*i3, IHI64x2(EX(u3), EX(u2)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // First, interleave at the 64-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
      // And interleave (cat) at the 32 bit size.
      assign(*i0, CEV32x4(EX(p1), EX(p0)));
      assign(*i1, COD32x4(EX(p1), EX(p0)));
      assign(*i2, CEV32x4(EX(p3), EX(p2)));
      assign(*i3, COD32x4(EX(p3), EX(p2)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // First, interleave at the 32-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
      // And rearrange within each vector, to get the right 16 bit lanes.
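      // COD16x8(p, SHL(p,2)) gathers the even-numbered 16-bit lanes
      // of p into the low half of the result and the odd-numbered
      // lanes into the high half.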
      assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
      assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
      assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
      assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // First, interleave at the 16-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
      // And rearrange within each vector, to get the right 8 bit lanes.
      assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
      assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
      assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
      assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}


/* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
static
void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
                             UInt laneSzBlg2, IRTemp i0 )
{
   assign(*u0, mkexpr(i0));
}


/* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
static
void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
                             UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
{
   /* This is pretty easy, since we have primitives directly to
      hand. */
   if (laneSzBlg2 == 3) {
      // 64x2
      // i1 == B1 A1, i0 == B0 A0
      // u1 == B1 B0, u0 == A1 A0
      assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
      // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
      assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // i0 == B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == B7 A7 B6 A6 B5 A5 B4 A4
      // u1 == B{7..0}, u0 == A{7..0}
      assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes16x8,  mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
      // u1 == B{f..0}, u0 == A{f..0}
      assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes8x16,  mkexpr(i1), mkexpr(i0)));
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}


/* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
static
void math_DEINTERLEAVE3_128(
        /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
      // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
      assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1)        ));
      assign(*u1, ILO64x2( EX(i2),        ROL(EX(i0),8) ));
      assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0)        ));
      return;
   }

   if (laneSzBlg2 == 2) {
      // 32x4
      // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
      // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
      // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
      IRTemp t_a1c0b0a0 = newTempV128();
      IRTemp t_a2c1b1a1 = newTempV128();
      IRTemp t_a3c2b2a2 = newTempV128();
      IRTemp t_a0c3b3a3 = newTempV128();
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      // Compute some intermediate values.
      assign(t_a1c0b0a0, EX(i0));
      assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
      assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
      assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
      // First deinterleave into lane-pairs
      assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
      assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
                         IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
      assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
      // Then deinterleave at 64x2 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
      return;
   }

   if (laneSzBlg2 == 1) {
      // 16x8
      // u2 == C7 C6 C5 C4 C3 C2 C1 C0
      // u1 == B7 B6 B5 B4 B3 B2 B1 B0
      // u0 == A7 A6 A5 A4 A3 A2 A1 A0
      //
      // i2 == C7 B7 A7 C6 B6 A6 C5 B5
      // i1 == A5 C4 B4 A4 C3 B3 A3 C2
      // i0 == B2 A2 C1 B1 A1 C0 B0 A0
      //
      // p2 == C7 C6 B7 B6 A7 A6 C5 C4
      // p1 == B5 B4 A5 A4 C3 C2 B3 B2
      // p0 == A3 A2 C1 C0 B1 B0 A1 A0

      IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
      s0 = s1 = s2 = s3
         = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
      newTempsV128_4(&s0, &s1, &s2, &s3);
      newTempsV128_4(&t0, &t1, &t2, &t3);
      newTempsV128_4(&p0, &p1, &p2, &c00111111);

      // s0 == b2a2 c1b1a1 c0b0a0
      // s1 == b4a4 c3b3a3 c2b2a2
      // s2 == b6a6 c5b5a5 c4b4a4
      // s3 == b0a0 c7b7a7 c6b6a6
      assign(s0, EX(i0));
      assign(s1, SL(EX(i1),EX(i0),6*2));
      assign(s2, SL(EX(i2),EX(i1),4*2));
      assign(s3, SL(EX(i0),EX(i2),2*2));

      // t0 == 0 0 c1c0 b1b0 a1a0
      // t1 == 0 0 c3c2 b3b2 a3a2
      // t2 == 0 0 c5c4 b5b4 a5a4
      // t3 == 0 0 c7c6 b7b6 a7a6
      assign(c00111111, mkV128(0x0FFF));
      assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
      assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
      assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
      assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));

      assign(p0, OR2(EX(t0),          SHL(EX(t1),6*2)));
      assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
      assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));

      // Then deinterleave at 32x4 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
      return;
   }

   if (laneSzBlg2 == 0) {
      // 8x16.  This is the same scheme as for 16x8, with twice the
      // number of intermediate values.
      //
      // u2 == C{f..0}
      // u1 == B{f..0}
      // u0 == A{f..0}
      //
      // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
      // i1 ==  BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
      // i0 ==   A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
      //
      // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
      // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
      // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
      //
      IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
             t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
      s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
         = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
         = IRTemp_INVALID;
      newTempsV128_4(&s0, &s1, &s2, &s3);
      newTempsV128_4(&s4, &s5, &s6, &s7);
      newTempsV128_4(&t0, &t1, &t2, &t3);
      newTempsV128_4(&t4, &t5, &t6, &t7);
      newTempsV128_4(&p0, &p1, &p2, &cMASK);

      // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
      // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
      // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
      // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
      // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
      // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
      // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
      // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
      assign(s0, SL(EX(i1),EX(i0), 0));
      assign(s1, SL(EX(i1),EX(i0), 6));
      assign(s2, SL(EX(i1),EX(i0),12));
      assign(s3, SL(EX(i2),EX(i1), 2));
      assign(s4, SL(EX(i2),EX(i1), 8));
      assign(s5, SL(EX(i2),EX(i1),14));
      assign(s6, SL(EX(i0),EX(i2), 4));
      assign(s7, SL(EX(i0),EX(i2),10));

      // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
      // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
      // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
      // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
      // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
      // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
      // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
      // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
      assign(cMASK, mkV128(0x003F));
      assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
      assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
      assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
      assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
      assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
      assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
      assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
      assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));

      assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
      assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
                 SHL(EX(t3),2), SHR(EX(t2),4) ));
      assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));

      // Then deinterleave at 16x8 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
      return;
   }

   /*NOTREACHED*/
   vassert(0);
}


/* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
static
void math_DEINTERLEAVE4_128(
        /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      assign(*u0, ILO64x2(EX(i2), EX(i0)));
      assign(*u1, IHI64x2(EX(i2), EX(i0)));
      assign(*u2, ILO64x2(EX(i3), EX(i1)));
      assign(*u3, IHI64x2(EX(i3), EX(i1)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      IRTemp p0 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, ILO32x4(EX(i1), EX(i0)));
      assign(p1, IHI32x4(EX(i1), EX(i0)));
      assign(p2, ILO32x4(EX(i3), EX(i2)));
      assign(p3, IHI32x4(EX(i3), EX(i2)));
      // And now do what we did for the 64-bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // Deinterleave into 32-bit chunks, then do as the 32-bit case.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
      assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
      assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
      assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
      // From here on is like the 32 bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // Deinterleave into 16-bit chunks, then do as the 16-bit case.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
                          ILO8x16(EX(i0),ROL(EX(i0),4)) ));
      assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
                          ILO8x16(EX(i1),ROL(EX(i1),4)) ));
      assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
                          ILO8x16(EX(i2),ROL(EX(i2),4)) ));
      assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
                          ILO8x16(EX(i3),ROL(EX(i3),4)) ));
      // From here on is like the 16 bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}


/* Wrappers that use the full-width (de)interleavers to do half-width
   (de)interleaving.  The scheme is to clone each input lane in the
   lower half of each incoming value, do a full width (de)interleave
   at the next lane size up, and remove every other lane of the
   result.  The returned values may have any old junk in the upper
   64 bits -- the caller must ignore that. */
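
/* For example, to interleave two 8x8 (64-bit) vectors A and B with
   this scheme: InterleaveLO8x16(A,A) doubles up each lane of the low
   half (.. a1 a1 a0 a0), the doubled pair is interleaved at 16x8, and
   CatEvenLanes8x16 then drops the duplicates, leaving the wanted
   b3 a3 b2 a2 b1 a1 b0 a0 in the low 64 bits of the result. */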

/* Helper function -- get doubling and narrowing operations. */
static
void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
                                   /*OUT*/IROp* halver,
                                   UInt laneSzBlg2 )
{
   switch (laneSzBlg2) {
      case 2:
         *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
         break;
      case 1:
         *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
         break;
      case 0:
         *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
         break;
      default:
         vassert(0);
   }
}

/* Do interleaving for 1 64 bit vector, for ST1 insns. */
static
void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
                          UInt laneSzBlg2, IRTemp u0 )
{
   assign(*i0, mkexpr(u0));
}


/* Do interleaving for 2 64 bit vectors, for ST2 insns. */
static
void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
                          UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      return;
   }

   vassert(laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   assign(du0, binop(doubler, EX(u0), EX(u0)));
   assign(du1, binop(doubler, EX(u1), EX(u1)));
   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
   assign(*i0, binop(halver, EX(di0), EX(di0)));
   assign(*i1, binop(halver, EX(di1), EX(di1)));
}


/* Do interleaving for 3 64 bit vectors, for ST3 insns. */
static
void math_INTERLEAVE3_64(
        /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      assign(*i2, EX(u2));
      return;
   }

   vassert(laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   assign(du0, binop(doubler, EX(u0), EX(u0)));
   assign(du1, binop(doubler, EX(u1), EX(u1)));
   assign(du2, binop(doubler, EX(u2), EX(u2)));
   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
   assign(*i0, binop(halver, EX(di0), EX(di0)));
   assign(*i1, binop(halver, EX(di1), EX(di1)));
   assign(*i2, binop(halver, EX(di2), EX(di2)));
}


/* Do interleaving for 4 64 bit vectors, for ST4 insns. */
static
void math_INTERLEAVE4_64(
        /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      assign(*i2, EX(u2));
      assign(*i3, EX(u3));
      return;
   }

   vassert(laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   IRTemp du3 = newTempV128();
   assign(du0, binop(doubler, EX(u0), EX(u0)));
   assign(du1, binop(doubler, EX(u1), EX(u1)));
   assign(du2, binop(doubler, EX(u2), EX(u2)));
   assign(du3, binop(doubler, EX(u3), EX(u3)));
   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   IRTemp di3 = newTempV128();
   math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
                        laneSzBlg2 + 1, du0, du1, du2, du3);
   assign(*i0, binop(halver, EX(di0), EX(di0)));
   assign(*i1, binop(halver, EX(di1), EX(di1)));
   assign(*i2, binop(halver, EX(di2), EX(di2)));
   assign(*i3, binop(halver, EX(di3), EX(di3)));
}


/* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
static
void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
                            UInt laneSzBlg2, IRTemp i0 )
{
   assign(*u0, mkexpr(i0));
}


/* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
static
void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
                            UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*u0, EX(i0));
      assign(*u1, EX(i1));
      return;
   }

   vassert(laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   assign(di0, binop(doubler, EX(i0), EX(i0)));
   assign(di1, binop(doubler, EX(i1), EX(i1)));

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
   assign(*u0, binop(halver, EX(du0), EX(du0)));
   assign(*u1, binop(halver, EX(du1), EX(du1)));
}


/* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
static
void math_DEINTERLEAVE3_64(
        /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*u0, EX(i0));
      assign(*u1, EX(i1));
      assign(*u2, EX(i2));
      return;
   }

   vassert(laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   assign(di0, binop(doubler, EX(i0), EX(i0)));
   assign(di1, binop(doubler, EX(i1), EX(i1)));
   assign(di2, binop(doubler, EX(i2), EX(i2)));
   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
   assign(*u0, binop(halver, EX(du0), EX(du0)));
   assign(*u1, binop(halver, EX(du1), EX(du1)));
   assign(*u2, binop(halver, EX(du2), EX(du2)));
}


/* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
static
void math_DEINTERLEAVE4_64(
        /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*u0, EX(i0));
      assign(*u1, EX(i1));
      assign(*u2, EX(i2));
      assign(*u3, EX(i3));
      return;
   }

   vassert(laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   IRTemp di3 = newTempV128();
   assign(di0, binop(doubler, EX(i0), EX(i0)));
   assign(di1, binop(doubler, EX(i1), EX(i1)));
   assign(di2, binop(doubler, EX(i2), EX(i2)));
   assign(di3, binop(doubler, EX(i3), EX(i3)));
   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   IRTemp du3 = newTempV128();
   math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
                          laneSzBlg2 + 1, di0, di1, di2, di3);
   assign(*u0, binop(halver, EX(du0), EX(du0)));
   assign(*u1, binop(halver, EX(du1), EX(du1)));
   assign(*u2, binop(halver, EX(du2), EX(du2)));
   assign(*u3, binop(halver, EX(du3), EX(du3)));
}


#undef EX
#undef SL
#undef ROR
#undef ROL
#undef SHR
#undef SHL
#undef ILO64x2
#undef IHI64x2
#undef ILO32x4
#undef IHI32x4
#undef ILO16x8
#undef IHI16x8
#undef ILO8x16
#undef IHI8x16
#undef CEV32x4
#undef COD32x4
#undef COD16x8
#undef COD8x16
#undef CEV8x16
#undef AND
#undef OR2
#undef OR3
#undef OR4


/*------------------------------------------------------------*/
/*--- Load and Store instructions                          ---*/
/*------------------------------------------------------------*/

/* Generate the EA for a "reg + reg" style amode.  This is done from
   parts of the insn, but for sanity-checking's sake it takes the whole
   insn.  This appears to depend on insn[15:12], with opt=insn[15:13]
   and S=insn[12]:

   The possible forms, along with their opt:S values, are:
      011:0   Xn|SP + Xm
      111:0   Xn|SP + Xm
      011:1   Xn|SP + Xm * transfer_szB
      111:1   Xn|SP + Xm * transfer_szB
      010:0   Xn|SP + 32Uto64(Wm)
      010:1   Xn|SP + 32Uto64(Wm) * transfer_szB
      110:0   Xn|SP + 32Sto64(Wm)
      110:1   Xn|SP + 32Sto64(Wm) * transfer_szB

   Rm is insn[20:16].  Rn is insn[9:5].  Rt is insn[4:0].  Log2 of
   the transfer size is insn[23,31,30].  For integer loads/stores,
   insn[23] is zero, hence szLg2 can be at most 3 in such cases.

   If the decoding fails, it returns IRTemp_INVALID.

   isInt is True iff this decoding is for transfers to/from integer
   registers.  If False it is for transfers to/from vector registers.
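
   For example, "ldr x0, [x1, w2, sxtw #3]" is the 110:1 form: the EA
   is X1 + (32Sto64(W2) << 3).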
*/
static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt )
{
   UInt    optS  = SLICE_UInt(insn, 15, 12);
   UInt    mm    = SLICE_UInt(insn, 20, 16);
   UInt    nn    = SLICE_UInt(insn, 9, 5);
   UInt    szLg2 = (isInt ? 0 : (SLICE_UInt(insn, 23, 23) << 2))
                   | SLICE_UInt(insn, 31, 30); // Log2 of the size

   buf[0] = 0;

   /* Sanity checks, that this really is a load/store insn. */
   if (SLICE_UInt(insn, 11, 10) != BITS2(1,0))
      goto fail;

   if (isInt
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,1,1)/*LDR*/
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,0,1)/*STR*/
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,0,1)/*LDRSbhw Xt*/
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,1,1))/*LDRSbhw Wt*/
      goto fail;

   if (!isInt
       && SLICE_UInt(insn, 29, 24) != BITS6(1,1,1,1,0,0)) /*LDR/STR*/
      goto fail;

   /* Throw out non-verified but possibly valid cases. */
   switch (szLg2) {
      case BITS3(0,0,0): break; //  8 bit, valid for both int and vec
      case BITS3(0,0,1): break; // 16 bit, valid for both int and vec
      case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
      case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
      case BITS3(1,0,0): // can only ever be valid for the vector case
                         if (isInt) goto fail; else break;
      case BITS3(1,0,1): // these sizes are never valid
      case BITS3(1,1,0):
      case BITS3(1,1,1): goto fail;

      default: vassert(0);
   }

   IRExpr* rhs  = NULL;
   switch (optS) {
      case BITS4(1,1,1,0): goto fail; //ATC
      case BITS4(0,1,1,0):
         rhs = getIReg64orZR(mm);
         vex_sprintf(buf, "[%s, %s]",
                     nameIReg64orZR(nn), nameIReg64orZR(mm));
         break;
      case BITS4(1,1,1,1): goto fail; //ATC
      case BITS4(0,1,1,1):
         rhs = binop(Iop_Shl64, getIReg64orZR(mm), mkU8(szLg2));
         vex_sprintf(buf, "[%s, %s lsl %u]",
                     nameIReg64orZR(nn), nameIReg64orZR(mm), szLg2);
         break;
      case BITS4(0,1,0,0):
         rhs = unop(Iop_32Uto64, getIReg32orZR(mm));
4608         vex_sprintf(buf, "[%s, %s uxtx]",
4609                     nameIReg64orZR(nn), nameIReg32orZR(mm));
4610         break;
4611      case BITS4(0,1,0,1):
4612         rhs = binop(Iop_Shl64,
4613                     unop(Iop_32Uto64, getIReg32orZR(mm)), mkU8(szLg2));
4614         vex_sprintf(buf, "[%s, %s uxtx, lsl %u]",
4615                     nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4616         break;
4617      case BITS4(1,1,0,0):
4618         rhs = unop(Iop_32Sto64, getIReg32orZR(mm));
4619         vex_sprintf(buf, "[%s, %s sxtx]",
4620                     nameIReg64orZR(nn), nameIReg32orZR(mm));
4621         break;
4622      case BITS4(1,1,0,1):
4623         rhs = binop(Iop_Shl64,
4624                     unop(Iop_32Sto64, getIReg32orZR(mm)), mkU8(szLg2));
4625         vex_sprintf(buf, "[%s, %s sxtx, lsl %u]",
4626                     nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4627         break;
4628      default:
4629         /* The rest appear to be genuinely invalid */
4630         goto fail;
4631   }
4632
4633   vassert(rhs);
4634   IRTemp res = newTemp(Ity_I64);
4635   assign(res, binop(Iop_Add64, getIReg64orSP(nn), rhs));
4636   return res;
4637
4638  fail:
4639   vex_printf("gen_indexed_EA: unhandled case optS == 0x%x\n", optS);
4640   return IRTemp_INVALID;
4641}
4642
4643
4644/* Generate an 8/16/32/64 bit integer store to ADDR for the lowest
4645   bits of DATAE :: Ity_I64. */
4646static void gen_narrowing_store ( UInt szB, IRTemp addr, IRExpr* dataE )
4647{
4648   IRExpr* addrE = mkexpr(addr);
4649   switch (szB) {
4650      case 8:
4651         storeLE(addrE, dataE);
4652         break;
4653      case 4:
4654         storeLE(addrE, unop(Iop_64to32, dataE));
4655         break;
4656      case 2:
4657         storeLE(addrE, unop(Iop_64to16, dataE));
4658         break;
4659      case 1:
4660         storeLE(addrE, unop(Iop_64to8, dataE));
4661         break;
4662      default:
4663         vassert(0);
4664   }
4665}
4666
4667
4668/* Generate an 8/16/32/64 bit unsigned widening load from ADDR,
4669   placing the result in an Ity_I64 temporary. */
4670static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr )
4671{
4672   IRTemp  res   = newTemp(Ity_I64);
4673   IRExpr* addrE = mkexpr(addr);
4674   switch (szB) {
4675      case 8:
4676         assign(res, loadLE(Ity_I64,addrE));
4677         break;
4678      case 4:
4679         assign(res, unop(Iop_32Uto64, loadLE(Ity_I32,addrE)));
4680         break;
4681      case 2:
4682         assign(res, unop(Iop_16Uto64, loadLE(Ity_I16,addrE)));
4683         break;
4684      case 1:
4685         assign(res, unop(Iop_8Uto64, loadLE(Ity_I8,addrE)));
4686         break;
4687      default:
4688         vassert(0);
4689   }
4690   return res;
4691}
4692
4693
4694/* Generate a "standard 7" name, from bitQ and size.  But also
4695   allow ".1d" since that's occasionally useful. */
4696static
4697const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
4698{
4699   vassert(bitQ <= 1 && size <= 3);
4700   const HChar* nms[8]
4701      = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
4702   UInt ix = (bitQ << 2) | size;
4703   vassert(ix < 8);
4704   return nms[ix];
4705}
4706
4707
4708static
4709Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
4710                          const VexAbiInfo* abiinfo
4711)
4712{
4713#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
4714
4715   /* ------------ LDR,STR (immediate, uimm12) ----------- */
4716   /* uimm12 is scaled by the transfer size
4717
4718      31 29  26    21    9  4
4719      |  |   |     |     |  |
4720      11 111 00100 imm12 nn tt    STR  Xt, [Xn|SP, #imm12 * 8]
4721      11 111 00101 imm12 nn tt    LDR  Xt, [Xn|SP, #imm12 * 8]
4722
4723      10 111 00100 imm12 nn tt    STR  Wt, [Xn|SP, #imm12 * 4]
4724      10 111 00101 imm12 nn tt    LDR  Wt, [Xn|SP, #imm12 * 4]
4725
4726      01 111 00100 imm12 nn tt    STRH Wt, [Xn|SP, #imm12 * 2]
4727      01 111 00101 imm12 nn tt    LDRH Wt, [Xn|SP, #imm12 * 2]
4728
4729      00 111 00100 imm12 nn tt    STRB Wt, [Xn|SP, #imm12 * 1]
4730      00 111 00101 imm12 nn tt    LDRB Wt, [Xn|SP, #imm12 * 1]
4731   */
4732   if (INSN(29,23) == BITS7(1,1,1,0,0,1,0)) {
4733      UInt   szLg2 = INSN(31,30);
4734      UInt   szB   = 1 << szLg2;
4735      Bool   isLD  = INSN(22,22) == 1;
4736      UInt   offs  = INSN(21,10) * szB;
4737      UInt   nn    = INSN(9,5);
4738      UInt   tt    = INSN(4,0);
4739      IRTemp ta    = newTemp(Ity_I64);
4740      assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs)));
4741      if (nn == 31) { /* FIXME generate stack alignment check */ }
4742      vassert(szLg2 < 4);
4743      if (isLD) {
4744         putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
4745      } else {
4746         gen_narrowing_store(szB, ta, getIReg64orZR(tt));
4747      }
4748      const HChar* ld_name[4] = { "ldrb", "ldrh", "ldr", "ldr" };
4749      const HChar* st_name[4] = { "strb", "strh", "str", "str" };
4750      DIP("%s %s, [%s, #%u]\n",
4751          (isLD ? ld_name : st_name)[szLg2], nameIRegOrZR(szB == 8, tt),
4752          nameIReg64orSP(nn), offs);
4753      return True;
4754   }
4755
4756   /* ------------ LDUR,STUR (immediate, simm9) ----------- */
4757   /*
4758      31 29  26      20   11 9  4
4759      |  |   |       |    |  |  |
4760      (at-Rn-then-Rn=EA)  |  |  |
4761      sz 111 00000 0 imm9 01 Rn Rt   STR Rt, [Xn|SP], #simm9
4762      sz 111 00001 0 imm9 01 Rn Rt   LDR Rt, [Xn|SP], #simm9
4763
4764      (at-EA-then-Rn=EA)
4765      sz 111 00000 0 imm9 11 Rn Rt   STR Rt, [Xn|SP, #simm9]!
4766      sz 111 00001 0 imm9 11 Rn Rt   LDR Rt, [Xn|SP, #simm9]!
4767
4768      (at-EA)
4769      sz 111 00000 0 imm9 00 Rn Rt   STR Rt, [Xn|SP, #simm9]
4770      sz 111 00001 0 imm9 00 Rn Rt   LDR Rt, [Xn|SP, #simm9]
4771
4772      simm9 is unscaled.
4773
      The case 'wback && Rn == Rt && Rt != 31' is disallowed.  In the
      load case this is because it would create two competing values
      for Rt.  In the store case the reason is unclear, but the spec
      disallows it anyway.
4778
4779      Stores are narrowing, loads are unsigned widening.  sz encodes
4780      the transfer size in the normal way: 00=1, 01=2, 10=4, 11=8.
4781   */
4782   if ((INSN(29,21) & BITS9(1,1,1, 1,1,1,1,0, 1))
4783       == BITS9(1,1,1, 0,0,0,0,0, 0)) {
4784      UInt szLg2  = INSN(31,30);
4785      UInt szB    = 1 << szLg2;
4786      Bool isLoad = INSN(22,22) == 1;
4787      UInt imm9   = INSN(20,12);
4788      UInt nn     = INSN(9,5);
4789      UInt tt     = INSN(4,0);
4790      Bool wBack  = INSN(10,10) == 1;
4791      UInt how    = INSN(11,10);
4792      if (how == BITS2(1,0) || (wBack && nn == tt && tt != 31)) {
4793         /* undecodable; fall through */
4794      } else {
4795         if (nn == 31) { /* FIXME generate stack alignment check */ }
4796
4797         // Compute the transfer address TA and the writeback address WA.
4798         IRTemp tRN = newTemp(Ity_I64);
4799         assign(tRN, getIReg64orSP(nn));
4800         IRTemp tEA = newTemp(Ity_I64);
4801         Long simm9 = (Long)sx_to_64(imm9, 9);
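         // e.g. imm9 == 0x1F0 sign-extends to -16, as in
         // "str x30, [sp, #-16]!"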
4802         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
4803
4804         IRTemp tTA = newTemp(Ity_I64);
4805         IRTemp tWA = newTemp(Ity_I64);
4806         switch (how) {
4807            case BITS2(0,1):
4808               assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
4809            case BITS2(1,1):
4810               assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
4811            case BITS2(0,0):
4812               assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
4813            default:
4814               vassert(0); /* NOTREACHED */
4815         }
4816
4817         /* Normally rN would be updated after the transfer.  However, in
            the special case typified by
4819               str x30, [sp,#-16]!
4820            it is necessary to update SP before the transfer, (1)
4821            because Memcheck will otherwise complain about a write
4822            below the stack pointer, and (2) because the segfault
4823            stack extension mechanism will otherwise extend the stack
4824            only down to SP before the instruction, which might not be
            far enough, if the -16 offset takes the actual access
4826            address to the next page.
4827         */
4828         Bool earlyWBack
4829           = wBack && simm9 < 0 && szB == 8
4830             && how == BITS2(1,1) && nn == 31 && !isLoad && tt != nn;
4831
4832         if (wBack && earlyWBack)
4833            putIReg64orSP(nn, mkexpr(tEA));
4834
4835         if (isLoad) {
4836            putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, tTA)));
4837         } else {
4838            gen_narrowing_store(szB, tTA, getIReg64orZR(tt));
4839         }
4840
4841         if (wBack && !earlyWBack)
4842            putIReg64orSP(nn, mkexpr(tEA));
4843
4844         const HChar* ld_name[4] = { "ldurb", "ldurh", "ldur", "ldur" };
4845         const HChar* st_name[4] = { "sturb", "sturh", "stur", "stur" };
4846         const HChar* fmt_str = NULL;
4847         switch (how) {
4848            case BITS2(0,1):
4849               fmt_str = "%s %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
4850               break;
4851            case BITS2(1,1):
4852               fmt_str = "%s %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
4853               break;
4854            case BITS2(0,0):
4855               fmt_str = "%s %s, [%s, #%lld] (at-Rn)\n";
4856               break;
4857            default:
4858               vassert(0);
4859         }
4860         DIP(fmt_str, (isLoad ? ld_name : st_name)[szLg2],
4861                      nameIRegOrZR(szB == 8, tt),
4862                      nameIReg64orSP(nn), simm9);
4863         return True;
4864      }
4865   }
4866
4867   /* -------- LDP,STP (immediate, simm7) (INT REGS) -------- */
4868   /* L==1 => mm==LD
4869      L==0 => mm==ST
4870      x==0 => 32 bit transfers, and zero extended loads
4871      x==1 => 64 bit transfers
4872      simm7 is scaled by the (single-register) transfer size
4873
4874      (at-Rn-then-Rn=EA)
4875      x0 101 0001 L imm7 Rt2 Rn Rt1  mmP Rt1,Rt2, [Xn|SP], #imm
4876
4877      (at-EA-then-Rn=EA)
4878      x0 101 0011 L imm7 Rt2 Rn Rt1  mmP Rt1,Rt2, [Xn|SP, #imm]!
4879
4880      (at-EA)
4881      x0 101 0010 L imm7 Rt2 Rn Rt1  mmP Rt1,Rt2, [Xn|SP, #imm]
4882   */
4883   UInt insn_30_23 = INSN(30,23);
4884   if (insn_30_23 == BITS8(0,1,0,1,0,0,0,1)
4885       || insn_30_23 == BITS8(0,1,0,1,0,0,1,1)
4886       || insn_30_23 == BITS8(0,1,0,1,0,0,1,0)) {
4887      UInt bL     = INSN(22,22);
4888      UInt bX     = INSN(31,31);
4889      UInt bWBack = INSN(23,23);
4890      UInt rT1    = INSN(4,0);
4891      UInt rN     = INSN(9,5);
4892      UInt rT2    = INSN(14,10);
4893      Long simm7  = (Long)sx_to_64(INSN(21,15), 7);
4894      if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
4895          || (bL && rT1 == rT2)) {
4896         /* undecodable; fall through */
4897      } else {
4898         if (rN == 31) { /* FIXME generate stack alignment check */ }
4899
4900         // Compute the transfer address TA and the writeback address WA.
4901         IRTemp tRN = newTemp(Ity_I64);
4902         assign(tRN, getIReg64orSP(rN));
4903         IRTemp tEA = newTemp(Ity_I64);
4904         simm7 = (bX ? 8 : 4) * simm7;
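         // e.g. "stp x29, x30, [sp, #-112]!" encodes imm7 == -14;
         // bX == 1 scales it by 8, giving the byte offset -112.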
4905         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
4906
4907         IRTemp tTA = newTemp(Ity_I64);
4908         IRTemp tWA = newTemp(Ity_I64);
4909         switch (INSN(24,23)) {
4910            case BITS2(0,1):
4911               assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
4912            case BITS2(1,1):
4913               assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
4914            case BITS2(1,0):
4915               assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
4916            default:
4917               vassert(0); /* NOTREACHED */
4918         }
4919
4920         /* Normally rN would be updated after the transfer.  However, in
            the special case typified by
4922               stp x29, x30, [sp,#-112]!
4923            it is necessary to update SP before the transfer, (1)
4924            because Memcheck will otherwise complain about a write
4925            below the stack pointer, and (2) because the segfault
4926            stack extension mechanism will otherwise extend the stack
4927            only down to SP before the instruction, which might not be
            far enough, if the -112 offset takes the actual access
4929            address to the next page.
4930         */
4931         Bool earlyWBack
4932           = bWBack && simm7 < 0
4933             && INSN(24,23) == BITS2(1,1) && rN == 31 && bL == 0;
4934
4935         if (bWBack && earlyWBack)
4936            putIReg64orSP(rN, mkexpr(tEA));
4937
4938         /**/ if (bL == 1 && bX == 1) {
4939            // 64 bit load
4940            putIReg64orZR(rT1, loadLE(Ity_I64,
4941                                      binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
4942            putIReg64orZR(rT2, loadLE(Ity_I64,
4943                                      binop(Iop_Add64,mkexpr(tTA),mkU64(8))));
4944         } else if (bL == 1 && bX == 0) {
4945            // 32 bit load
4946            putIReg32orZR(rT1, loadLE(Ity_I32,
4947                                      binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
4948            putIReg32orZR(rT2, loadLE(Ity_I32,
4949                                      binop(Iop_Add64,mkexpr(tTA),mkU64(4))));
4950         } else if (bL == 0 && bX == 1) {
4951            // 64 bit store
4952            storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
4953                    getIReg64orZR(rT1));
4954            storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(8)),
4955                    getIReg64orZR(rT2));
4956         } else {
4957            vassert(bL == 0 && bX == 0);
4958            // 32 bit store
4959            storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
4960                    getIReg32orZR(rT1));
4961            storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(4)),
4962                    getIReg32orZR(rT2));
4963         }
4964
4965         if (bWBack && !earlyWBack)
4966            putIReg64orSP(rN, mkexpr(tEA));
4967
4968         const HChar* fmt_str = NULL;
4969         switch (INSN(24,23)) {
4970            case BITS2(0,1):
4971               fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
4972               break;
4973            case BITS2(1,1):
4974               fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
4975               break;
4976            case BITS2(1,0):
4977               fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
4978               break;
4979            default:
4980               vassert(0);
4981         }
4982         DIP(fmt_str, bL == 0 ? "st" : "ld",
4983                      nameIRegOrZR(bX == 1, rT1),
4984                      nameIRegOrZR(bX == 1, rT2),
4985                      nameIReg64orSP(rN), simm7);
4986         return True;
4987      }
4988   }
4989
4990   /* -------- LDPSW (immediate, simm7) (INT REGS) -------- */
4991   /* Does 32 bit transfers which are sign extended to 64 bits.
4992      simm7 is scaled by the (single-register) transfer size
4993
4994      (at-Rn-then-Rn=EA)
4995      01 101 0001 1 imm7 Rt2 Rn Rt1  LDPSW Rt1,Rt2, [Xn|SP], #imm
4996
4997      (at-EA-then-Rn=EA)
4998      01 101 0011 1 imm7 Rt2 Rn Rt1  LDPSW Rt1,Rt2, [Xn|SP, #imm]!
4999
5000      (at-EA)
5001      01 101 0010 1 imm7 Rt2 Rn Rt1  LDPSW Rt1,Rt2, [Xn|SP, #imm]
5002   */
5003   UInt insn_31_22 = INSN(31,22);
5004   if (insn_31_22 == BITS10(0,1,1,0,1,0,0,0,1,1)
5005       || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,1,1)
5006       || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,0,1)) {
5007      UInt bWBack = INSN(23,23);
5008      UInt rT1    = INSN(4,0);
5009      UInt rN     = INSN(9,5);
5010      UInt rT2    = INSN(14,10);
5011      Long simm7  = (Long)sx_to_64(INSN(21,15), 7);
5012      if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
5013          || (rT1 == rT2)) {
5014         /* undecodable; fall through */
5015      } else {
5016         if (rN == 31) { /* FIXME generate stack alignment check */ }
5017
5018         // Compute the transfer address TA and the writeback address WA.
5019         IRTemp tRN = newTemp(Ity_I64);
5020         assign(tRN, getIReg64orSP(rN));
5021         IRTemp tEA = newTemp(Ity_I64);
5022         simm7 = 4 * simm7;
5023         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5024
5025         IRTemp tTA = newTemp(Ity_I64);
5026         IRTemp tWA = newTemp(Ity_I64);
5027         switch (INSN(24,23)) {
5028            case BITS2(0,1):
5029               assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5030            case BITS2(1,1):
5031               assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5032            case BITS2(1,0):
5033               assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5034            default:
5035               vassert(0); /* NOTREACHED */
5036         }
5037
5038         // 32 bit load, sign extended to 64 bits
5039         putIReg64orZR(rT1, unop(Iop_32Sto64,
5040                                 loadLE(Ity_I32, binop(Iop_Add64,
5041                                                       mkexpr(tTA),
5042                                                       mkU64(0)))));
5043         putIReg64orZR(rT2, unop(Iop_32Sto64,
5044                                 loadLE(Ity_I32, binop(Iop_Add64,
5045                                                       mkexpr(tTA),
5046                                                       mkU64(4)))));
5047         if (bWBack)
5048            putIReg64orSP(rN, mkexpr(tEA));
5049
5050         const HChar* fmt_str = NULL;
5051         switch (INSN(24,23)) {
5052            case BITS2(0,1):
5053               fmt_str = "ldpsw %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5054               break;
5055            case BITS2(1,1):
5056               fmt_str = "ldpsw %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5057               break;
5058            case BITS2(1,0):
5059               fmt_str = "ldpsw %s, %s, [%s, #%lld] (at-Rn)\n";
5060               break;
5061            default:
5062               vassert(0);
5063         }
5064         DIP(fmt_str, nameIReg64orZR(rT1),
5065                      nameIReg64orZR(rT2),
5066                      nameIReg64orSP(rN), simm7);
5067         return True;
5068      }
5069   }
5070
5071   /* ---------------- LDR (literal, int reg) ---------------- */
5072   /* 31 29      23    4
5073      00 011 000 imm19 Rt   LDR   Wt, [PC + sxTo64(imm19 << 2)]
5074      01 011 000 imm19 Rt   LDR   Xt, [PC + sxTo64(imm19 << 2)]
5075      10 011 000 imm19 Rt   LDRSW Xt, [PC + sxTo64(imm19 << 2)]
5076      11 011 000 imm19 Rt   prefetch  [PC + sxTo64(imm19 << 2)]
5077      Just handles the first two cases for now.
5078   */
5079   if (INSN(29,24) == BITS6(0,1,1,0,0,0) && INSN(31,31) == 0) {
5080      UInt  imm19 = INSN(23,5);
5081      UInt  rT    = INSN(4,0);
5082      UInt  bX    = INSN(30,30);
5083      ULong ea    = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
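      // The 21-bit sign extension happens after the shift by 2, so
      // e.g. imm19 == 0x7FFFF yields a PC-relative offset of -4.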
5084      if (bX) {
5085         putIReg64orZR(rT, loadLE(Ity_I64, mkU64(ea)));
5086      } else {
5087         putIReg32orZR(rT, loadLE(Ity_I32, mkU64(ea)));
5088      }
5089      DIP("ldr %s, 0x%llx (literal)\n", nameIRegOrZR(bX == 1, rT), ea);
5090      return True;
5091   }
5092
5093   /* -------------- {LD,ST}R (integer register) --------------- */
5094   /* 31 29        20 15     12 11 9  4
5095      |  |         |  |      |  |  |  |
5096      11 111000011 Rm option S  10 Rn Rt  LDR  Xt, [Xn|SP, R<m>{ext/sh}]
5097      10 111000011 Rm option S  10 Rn Rt  LDR  Wt, [Xn|SP, R<m>{ext/sh}]
5098      01 111000011 Rm option S  10 Rn Rt  LDRH Wt, [Xn|SP, R<m>{ext/sh}]
5099      00 111000011 Rm option S  10 Rn Rt  LDRB Wt, [Xn|SP, R<m>{ext/sh}]
5100
5101      11 111000001 Rm option S  10 Rn Rt  STR  Xt, [Xn|SP, R<m>{ext/sh}]
5102      10 111000001 Rm option S  10 Rn Rt  STR  Wt, [Xn|SP, R<m>{ext/sh}]
5103      01 111000001 Rm option S  10 Rn Rt  STRH Wt, [Xn|SP, R<m>{ext/sh}]
5104      00 111000001 Rm option S  10 Rn Rt  STRB Wt, [Xn|SP, R<m>{ext/sh}]
5105   */
5106   if (INSN(29,23) == BITS7(1,1,1,0,0,0,0)
5107       && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5108      HChar  dis_buf[64];
5109      UInt   szLg2 = INSN(31,30);
5110      Bool   isLD  = INSN(22,22) == 1;
5111      UInt   tt    = INSN(4,0);
5112      IRTemp ea    = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5113      if (ea != IRTemp_INVALID) {
5114         switch (szLg2) {
5115            case 3: /* 64 bit */
5116               if (isLD) {
5117                  putIReg64orZR(tt, loadLE(Ity_I64, mkexpr(ea)));
5118                  DIP("ldr %s, %s\n", nameIReg64orZR(tt), dis_buf);
5119               } else {
5120                  storeLE(mkexpr(ea), getIReg64orZR(tt));
5121                  DIP("str %s, %s\n", nameIReg64orZR(tt), dis_buf);
5122               }
5123               break;
5124            case 2: /* 32 bit */
5125               if (isLD) {
5126                  putIReg32orZR(tt, loadLE(Ity_I32, mkexpr(ea)));
5127                  DIP("ldr %s, %s\n", nameIReg32orZR(tt), dis_buf);
5128               } else {
5129                  storeLE(mkexpr(ea), getIReg32orZR(tt));
5130                  DIP("str %s, %s\n", nameIReg32orZR(tt), dis_buf);
5131               }
5132               break;
5133            case 1: /* 16 bit */
5134               if (isLD) {
5135                  putIReg64orZR(tt, unop(Iop_16Uto64,
5136                                         loadLE(Ity_I16, mkexpr(ea))));
5137                  DIP("ldruh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5138               } else {
5139                  storeLE(mkexpr(ea), unop(Iop_64to16, getIReg64orZR(tt)));
5140                  DIP("strh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5141               }
5142               break;
5143            case 0: /* 8 bit */
5144               if (isLD) {
5145                  putIReg64orZR(tt, unop(Iop_8Uto64,
5146                                         loadLE(Ity_I8, mkexpr(ea))));
5147                  DIP("ldrub %s, %s\n", nameIReg32orZR(tt), dis_buf);
5148               } else {
5149                  storeLE(mkexpr(ea), unop(Iop_64to8, getIReg64orZR(tt)));
5150                  DIP("strb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5151               }
5152               break;
5153            default:
5154               vassert(0);
5155         }
5156         return True;
5157      }
5158   }
5159
5160   /* -------------- LDRS{B,H,W} (uimm12) -------------- */
5161   /* 31 29  26  23 21    9 4
5162      10 111 001 10 imm12 n t   LDRSW Xt, [Xn|SP, #pimm12 * 4]
5163      01 111 001 1x imm12 n t   LDRSH Rt, [Xn|SP, #pimm12 * 2]
5164      00 111 001 1x imm12 n t   LDRSB Rt, [Xn|SP, #pimm12 * 1]
5165      where
5166         Rt is Wt when x==1, Xt when x==0
5167   */
5168   if (INSN(29,23) == BITS7(1,1,1,0,0,1,1)) {
5169      /* Further checks on bits 31:30 and 22 */
5170      Bool valid = False;
5171      switch ((INSN(31,30) << 1) | INSN(22,22)) {
         case BITS3(1,0,0):                    // LDRSW Xt
         case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt
         case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt
5175            valid = True;
5176            break;
5177      }
5178      if (valid) {
5179         UInt    szLg2 = INSN(31,30);
5180         UInt    bitX  = INSN(22,22);
5181         UInt    imm12 = INSN(21,10);
5182         UInt    nn    = INSN(9,5);
5183         UInt    tt    = INSN(4,0);
5184         UInt    szB   = 1 << szLg2;
5185         IRExpr* ea    = binop(Iop_Add64,
5186                               getIReg64orSP(nn), mkU64(imm12 * szB));
5187         switch (szB) {
5188            case 4:
5189               vassert(bitX == 0);
5190               putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, ea)));
5191               DIP("ldrsw %s, [%s, #%u]\n", nameIReg64orZR(tt),
5192                   nameIReg64orSP(nn), imm12 * szB);
5193               break;
5194            case 2:
5195               if (bitX == 1) {
5196                  putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, ea)));
5197               } else {
5198                  putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, ea)));
5199               }
5200               DIP("ldrsh %s, [%s, #%u]\n",
5201                   nameIRegOrZR(bitX == 0, tt),
5202                   nameIReg64orSP(nn), imm12 * szB);
5203               break;
5204            case 1:
5205               if (bitX == 1) {
5206                  putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, ea)));
5207               } else {
5208                  putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, ea)));
5209               }
5210               DIP("ldrsb %s, [%s, #%u]\n",
5211                   nameIRegOrZR(bitX == 0, tt),
5212                   nameIReg64orSP(nn), imm12 * szB);
5213               break;
5214            default:
5215               vassert(0);
5216         }
5217         return True;
5218      }
5219      /* else fall through */
5220   }
5221
5222   /* -------------- LDRS{B,H,W} (simm9, upd) -------------- */
5223   /* (at-Rn-then-Rn=EA)
5224      31 29      23 21 20   11 9 4
5225      00 111 000 1x 0  imm9 01 n t  LDRSB Rt, [Xn|SP], #simm9
5226      01 111 000 1x 0  imm9 01 n t  LDRSH Rt, [Xn|SP], #simm9
5227      10 111 000 10 0  imm9 01 n t  LDRSW Xt, [Xn|SP], #simm9
5228
5229      (at-EA-then-Rn=EA)
5230      00 111 000 1x 0  imm9 11 n t  LDRSB Rt, [Xn|SP, #simm9]!
5231      01 111 000 1x 0  imm9 11 n t  LDRSH Rt, [Xn|SP, #simm9]!
5232      10 111 000 10 0  imm9 11 n t  LDRSW Xt, [Xn|SP, #simm9]!
5233      where
5234         Rt is Wt when x==1, Xt when x==0
5235         transfer-at-Rn when [11]==0, at EA when [11]==1
5236   */
5237   if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5238       && INSN(21,21) == 0 && INSN(10,10) == 1) {
5239      /* Further checks on bits 31:30 and 22 */
5240      Bool valid = False;
5241      switch ((INSN(31,30) << 1) | INSN(22,22)) {
5242         case BITS3(1,0,0):                    // LDRSW Xt
5243         case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt
5244         case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt
5245            valid = True;
5246            break;
5247      }
5248      if (valid) {
5249         UInt   szLg2 = INSN(31,30);
5250         UInt   imm9  = INSN(20,12);
5251         Bool   atRN  = INSN(11,11) == 0;
5252         UInt   nn    = INSN(9,5);
5253         UInt   tt    = INSN(4,0);
5254         IRTemp tRN   = newTemp(Ity_I64);
5255         IRTemp tEA   = newTemp(Ity_I64);
5256         IRTemp tTA   = IRTemp_INVALID;
5257         ULong  simm9 = sx_to_64(imm9, 9);
5258         Bool   is64  = INSN(22,22) == 0;
5259         assign(tRN, getIReg64orSP(nn));
5260         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5261         tTA = atRN ? tRN : tEA;
5262         HChar ch = '?';
5263         /* There are 5 cases:
5264               byte     load,           SX to 64
5265               byte     load, SX to 32, ZX to 64
5266               halfword load,           SX to 64
5267               halfword load, SX to 32, ZX to 64
5268               word     load,           SX to 64
5269            The ifs below handle them in the listed order.
5270         */
5271         if (szLg2 == 0) {
5272            ch = 'b';
5273            if (is64) {
5274               putIReg64orZR(tt, unop(Iop_8Sto64,
5275                                      loadLE(Ity_I8, mkexpr(tTA))));
5276            } else {
5277               putIReg32orZR(tt, unop(Iop_8Sto32,
5278                                      loadLE(Ity_I8, mkexpr(tTA))));
5279            }
5280         }
5281         else if (szLg2 == 1) {
5282            ch = 'h';
5283            if (is64) {
5284               putIReg64orZR(tt, unop(Iop_16Sto64,
5285                                      loadLE(Ity_I16, mkexpr(tTA))));
5286            } else {
5287               putIReg32orZR(tt, unop(Iop_16Sto32,
5288                                      loadLE(Ity_I16, mkexpr(tTA))));
5289            }
5290         }
5291         else if (szLg2 == 2 && is64) {
5292            ch = 'w';
5293            putIReg64orZR(tt, unop(Iop_32Sto64,
5294                                   loadLE(Ity_I32, mkexpr(tTA))));
5295         }
5296         else {
5297            vassert(0);
5298         }
5299         putIReg64orSP(nn, mkexpr(tEA));
5300         DIP(atRN ? "ldrs%c %s, [%s], #%llu\n" : "ldrs%c %s, [%s, #%llu]!",
5301             ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), simm9);
5302         return True;
5303      }
5304      /* else fall through */
5305   }
5306
5307   /* -------------- LDRS{B,H,W} (simm9, noUpd) -------------- */
5308   /* 31 29      23 21 20   11 9 4
5309      00 111 000 1x 0  imm9 00 n t  LDURSB Rt, [Xn|SP, #simm9]
5310      01 111 000 1x 0  imm9 00 n t  LDURSH Rt, [Xn|SP, #simm9]
5311      10 111 000 10 0  imm9 00 n t  LDURSW Xt, [Xn|SP, #simm9]
5312      where
5313         Rt is Wt when x==1, Xt when x==0
5314   */
5315   if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5316       && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5317      /* Further checks on bits 31:30 and 22 */
5318      Bool valid = False;
5319      switch ((INSN(31,30) << 1) | INSN(22,22)) {
5320         case BITS3(1,0,0):                    // LDURSW Xt
5321         case BITS3(0,1,0): case BITS3(0,1,1): // LDURSH Xt, Wt
5322         case BITS3(0,0,0): case BITS3(0,0,1): // LDURSB Xt, Wt
5323            valid = True;
5324            break;
5325      }
5326      if (valid) {
5327         UInt   szLg2 = INSN(31,30);
5328         UInt   imm9  = INSN(20,12);
5329         UInt   nn    = INSN(9,5);
5330         UInt   tt    = INSN(4,0);
5331         IRTemp tRN   = newTemp(Ity_I64);
5332         IRTemp tEA   = newTemp(Ity_I64);
5333         ULong  simm9 = sx_to_64(imm9, 9);
5334         Bool   is64  = INSN(22,22) == 0;
5335         assign(tRN, getIReg64orSP(nn));
5336         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5337         HChar ch = '?';
5338         /* There are 5 cases:
5339               byte     load,           SX to 64
5340               byte     load, SX to 32, ZX to 64
5341               halfword load,           SX to 64
5342               halfword load, SX to 32, ZX to 64
5343               word     load,           SX to 64
5344            The ifs below handle them in the listed order.
5345         */
5346         if (szLg2 == 0) {
5347            ch = 'b';
5348            if (is64) {
5349               putIReg64orZR(tt, unop(Iop_8Sto64,
5350                                      loadLE(Ity_I8, mkexpr(tEA))));
5351            } else {
5352               putIReg32orZR(tt, unop(Iop_8Sto32,
5353                                      loadLE(Ity_I8, mkexpr(tEA))));
5354            }
5355         }
5356         else if (szLg2 == 1) {
5357            ch = 'h';
5358            if (is64) {
5359               putIReg64orZR(tt, unop(Iop_16Sto64,
5360                                      loadLE(Ity_I16, mkexpr(tEA))));
5361            } else {
5362               putIReg32orZR(tt, unop(Iop_16Sto32,
5363                                      loadLE(Ity_I16, mkexpr(tEA))));
5364            }
5365         }
5366         else if (szLg2 == 2 && is64) {
5367            ch = 'w';
5368            putIReg64orZR(tt, unop(Iop_32Sto64,
5369                                   loadLE(Ity_I32, mkexpr(tEA))));
5370         }
5371         else {
5372            vassert(0);
5373         }
5374         DIP("ldurs%c %s, [%s, #%lld]",
5375             ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), (Long)simm9);
5376         return True;
5377      }
5378      /* else fall through */
5379   }
5380
5381   /* -------- LDP,STP (immediate, simm7) (FP&VEC) -------- */
5382   /* L==1    => mm==LD
5383      L==0    => mm==ST
5384      sz==00  => 32 bit (S) transfers
5385      sz==01  => 64 bit (D) transfers
5386      sz==10  => 128 bit (Q) transfers
5387      sz==11  isn't allowed
5388      simm7 is scaled by the (single-register) transfer size
5389
5390      31 29  26   22 21   14 9 4
5391
5392      sz 101 1000 L  imm7 t2 n t1   mmNP SDQt1, SDQt2, [Xn|SP, #imm]
5393                                    (at-EA, with nontemporal hint)
5394
5395      sz 101 1001 L  imm7 t2 n t1   mmP SDQt1, SDQt2, [Xn|SP], #imm
5396                                    (at-Rn-then-Rn=EA)
5397
5398      sz 101 1010 L  imm7 t2 n t1   mmP SDQt1, SDQt2, [Xn|SP, #imm]
5399                                    (at-EA)
5400
5401      sz 101 1011 L  imm7 t2 n t1   mmP SDQt1, SDQt2, [Xn|SP, #imm]!
5402                                    (at-EA-then-Rn=EA)
5403   */
5404   if (INSN(29,25) == BITS5(1,0,1,1,0)) {
5405      UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units
5406      Bool isLD   = INSN(22,22) == 1;
5407      Bool wBack  = INSN(23,23) == 1;
5408      Long simm7  = (Long)sx_to_64(INSN(21,15), 7);
5409      UInt tt2    = INSN(14,10);
5410      UInt nn     = INSN(9,5);
5411      UInt tt1    = INSN(4,0);
5412      if (szSlg2 == BITS2(1,1) || (isLD && tt1 == tt2)) {
5413         /* undecodable; fall through */
5414      } else {
5415         if (nn == 31) { /* FIXME generate stack alignment check */ }
5416
5417         // Compute the transfer address TA and the writeback address WA.
5418         UInt   szB = 4 << szSlg2; /* szB is the per-register size */
5419         IRTemp tRN = newTemp(Ity_I64);
5420         assign(tRN, getIReg64orSP(nn));
5421         IRTemp tEA = newTemp(Ity_I64);
5422         simm7 = szB * simm7;
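         // e.g. "stp q0, q1, [sp, #-512]!" encodes imm7 == -32; with
         // szB == 16 this scales to the byte offset -512.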
5423         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5424
5425         IRTemp tTA = newTemp(Ity_I64);
5426         IRTemp tWA = newTemp(Ity_I64);
5427         switch (INSN(24,23)) {
5428            case BITS2(0,1):
5429               assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5430            case BITS2(1,1):
5431               assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5432            case BITS2(1,0):
5433            case BITS2(0,0):
5434               assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5435            default:
5436               vassert(0); /* NOTREACHED */
5437         }
5438
5439         IRType ty = Ity_INVALID;
5440         switch (szB) {
5441            case 4:  ty = Ity_F32;  break;
5442            case 8:  ty = Ity_F64;  break;
5443            case 16: ty = Ity_V128; break;
5444            default: vassert(0);
5445         }
5446
5447         /* Normally rN would be updated after the transfer.  However, in
            the special cases typified by
5449               stp q0, q1, [sp,#-512]!
5450               stp d0, d1, [sp,#-512]!
5451               stp s0, s1, [sp,#-512]!
5452            it is necessary to update SP before the transfer, (1)
5453            because Memcheck will otherwise complain about a write
5454            below the stack pointer, and (2) because the segfault
5455            stack extension mechanism will otherwise extend the stack
5456            only down to SP before the instruction, which might not be
            far enough, if the -512 offset takes the actual access
5458            address to the next page.
5459         */
5460         Bool earlyWBack
5461           = wBack && simm7 < 0
5462             && INSN(24,23) == BITS2(1,1) && nn == 31 && !isLD;
5463
5464         if (wBack && earlyWBack)
5465            putIReg64orSP(nn, mkexpr(tEA));
5466
5467         if (isLD) {
5468            if (szB < 16) {
5469               putQReg128(tt1, mkV128(0x0000));
5470            }
5471            putQRegLO(tt1,
5472                      loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0))));
5473            if (szB < 16) {
5474               putQReg128(tt2, mkV128(0x0000));
5475            }
5476            putQRegLO(tt2,
5477                      loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB))));
5478         } else {
5479            storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)),
5480                    getQRegLO(tt1, ty));
5481            storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)),
5482                    getQRegLO(tt2, ty));
5483         }
5484
5485         if (wBack && !earlyWBack)
5486            putIReg64orSP(nn, mkexpr(tEA));
5487
5488         const HChar* fmt_str = NULL;
5489         switch (INSN(24,23)) {
5490            case BITS2(0,1):
5491               fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5492               break;
5493            case BITS2(1,1):
5494               fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5495               break;
5496            case BITS2(1,0):
5497               fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
5498               break;
5499            case BITS2(0,0):
5500               fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
5501               break;
5502            default:
5503               vassert(0);
5504         }
5505         DIP(fmt_str, isLD ? "ld" : "st",
5506                      nameQRegLO(tt1, ty), nameQRegLO(tt2, ty),
5507                      nameIReg64orSP(nn), simm7);
5508         return True;
5509      }
5510   }
5511
5512   /* -------------- {LD,ST}R (vector register) --------------- */
5513   /* 31 29     23  20 15     12 11 9  4
5514      |  |      |   |  |      |  |  |  |
5515      00 111100 011 Rm option S  10 Rn Rt  LDR Bt, [Xn|SP, R<m>{ext/sh}]
5516      01 111100 011 Rm option S  10 Rn Rt  LDR Ht, [Xn|SP, R<m>{ext/sh}]
5517      10 111100 011 Rm option S  10 Rn Rt  LDR St, [Xn|SP, R<m>{ext/sh}]
5518      11 111100 011 Rm option S  10 Rn Rt  LDR Dt, [Xn|SP, R<m>{ext/sh}]
5519      00 111100 111 Rm option S  10 Rn Rt  LDR Qt, [Xn|SP, R<m>{ext/sh}]
5520
5521      00 111100 001 Rm option S  10 Rn Rt  STR Bt, [Xn|SP, R<m>{ext/sh}]
5522      01 111100 001 Rm option S  10 Rn Rt  STR Ht, [Xn|SP, R<m>{ext/sh}]
5523      10 111100 001 Rm option S  10 Rn Rt  STR St, [Xn|SP, R<m>{ext/sh}]
5524      11 111100 001 Rm option S  10 Rn Rt  STR Dt, [Xn|SP, R<m>{ext/sh}]
5525      00 111100 101 Rm option S  10 Rn Rt  STR Qt, [Xn|SP, R<m>{ext/sh}]
5526   */
5527   if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5528       && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5529      HChar  dis_buf[64];
5530      UInt   szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5531      Bool   isLD  = INSN(22,22) == 1;
5532      UInt   tt    = INSN(4,0);
5533      if (szLg2 > 4) goto after_LDR_STR_vector_register;
5534      IRTemp ea    = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/);
5535      if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register;
5536      switch (szLg2) {
5537         case 0: /* 8 bit */
5538            if (isLD) {
5539               putQReg128(tt, mkV128(0x0000));
5540               putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea)));
5541               DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5542            } else {
5543               storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8));
5544               DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5545            }
5546            break;
         case 1: /* 16 bit */
5548            if (isLD) {
5549               putQReg128(tt, mkV128(0x0000));
5550               putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea)));
5551               DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5552            } else {
5553               storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16));
5554               DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5555            }
5556            break;
5557         case 2: /* 32 bit */
5558            if (isLD) {
5559               putQReg128(tt, mkV128(0x0000));
5560               putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea)));
5561               DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5562            } else {
5563               storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32));
5564               DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5565            }
5566            break;
5567         case 3: /* 64 bit */
5568            if (isLD) {
5569               putQReg128(tt, mkV128(0x0000));
5570               putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea)));
5571               DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5572            } else {
5573               storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64));
5574               DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5575            }
5576            break;
         case 4: /* 128 bit */
5578            if (isLD) {
5579               putQReg128(tt, loadLE(Ity_V128, mkexpr(ea)));
5580               DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf);
5581            } else {
5582               storeLE(mkexpr(ea), getQReg128(tt));
5583               DIP("str %s, %s\n", nameQReg128(tt), dis_buf);
5584            }
5585            break;
5586         default:
5587            vassert(0);
5588      }
5589      return True;
5590   }
5591  after_LDR_STR_vector_register:
5592
5593   /* ---------- LDRS{B,H,W} (integer register, SX) ---------- */
5594   /* 31 29      22 20 15  12 11 9  4
5595      |  |       |  |  |   |  |  |  |
5596      10 1110001 01 Rm opt S 10 Rn Rt    LDRSW Xt, [Xn|SP, R<m>{ext/sh}]
5597
5598      01 1110001 01 Rm opt S 10 Rn Rt    LDRSH Xt, [Xn|SP, R<m>{ext/sh}]
5599      01 1110001 11 Rm opt S 10 Rn Rt    LDRSH Wt, [Xn|SP, R<m>{ext/sh}]
5600
5601      00 1110001 01 Rm opt S 10 Rn Rt    LDRSB Xt, [Xn|SP, R<m>{ext/sh}]
5602      00 1110001 11 Rm opt S 10 Rn Rt    LDRSB Wt, [Xn|SP, R<m>{ext/sh}]
5603   */
5604   if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5605       && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5606      HChar  dis_buf[64];
5607      UInt   szLg2  = INSN(31,30);
5608      Bool   sxTo64 = INSN(22,22) == 0; // else sx to 32 and zx to 64
5609      UInt   tt     = INSN(4,0);
5610      if (szLg2 == 3) goto after_LDRS_integer_register;
5611      IRTemp ea     = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5612      if (ea == IRTemp_INVALID) goto after_LDRS_integer_register;
5613      /* Enumerate the 5 variants explicitly. */
5614      if (szLg2 == 2/*32 bit*/ && sxTo64) {
5615         putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, mkexpr(ea))));
5616         DIP("ldrsw %s, %s\n", nameIReg64orZR(tt), dis_buf);
5617         return True;
5618      }
5619      else
5620      if (szLg2 == 1/*16 bit*/) {
5621         if (sxTo64) {
5622            putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, mkexpr(ea))));
5623            DIP("ldrsh %s, %s\n", nameIReg64orZR(tt), dis_buf);
5624         } else {
5625            putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, mkexpr(ea))));
5626            DIP("ldrsh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5627         }
5628         return True;
5629      }
5630      else
5631      if (szLg2 == 0/*8 bit*/) {
5632         if (sxTo64) {
5633            putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, mkexpr(ea))));
5634            DIP("ldrsb %s, %s\n", nameIReg64orZR(tt), dis_buf);
5635         } else {
5636            putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, mkexpr(ea))));
5637            DIP("ldrsb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5638         }
5639         return True;
5640      }
5641      /* else it's an invalid combination */
5642   }
5643  after_LDRS_integer_register:
5644
5645   /* -------- LDR/STR (immediate, SIMD&FP, unsigned offset) -------- */
5646   /* This is the Unsigned offset variant only.  The Post-Index and
5647      Pre-Index variants are below.
5648
5649      31 29      23 21    9 4
5650      00 111 101 01 imm12 n t   LDR Bt, [Xn|SP + imm12 * 1]
5651      01 111 101 01 imm12 n t   LDR Ht, [Xn|SP + imm12 * 2]
5652      10 111 101 01 imm12 n t   LDR St, [Xn|SP + imm12 * 4]
5653      11 111 101 01 imm12 n t   LDR Dt, [Xn|SP + imm12 * 8]
5654      00 111 101 11 imm12 n t   LDR Qt, [Xn|SP + imm12 * 16]
5655
5656      00 111 101 00 imm12 n t   STR Bt, [Xn|SP + imm12 * 1]
5657      01 111 101 00 imm12 n t   STR Ht, [Xn|SP + imm12 * 2]
5658      10 111 101 00 imm12 n t   STR St, [Xn|SP + imm12 * 4]
5659      11 111 101 00 imm12 n t   STR Dt, [Xn|SP + imm12 * 8]
5660      00 111 101 10 imm12 n t   STR Qt, [Xn|SP + imm12 * 16]
5661   */
5662   if (INSN(29,24) == BITS6(1,1,1,1,0,1)
5663       && ((INSN(23,23) << 2) | INSN(31,30)) <= 4) {
5664      UInt   szLg2  = (INSN(23,23) << 2) | INSN(31,30);
5665      Bool   isLD   = INSN(22,22) == 1;
5666      UInt   pimm12 = INSN(21,10) << szLg2;
5667      UInt   nn     = INSN(9,5);
5668      UInt   tt     = INSN(4,0);
5669      IRTemp tEA    = newTemp(Ity_I64);
5670      IRType ty     = preferredVectorSubTypeFromSize(1 << szLg2);
5671      assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(pimm12)));
5672      if (isLD) {
5673         if (szLg2 < 4) {
5674            putQReg128(tt, mkV128(0x0000));
5675         }
5676         putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5677      } else {
5678         storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5679      }
5680      DIP("%s %s, [%s, #%u]\n",
5681          isLD ? "ldr" : "str",
5682          nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12);
5683      return True;
5684   }
5685
5686   /* -------- LDR/STR (immediate, SIMD&FP, pre/post index) -------- */
5687   /* These are the Post-Index and Pre-Index variants.
5688
5689      31 29      23   20   11 9 4
5690      (at-Rn-then-Rn=EA)
5691      00 111 100 01 0 imm9 01 n t   LDR Bt, [Xn|SP], #simm
5692      01 111 100 01 0 imm9 01 n t   LDR Ht, [Xn|SP], #simm
5693      10 111 100 01 0 imm9 01 n t   LDR St, [Xn|SP], #simm
5694      11 111 100 01 0 imm9 01 n t   LDR Dt, [Xn|SP], #simm
5695      00 111 100 11 0 imm9 01 n t   LDR Qt, [Xn|SP], #simm
5696
5697      (at-EA-then-Rn=EA)
5698      00 111 100 01 0 imm9 11 n t   LDR Bt, [Xn|SP, #simm]!
5699      01 111 100 01 0 imm9 11 n t   LDR Ht, [Xn|SP, #simm]!
5700      10 111 100 01 0 imm9 11 n t   LDR St, [Xn|SP, #simm]!
5701      11 111 100 01 0 imm9 11 n t   LDR Dt, [Xn|SP, #simm]!
5702      00 111 100 11 0 imm9 11 n t   LDR Qt, [Xn|SP, #simm]!
5703
5704      Stores are the same except with bit 22 set to 0.
5705   */
5706   if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5707       && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5708       && INSN(21,21) == 0 && INSN(10,10) == 1) {
5709      UInt   szLg2  = (INSN(23,23) << 2) | INSN(31,30);
5710      Bool   isLD   = INSN(22,22) == 1;
5711      UInt   imm9   = INSN(20,12);
5712      Bool   atRN   = INSN(11,11) == 0;
5713      UInt   nn     = INSN(9,5);
5714      UInt   tt     = INSN(4,0);
5715      IRTemp tRN    = newTemp(Ity_I64);
5716      IRTemp tEA    = newTemp(Ity_I64);
5717      IRTemp tTA    = IRTemp_INVALID;
5718      IRType ty     = preferredVectorSubTypeFromSize(1 << szLg2);
5719      ULong  simm9  = sx_to_64(imm9, 9);
5720      assign(tRN, getIReg64orSP(nn));
5721      assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5722      tTA = atRN ? tRN : tEA;
5723      if (isLD) {
5724         if (szLg2 < 4) {
5725            putQReg128(tt, mkV128(0x0000));
5726         }
5727         putQRegLO(tt, loadLE(ty, mkexpr(tTA)));
5728      } else {
5729         storeLE(mkexpr(tTA), getQRegLO(tt, ty));
5730      }
5731      putIReg64orSP(nn, mkexpr(tEA));
5732      DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n",
5733          isLD ? "ldr" : "str",
5734          nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5735      return True;
5736   }
5737
5738   /* -------- LDUR/STUR (unscaled offset, SIMD&FP) -------- */
5739   /* 31 29      23   20   11 9 4
5740      00 111 100 01 0 imm9 00 n t   LDR Bt, [Xn|SP, #simm]
5741      01 111 100 01 0 imm9 00 n t   LDR Ht, [Xn|SP, #simm]
5742      10 111 100 01 0 imm9 00 n t   LDR St, [Xn|SP, #simm]
5743      11 111 100 01 0 imm9 00 n t   LDR Dt, [Xn|SP, #simm]
5744      00 111 100 11 0 imm9 00 n t   LDR Qt, [Xn|SP, #simm]
5745
5746      00 111 100 00 0 imm9 00 n t   STR Bt, [Xn|SP, #simm]
5747      01 111 100 00 0 imm9 00 n t   STR Ht, [Xn|SP, #simm]
5748      10 111 100 00 0 imm9 00 n t   STR St, [Xn|SP, #simm]
5749      11 111 100 00 0 imm9 00 n t   STR Dt, [Xn|SP, #simm]
5750      00 111 100 10 0 imm9 00 n t   STR Qt, [Xn|SP, #simm]
5751   */
5752   if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5753       && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5754       && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5755      UInt   szLg2  = (INSN(23,23) << 2) | INSN(31,30);
5756      Bool   isLD   = INSN(22,22) == 1;
5757      UInt   imm9   = INSN(20,12);
5758      UInt   nn     = INSN(9,5);
5759      UInt   tt     = INSN(4,0);
5760      ULong  simm9  = sx_to_64(imm9, 9);
5761      IRTemp tEA    = newTemp(Ity_I64);
5762      IRType ty     = preferredVectorSubTypeFromSize(1 << szLg2);
5763      assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9)));
5764      if (isLD) {
5765         if (szLg2 < 4) {
5766            putQReg128(tt, mkV128(0x0000));
5767         }
5768         putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5769      } else {
5770         storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5771      }
5772      DIP("%s %s, [%s, #%lld]\n",
5773          isLD ? "ldur" : "stur",
5774          nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5775      return True;
5776   }
5777
5778   /* ---------------- LDR (literal, SIMD&FP) ---------------- */
5779   /* 31 29      23    4
5780      00 011 100 imm19 t    LDR St, [PC + sxTo64(imm19 << 2)]
5781      01 011 100 imm19 t    LDR Dt, [PC + sxTo64(imm19 << 2)]
5782      10 011 100 imm19 t    LDR Qt, [PC + sxTo64(imm19 << 2)]
5783   */
5784   if (INSN(29,24) == BITS6(0,1,1,1,0,0) && INSN(31,30) < BITS2(1,1)) {
5785      UInt   szB   = 4 << INSN(31,30);
5786      UInt   imm19 = INSN(23,5);
5787      UInt   tt    = INSN(4,0);
5788      ULong  ea    = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
5789      IRType ty    = preferredVectorSubTypeFromSize(szB);
5790      putQReg128(tt, mkV128(0x0000));
5791      putQRegLO(tt, loadLE(ty, mkU64(ea)));
5792      DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea);
5793      return True;
5794   }
5795
5796   /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg  ------ */
5797   /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs ------ */
5798   /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs ------ */
5799   /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs ------ */
5800   /* 31 29  26   22 21 20    15   11 9 4
5801
5802      0q 001 1000 L  0  00000 0000 sz n t  xx4 {Vt..t+3.T}, [Xn|SP]
5803      0q 001 1001 L  0  m     0000 sz n t  xx4 {Vt..t+3.T}, [Xn|SP], step
5804
5805      0q 001 1000 L  0  00000 0100 sz n t  xx3 {Vt..t+2.T}, [Xn|SP]
5806      0q 001 1001 L  0  m     0100 sz n t  xx3 {Vt..t+2.T}, [Xn|SP], step
5807
5808      0q 001 1000 L  0  00000 1000 sz n t  xx2 {Vt..t+1.T}, [Xn|SP]
5809      0q 001 1001 L  0  m     1000 sz n t  xx2 {Vt..t+1.T}, [Xn|SP], step
5810
5811      0q 001 1000 L  0  00000 0111 sz n t  xx1 {Vt.T},      [Xn|SP]
5812      0q 001 1001 L  0  m     0111 sz n t  xx1 {Vt.T},      [Xn|SP], step
5813
5814      T    = defined by Q and sz in the normal way
5815      step = if m == 11111 then transfer-size else Xm
5816      xx   = case L of 1 -> LD ; 0 -> ST
5817   */
5818   if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
5819       && INSN(21,21) == 0) {
      UInt bitQ  = INSN(30,30);
5821      Bool isPX  = INSN(23,23) == 1;
5822      Bool isLD  = INSN(22,22) == 1;
5823      UInt mm    = INSN(20,16);
5824      UInt opc   = INSN(15,12);
5825      UInt sz    = INSN(11,10);
5826      UInt nn    = INSN(9,5);
5827      UInt tt    = INSN(4,0);
5828      Bool isQ   = bitQ == 1;
5829      Bool is1d  = sz == BITS2(1,1) && !isQ;
5830      UInt nRegs = 0;
5831      switch (opc) {
5832         case BITS4(0,0,0,0): nRegs = 4; break;
5833         case BITS4(0,1,0,0): nRegs = 3; break;
5834         case BITS4(1,0,0,0): nRegs = 2; break;
5835         case BITS4(0,1,1,1): nRegs = 1; break;
5836         default: break;
5837      }
5838
5839      /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
5840         If we see it, set nRegs to 0 so as to cause the next conditional
5841         to fail. */
5842      if (!isPX && mm != 0)
5843         nRegs = 0;
5844
5845      if (nRegs == 1                             /* .1d is allowed */
5846          || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {
5847
5848         UInt xferSzB = (isQ ? 16 : 8) * nRegs;
5849
5850         /* Generate the transfer address (TA) and if necessary the
5851            writeback address (WB) */
5852         IRTemp tTA = newTemp(Ity_I64);
5853         assign(tTA, getIReg64orSP(nn));
5854         if (nn == 31) { /* FIXME generate stack alignment check */ }
5855         IRTemp tWB = IRTemp_INVALID;
5856         if (isPX) {
5857            tWB = newTemp(Ity_I64);
5858            assign(tWB, binop(Iop_Add64,
5859                              mkexpr(tTA),
5860                              mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
5861                                                     : getIReg64orZR(mm)));
5862         }
5863
5864         /* -- BEGIN generate the transfers -- */
5865
5866         IRTemp u0, u1, u2, u3, i0, i1, i2, i3;
5867         u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID;
5868         switch (nRegs) {
5869            case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */
5870            case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */
5871            case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */
5872            case 1: u0 = newTempV128(); i0 = newTempV128(); break;
5873            default: vassert(0);
5874         }
5875
5876         /* -- Multiple 128 or 64 bit stores -- */
5877         if (!isLD) {
5878            switch (nRegs) {
5879               case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
5880               case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
5881               case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */
5882               case 1: assign(u0, getQReg128((tt+0) % 32)); break;
5883               default: vassert(0);
5884            }
5885            switch (nRegs) {
5886               case 4:  (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64)
5887                           (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3);
5888                        break;
5889               case 3:  (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64)
5890                           (&i0, &i1, &i2, sz, u0, u1, u2);
5891                        break;
5892               case 2:  (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64)
5893                           (&i0, &i1, sz, u0, u1);
5894                        break;
5895               case 1:  (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64)
5896                           (&i0, sz, u0);
5897                        break;
5898               default: vassert(0);
5899            }
5900#           define MAYBE_NARROW_TO_64(_expr) \
5901                      (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
5902            UInt step = isQ ? 16 : 8;
5903            switch (nRegs) {
5904               case 4:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
5905                                 MAYBE_NARROW_TO_64(mkexpr(i3)) );
5906                        /* fallthru */
5907               case 3:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
5908                                 MAYBE_NARROW_TO_64(mkexpr(i2)) );
5909                        /* fallthru */
5910               case 2:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
5911                                 MAYBE_NARROW_TO_64(mkexpr(i1)) );
5912                        /* fallthru */
5913               case 1:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
5914                                 MAYBE_NARROW_TO_64(mkexpr(i0)) );
5915                        break;
5916               default: vassert(0);
5917            }
5918#           undef MAYBE_NARROW_TO_64
5919         }
5920
5921         /* -- Multiple 128 or 64 bit loads -- */
5922         else /* isLD */ {
5923            UInt   step   = isQ ? 16 : 8;
5924            IRType loadTy = isQ ? Ity_V128 : Ity_I64;
5925#           define MAYBE_WIDEN_FROM_64(_expr) \
5926                      (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
5927            switch (nRegs) {
5928               case 4:
5929                  assign(i3, MAYBE_WIDEN_FROM_64(
5930                                loadLE(loadTy,
5931                                       binop(Iop_Add64, mkexpr(tTA),
5932                                                        mkU64(3 * step)))));
5933                  /* fallthru */
5934               case 3:
5935                  assign(i2, MAYBE_WIDEN_FROM_64(
5936                                loadLE(loadTy,
5937                                       binop(Iop_Add64, mkexpr(tTA),
5938                                                        mkU64(2 * step)))));
5939                  /* fallthru */
5940               case 2:
5941                  assign(i1, MAYBE_WIDEN_FROM_64(
5942                                loadLE(loadTy,
5943                                       binop(Iop_Add64, mkexpr(tTA),
5944                                                        mkU64(1 * step)))));
5945                  /* fallthru */
5946               case 1:
5947                  assign(i0, MAYBE_WIDEN_FROM_64(
5948                                loadLE(loadTy,
5949                                       binop(Iop_Add64, mkexpr(tTA),
5950                                                        mkU64(0 * step)))));
5951                  break;
5952               default:
5953                  vassert(0);
5954            }
5955#           undef MAYBE_WIDEN_FROM_64
5956            switch (nRegs) {
5957               case 4:  (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64)
5958                           (&u0, &u1, &u2, &u3, sz, i0,i1,i2,i3);
5959                        break;
5960               case 3:  (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64)
5961                           (&u0, &u1, &u2, sz, i0, i1, i2);
5962                        break;
5963               case 2:  (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64)
5964                           (&u0, &u1, sz, i0, i1);
5965                        break;
5966               case 1:  (isQ ? math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64)
5967                           (&u0, sz, i0);
5968                        break;
5969               default: vassert(0);
5970            }
5971            switch (nRegs) {
5972               case 4:  putQReg128( (tt+3) % 32,
5973                                    math_MAYBE_ZERO_HI64(bitQ, u3));
5974                        /* fallthru */
5975               case 3:  putQReg128( (tt+2) % 32,
5976                                    math_MAYBE_ZERO_HI64(bitQ, u2));
5977                        /* fallthru */
5978               case 2:  putQReg128( (tt+1) % 32,
5979                                    math_MAYBE_ZERO_HI64(bitQ, u1));
5980                        /* fallthru */
5981               case 1:  putQReg128( (tt+0) % 32,
5982                                    math_MAYBE_ZERO_HI64(bitQ, u0));
5983                        break;
5984               default: vassert(0);
5985            }
5986         }
5987
5988         /* -- END generate the transfers -- */
5989
5990         /* Do the writeback, if necessary */
5991         if (isPX) {
5992            putIReg64orSP(nn, mkexpr(tWB));
5993         }
5994
5995         HChar pxStr[20];
5996         pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
5997         if (isPX) {
5998            if (mm == BITS5(1,1,1,1,1))
5999               vex_sprintf(pxStr, ", #%u", xferSzB);
6000            else
6001               vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6002         }
6003         const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6004         DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
6005             isLD ? "ld" : "st", nRegs,
6006             (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6007             pxStr);
6008
6009         return True;
6010      }
6011      /* else fall through */
6012   }
6013
6014   /* ------ LD1/ST1 (multiple 1-elem structs to/from 2 regs  ------ */
6015   /* ------ LD1/ST1 (multiple 1-elem structs to/from 3 regs  ------ */
6016   /* ------ LD1/ST1 (multiple 1-elem structs to/from 4 regs  ------ */
6017   /* 31 29  26   22 21 20    15   11 9 4
6018
6019      0q 001 1000 L  0  00000 0010 sz n t  xx1 {Vt..t+3.T}, [Xn|SP]
6020      0q 001 1001 L  0  m     0010 sz n t  xx1 {Vt..t+3.T}, [Xn|SP], step
6021
6022      0q 001 1000 L  0  00000 0110 sz n t  xx1 {Vt..t+2.T}, [Xn|SP]
6023      0q 001 1001 L  0  m     0110 sz n t  xx1 {Vt..t+2.T}, [Xn|SP], step
6024
6025      0q 001 1000 L  0  00000 1010 sz n t  xx1 {Vt..t+1.T}, [Xn|SP]
6026      0q 001 1001 L  0  m     1010 sz n t  xx1 {Vt..t+1.T}, [Xn|SP], step
6027
6028      T    = defined by Q and sz in the normal way
6029      step = if m == 11111 then transfer-size else Xm
6030      xx   = case L of 1 -> LD ; 0 -> ST
6031   */
6032   if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
6033       && INSN(21,21) == 0) {
6034      Bool bitQ  = INSN(30,30);
6035      Bool isPX  = INSN(23,23) == 1;
6036      Bool isLD  = INSN(22,22) == 1;
6037      UInt mm    = INSN(20,16);
6038      UInt opc   = INSN(15,12);
6039      UInt sz    = INSN(11,10);
6040      UInt nn    = INSN(9,5);
6041      UInt tt    = INSN(4,0);
6042      Bool isQ   = bitQ == 1;
6043      UInt nRegs = 0;
6044      switch (opc) {
6045         case BITS4(0,0,1,0): nRegs = 4; break;
6046         case BITS4(0,1,1,0): nRegs = 3; break;
6047         case BITS4(1,0,1,0): nRegs = 2; break;
6048         default: break;
6049      }
6050
6051      /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
6052         If we see it, set nRegs to 0 so as to cause the next conditional
6053         to fail. */
6054      if (!isPX && mm != 0)
6055         nRegs = 0;
6056
6057      if (nRegs >= 2 && nRegs <= 4) {
6058
6059         UInt xferSzB = (isQ ? 16 : 8) * nRegs;
6060
6061         /* Generate the transfer address (TA) and if necessary the
6062            writeback address (WB) */
6063         IRTemp tTA = newTemp(Ity_I64);
6064         assign(tTA, getIReg64orSP(nn));
6065         if (nn == 31) { /* FIXME generate stack alignment check */ }
6066         IRTemp tWB = IRTemp_INVALID;
6067         if (isPX) {
6068            tWB = newTemp(Ity_I64);
6069            assign(tWB, binop(Iop_Add64,
6070                              mkexpr(tTA),
6071                              mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6072                                                     : getIReg64orZR(mm)));
6073         }
6074
6075         /* -- BEGIN generate the transfers -- */
6076
6077         IRTemp u0, u1, u2, u3;
6078         u0 = u1 = u2 = u3 = IRTemp_INVALID;
6079         switch (nRegs) {
6080            case 4: u3 = newTempV128(); /* fallthru */
6081            case 3: u2 = newTempV128(); /* fallthru */
6082            case 2: u1 = newTempV128();
6083                    u0 = newTempV128(); break;
6084            default: vassert(0);
6085         }
6086
6087         /* -- Multiple 128 or 64 bit stores -- */
6088         if (!isLD) {
6089            switch (nRegs) {
6090               case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
6091               case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
6092               case 2: assign(u1, getQReg128((tt+1) % 32));
6093                       assign(u0, getQReg128((tt+0) % 32)); break;
6094               default: vassert(0);
6095            }
6096#           define MAYBE_NARROW_TO_64(_expr) \
6097                      (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
6098            UInt step = isQ ? 16 : 8;
6099            switch (nRegs) {
6100               case 4:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
6101                                 MAYBE_NARROW_TO_64(mkexpr(u3)) );
6102                        /* fallthru */
6103               case 3:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
6104                                 MAYBE_NARROW_TO_64(mkexpr(u2)) );
6105                        /* fallthru */
6106               case 2:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
6107                                 MAYBE_NARROW_TO_64(mkexpr(u1)) );
6108                        storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
6109                                 MAYBE_NARROW_TO_64(mkexpr(u0)) );
6110                        break;
6111               default: vassert(0);
6112            }
6113#           undef MAYBE_NARROW_TO_64
6114         }
6115
6116         /* -- Multiple 128 or 64 bit loads -- */
6117         else /* isLD */ {
6118            UInt   step   = isQ ? 16 : 8;
6119            IRType loadTy = isQ ? Ity_V128 : Ity_I64;
6120#           define MAYBE_WIDEN_FROM_64(_expr) \
6121                      (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
6122            switch (nRegs) {
6123               case 4:
6124                  assign(u3, MAYBE_WIDEN_FROM_64(
6125                                loadLE(loadTy,
6126                                       binop(Iop_Add64, mkexpr(tTA),
6127                                                        mkU64(3 * step)))));
6128                  /* fallthru */
6129               case 3:
6130                  assign(u2, MAYBE_WIDEN_FROM_64(
6131                                loadLE(loadTy,
6132                                       binop(Iop_Add64, mkexpr(tTA),
6133                                                        mkU64(2 * step)))));
6134                  /* fallthru */
6135               case 2:
6136                  assign(u1, MAYBE_WIDEN_FROM_64(
6137                                loadLE(loadTy,
6138                                       binop(Iop_Add64, mkexpr(tTA),
6139                                                        mkU64(1 * step)))));
6140                  assign(u0, MAYBE_WIDEN_FROM_64(
6141                                loadLE(loadTy,
6142                                       binop(Iop_Add64, mkexpr(tTA),
6143                                                        mkU64(0 * step)))));
6144                  break;
6145               default:
6146                  vassert(0);
6147            }
6148#           undef MAYBE_WIDEN_FROM_64
6149            switch (nRegs) {
6150               case 4:  putQReg128( (tt+3) % 32,
6151                                    math_MAYBE_ZERO_HI64(bitQ, u3));
6152                        /* fallthru */
6153               case 3:  putQReg128( (tt+2) % 32,
6154                                    math_MAYBE_ZERO_HI64(bitQ, u2));
6155                        /* fallthru */
6156               case 2:  putQReg128( (tt+1) % 32,
6157                                    math_MAYBE_ZERO_HI64(bitQ, u1));
6158                        putQReg128( (tt+0) % 32,
6159                                    math_MAYBE_ZERO_HI64(bitQ, u0));
6160                        break;
6161               default: vassert(0);
6162            }
6163         }
6164
6165         /* -- END generate the transfers -- */
6166
6167         /* Do the writeback, if necessary */
6168         if (isPX) {
6169            putIReg64orSP(nn, mkexpr(tWB));
6170         }
6171
6172         HChar pxStr[20];
6173         pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6174         if (isPX) {
6175            if (mm == BITS5(1,1,1,1,1))
6176               vex_sprintf(pxStr, ", #%u", xferSzB);
6177            else
6178               vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6179         }
6180         const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6181         DIP("%s1 {v%u.%s .. v%u.%s}, [%s]%s\n",
6182             isLD ? "ld" : "st",
6183             (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6184             pxStr);
6185
6186         return True;
6187      }
6188      /* else fall through */
6189   }
6190
6191   /* ---------- LD1R (single structure, replicate) ---------- */
6192   /* ---------- LD2R (single structure, replicate) ---------- */
6193   /* ---------- LD3R (single structure, replicate) ---------- */
6194   /* ---------- LD4R (single structure, replicate) ---------- */
6195   /* 31 29       22 20    15    11 9 4
6196      0q 001 1010 10 00000 110 0 sz n t  LD1R {Vt.T}, [Xn|SP]
6197      0q 001 1011 10 m     110 0 sz n t  LD1R {Vt.T}, [Xn|SP], step
6198
6199      0q 001 1010 11 00000 110 0 sz n t  LD2R {Vt..t+1.T}, [Xn|SP]
6200      0q 001 1011 11 m     110 0 sz n t  LD2R {Vt..t+1.T}, [Xn|SP], step
6201
6202      0q 001 1010 10 00000 111 0 sz n t  LD3R {Vt..t+2.T}, [Xn|SP]
6203      0q 001 1011 10 m     111 0 sz n t  LD3R {Vt..t+2.T}, [Xn|SP], step
6204
6205      0q 001 1010 11 00000 111 0 sz n t  LD4R {Vt..t+3.T}, [Xn|SP]
6206      0q 001 1011 11 m     111 0 sz n t  LD4R {Vt..t+3.T}, [Xn|SP], step
6207
6208      step = if m == 11111 then transfer-size else Xm
6209   */
6210   if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
6211       && INSN(22,22) == 1 && INSN(15,14) == BITS2(1,1)
6212       && INSN(12,12) == 0) {
6213      UInt   bitQ  = INSN(30,30);
6214      Bool   isPX  = INSN(23,23) == 1;
6215      UInt   nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6216      UInt   mm    = INSN(20,16);
6217      UInt   sz    = INSN(11,10);
6218      UInt   nn    = INSN(9,5);
6219      UInt   tt    = INSN(4,0);
6220
6221      /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6222      if (isPX || mm == 0) {
6223
6224         IRType ty    = integerIRTypeOfSize(1 << sz);
6225
6226         UInt laneSzB = 1 << sz;
6227         UInt xferSzB = laneSzB * nRegs;
6228
6229         /* Generate the transfer address (TA) and if necessary the
6230            writeback address (WB) */
6231         IRTemp tTA = newTemp(Ity_I64);
6232         assign(tTA, getIReg64orSP(nn));
6233         if (nn == 31) { /* FIXME generate stack alignment check */ }
6234         IRTemp tWB = IRTemp_INVALID;
6235         if (isPX) {
6236            tWB = newTemp(Ity_I64);
6237            assign(tWB, binop(Iop_Add64,
6238                              mkexpr(tTA),
6239                              mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6240                                                     : getIReg64orZR(mm)));
6241         }
6242
6243         /* Do the writeback, if necessary */
6244         if (isPX) {
6245            putIReg64orSP(nn, mkexpr(tWB));
6246         }
6247
6248         IRTemp e0, e1, e2, e3, v0, v1, v2, v3;
6249         e0 = e1 = e2 = e3 = v0 = v1 = v2 = v3 = IRTemp_INVALID;
6250         switch (nRegs) {
6251            case 4:
6252               e3 = newTemp(ty);
6253               assign(e3, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6254                                                      mkU64(3 * laneSzB))));
6255               v3 = math_DUP_TO_V128(e3, ty);
6256               putQReg128((tt+3) % 32, math_MAYBE_ZERO_HI64(bitQ, v3));
6257               /* fallthrough */
6258            case 3:
6259               e2 = newTemp(ty);
6260               assign(e2, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6261                                                      mkU64(2 * laneSzB))));
6262               v2 = math_DUP_TO_V128(e2, ty);
6263               putQReg128((tt+2) % 32, math_MAYBE_ZERO_HI64(bitQ, v2));
6264               /* fallthrough */
6265            case 2:
6266               e1 = newTemp(ty);
6267               assign(e1, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6268                                                      mkU64(1 * laneSzB))));
6269               v1 = math_DUP_TO_V128(e1, ty);
6270               putQReg128((tt+1) % 32, math_MAYBE_ZERO_HI64(bitQ, v1));
6271               /* fallthrough */
6272            case 1:
6273               e0 = newTemp(ty);
6274               assign(e0, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6275                                                      mkU64(0 * laneSzB))));
6276               v0 = math_DUP_TO_V128(e0, ty);
6277               putQReg128((tt+0) % 32, math_MAYBE_ZERO_HI64(bitQ, v0));
6278               break;
6279            default:
6280               vassert(0);
6281         }
6282
6283         HChar pxStr[20];
6284         pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6285         if (isPX) {
6286            if (mm == BITS5(1,1,1,1,1))
6287               vex_sprintf(pxStr, ", #%u", xferSzB);
6288            else
6289               vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6290         }
6291         const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6292         DIP("ld%ur {v%u.%s .. v%u.%s}, [%s]%s\n",
6293             nRegs,
6294             (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6295             pxStr);
6296
6297         return True;
6298      }
6299      /* else fall through */
6300   }
6301
6302   /* ------ LD1/ST1 (single structure, to/from one lane) ------ */
6303   /* ------ LD2/ST2 (single structure, to/from one lane) ------ */
6304   /* ------ LD3/ST3 (single structure, to/from one lane) ------ */
6305   /* ------ LD4/ST4 (single structure, to/from one lane) ------ */
6306   /* 31 29       22 21 20    15    11 9 4
6307      0q 001 1010 L  0  00000 xx0 S sz n t  op1 {Vt.T}[ix], [Xn|SP]
6308      0q 001 1011 L  0  m     xx0 S sz n t  op1 {Vt.T}[ix], [Xn|SP], step
6309
6310      0q 001 1010 L  1  00000 xx0 S sz n t  op2 {Vt..t+1.T}[ix], [Xn|SP]
6311      0q 001 1011 L  1  m     xx0 S sz n t  op2 {Vt..t+1.T}[ix], [Xn|SP], step
6312
6313      0q 001 1010 L  0  00000 xx1 S sz n t  op3 {Vt..t+2.T}[ix], [Xn|SP]
6314      0q 001 1011 L  0  m     xx1 S sz n t  op3 {Vt..t+2.T}[ix], [Xn|SP], step
6315
6316      0q 001 1010 L  1  00000 xx1 S sz n t  op4 {Vt..t+3.T}[ix], [Xn|SP]
6317      0q 001 1011 L  1  m     xx1 S sz n t  op4 {Vt..t+3.T}[ix], [Xn|SP], step
6318
6319      step = if m == 11111 then transfer-size else Xm
6320      op   = case L of 1 -> LD ; 0 -> ST
6321
6322      laneszB,ix = case xx:q:S:sz of 00:b:b:bb -> 1, bbbb
6323                                     01:b:b:b0 -> 2, bbb
6324                                     10:b:b:00 -> 4, bb
6325                                     10:b:0:01 -> 8, b
6326   */
6327   if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)) {
6328      UInt   bitQ  = INSN(30,30);
6329      Bool   isPX  = INSN(23,23) == 1;
6330      Bool   isLD  = INSN(22,22) == 1;
6331      UInt   nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6332      UInt   mm    = INSN(20,16);
6333      UInt   xx    = INSN(15,14);
6334      UInt   bitS  = INSN(12,12);
6335      UInt   sz    = INSN(11,10);
6336      UInt   nn    = INSN(9,5);
6337      UInt   tt    = INSN(4,0);
6338
6339      Bool valid = True;
6340
6341      /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6342      if (!isPX && mm != 0)
6343         valid = False;
6344
6345      UInt laneSzB = 0;  /* invalid */
6346      UInt ix      = 16; /* invalid */
6347
6348      UInt xx_q_S_sz = (xx << 4) | (bitQ << 3) | (bitS << 2) | sz;
6349      switch (xx_q_S_sz) {
6350         case 0x00: case 0x01: case 0x02: case 0x03:
6351         case 0x04: case 0x05: case 0x06: case 0x07:
6352         case 0x08: case 0x09: case 0x0A: case 0x0B:
6353         case 0x0C: case 0x0D: case 0x0E: case 0x0F:
6354            laneSzB = 1; ix = xx_q_S_sz & 0xF;
6355            break;
6356         case 0x10: case 0x12: case 0x14: case 0x16:
6357         case 0x18: case 0x1A: case 0x1C: case 0x1E:
6358            laneSzB = 2; ix = (xx_q_S_sz >> 1) & 7;
6359            break;
6360         case 0x20: case 0x24: case 0x28: case 0x2C:
6361            laneSzB = 4; ix = (xx_q_S_sz >> 2) & 3;
6362            break;
6363         case 0x21: case 0x29:
6364            laneSzB = 8; ix = (xx_q_S_sz >> 3) & 1;
6365            break;
6366         default:
6367            break;
6368      }
6369
6370      if (valid && laneSzB != 0) {
6371
6372         IRType ty      = integerIRTypeOfSize(laneSzB);
6373         UInt   xferSzB = laneSzB * nRegs;
6374
6375         /* Generate the transfer address (TA) and if necessary the
6376            writeback address (WB) */
6377         IRTemp tTA = newTemp(Ity_I64);
6378         assign(tTA, getIReg64orSP(nn));
6379         if (nn == 31) { /* FIXME generate stack alignment check */ }
6380         IRTemp tWB = IRTemp_INVALID;
6381         if (isPX) {
6382            tWB = newTemp(Ity_I64);
6383            assign(tWB, binop(Iop_Add64,
6384                              mkexpr(tTA),
6385                              mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6386                                                     : getIReg64orZR(mm)));
6387         }
6388
6389         /* Do the writeback, if necessary */
6390         if (isPX) {
6391            putIReg64orSP(nn, mkexpr(tWB));
6392         }
6393
6394         switch (nRegs) {
6395            case 4: {
6396               IRExpr* addr
6397                  = binop(Iop_Add64, mkexpr(tTA), mkU64(3 * laneSzB));
6398               if (isLD) {
6399                  putQRegLane((tt+3) % 32, ix, loadLE(ty, addr));
6400               } else {
6401                  storeLE(addr, getQRegLane((tt+3) % 32, ix, ty));
6402               }
6403               /* fallthrough */
6404            }
6405            case 3: {
6406               IRExpr* addr
6407                  = binop(Iop_Add64, mkexpr(tTA), mkU64(2 * laneSzB));
6408               if (isLD) {
6409                  putQRegLane((tt+2) % 32, ix, loadLE(ty, addr));
6410               } else {
6411                  storeLE(addr, getQRegLane((tt+2) % 32, ix, ty));
6412               }
6413               /* fallthrough */
6414            }
6415            case 2: {
6416               IRExpr* addr
6417                  = binop(Iop_Add64, mkexpr(tTA), mkU64(1 * laneSzB));
6418               if (isLD) {
6419                  putQRegLane((tt+1) % 32, ix, loadLE(ty, addr));
6420               } else {
6421                  storeLE(addr, getQRegLane((tt+1) % 32, ix, ty));
6422               }
6423               /* fallthrough */
6424            }
6425            case 1: {
6426               IRExpr* addr
6427                  = binop(Iop_Add64, mkexpr(tTA), mkU64(0 * laneSzB));
6428               if (isLD) {
6429                  putQRegLane((tt+0) % 32, ix, loadLE(ty, addr));
6430               } else {
6431                  storeLE(addr, getQRegLane((tt+0) % 32, ix, ty));
6432               }
6433               break;
6434            }
6435            default:
6436               vassert(0);
6437         }
6438
6439         HChar pxStr[20];
6440         pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6441         if (isPX) {
6442            if (mm == BITS5(1,1,1,1,1))
6443               vex_sprintf(pxStr, ", #%u", xferSzB);
6444            else
6445               vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6446         }
6447         const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6448         DIP("%s%u {v%u.%s .. v%u.%s}[%u], [%s]%s\n",
6449             isLD ? "ld" : "st", nRegs,
6450             (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr,
6451             ix, nameIReg64orSP(nn), pxStr);
6452
6453         return True;
6454      }
6455      /* else fall through */
6456   }
6457
6458   /* ------------------ LD{,A}X{R,RH,RB} ------------------ */
6459   /* ------------------ ST{,L}X{R,RH,RB} ------------------ */
6460   /* 31 29     23  20      14    9 4
6461      sz 001000 010 11111 0 11111 n t   LDX{R,RH,RB}  Rt, [Xn|SP]
6462      sz 001000 010 11111 1 11111 n t   LDAX{R,RH,RB} Rt, [Xn|SP]
6463      sz 001000 000 s     0 11111 n t   STX{R,RH,RB}  Ws, Rt, [Xn|SP]
6464      sz 001000 000 s     1 11111 n t   STLX{R,RH,RB} Ws, Rt, [Xn|SP]
6465   */
6466   /* For the "standard" implementation we pass through the LL and SC to
6467      the host.  For the "fallback" implementation, for details see
6468        https://bugs.kde.org/show_bug.cgi?id=344524 and
6469        https://bugs.kde.org/show_bug.cgi?id=369459,
6470      but in short:
6471
6472      LoadLinked(addr)
6473        gs.LLsize = load_size // 1, 2, 4 or 8
6474        gs.LLaddr = addr
6475        gs.LLdata = zeroExtend(*addr)
6476
6477      StoreCond(addr, data)
6478        tmp_LLsize = gs.LLsize
6479        gs.LLsize = 0 // "no transaction"
6480        if tmp_LLsize != store_size        -> fail
6481        if addr != gs.LLaddr               -> fail
6482        if zeroExtend(*addr) != gs.LLdata  -> fail
6483        cas_ok = CAS(store_size, addr, gs.LLdata -> data)
6484        if !cas_ok                         -> fail
6485        succeed
6486
6487      When thread scheduled
6488        gs.LLsize = 0 // "no transaction"
6489        (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
6490         has to do this bit)
6491   */
6492   if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
6493       && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
6494       && INSN(14,10) == BITS5(1,1,1,1,1)) {
6495      UInt szBlg2     = INSN(31,30);
6496      Bool isLD       = INSN(22,22) == 1;
6497      Bool isAcqOrRel = INSN(15,15) == 1;
6498      UInt ss         = INSN(20,16);
6499      UInt nn         = INSN(9,5);
6500      UInt tt         = INSN(4,0);
6501
6502      vassert(szBlg2 < 4);
6503      UInt   szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
6504      IRType ty  = integerIRTypeOfSize(szB);
6505      const HChar* suffix[4] = { "rb", "rh", "r", "r" };
6506
6507      IRTemp ea = newTemp(Ity_I64);
6508      assign(ea, getIReg64orSP(nn));
6509      /* FIXME generate check that ea is szB-aligned */
6510
6511      if (isLD && ss == BITS5(1,1,1,1,1)) {
6512         IRTemp res = newTemp(ty);
6513         if (abiinfo->guest__use_fallback_LLSC) {
6514            // Do the load first so we don't update any guest state
6515            // if it faults.
6516            IRTemp loaded_data64 = newTemp(Ity_I64);
6517            assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
6518            stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) ));
6519            stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6520            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
6521            putIReg64orZR(tt, mkexpr(loaded_data64));
6522         } else {
6523            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
6524            putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
6525         }
6526         if (isAcqOrRel) {
6527            stmt(IRStmt_MBE(Imbe_Fence));
6528         }
6529         DIP("ld%sx%s %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
6530             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6531             abiinfo->guest__use_fallback_LLSC
6532                ? "(fallback implementation)" : "");
6533         return True;
6534      }
6535      if (!isLD) {
6536         if (isAcqOrRel) {
6537            stmt(IRStmt_MBE(Imbe_Fence));
6538         }
6539         IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
6540         if (abiinfo->guest__use_fallback_LLSC) {
6541            // This is really ugly, since we don't have any way to do
6542            // proper if-then-else.  First, set up as if the SC failed,
6543            // and jump forwards if it really has failed.
6544
6545            // Continuation address
6546            IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
6547
6548            // "the SC failed".  Any non-zero value means failure.
6549            putIReg64orZR(ss, mkU64(1));
6550
6551            IRTemp tmp_LLsize = newTemp(Ity_I64);
6552            assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
6553            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
6554            ));
6555            // Fail if no or wrong-size transaction
6556            vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
6557            stmt( IRStmt_Exit(
6558                     binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(szB)),
6559                     Ijk_Boring, nia, OFFB_PC
6560            ));
6561            // Fail if the address doesn't match the LL address
6562            stmt( IRStmt_Exit(
6563                      binop(Iop_CmpNE64, mkexpr(ea),
6564                                         IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
6565                      Ijk_Boring, nia, OFFB_PC
6566            ));
6567            // Fail if the data doesn't match the LL data
6568            IRTemp llsc_data64 = newTemp(Ity_I64);
6569            assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64));
6570            stmt( IRStmt_Exit(
6571                      binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
6572                                         mkexpr(llsc_data64)),
6573                      Ijk_Boring, nia, OFFB_PC
6574            ));
6575            // Try to CAS the new value in.
6576            IRTemp old = newTemp(ty);
6577            IRTemp expd = newTemp(ty);
6578            assign(expd, narrowFrom64(ty, mkexpr(llsc_data64)));
6579            stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
6580                                     Iend_LE, mkexpr(ea),
6581                                     /*expdHi*/NULL, mkexpr(expd),
6582                                     /*dataHi*/NULL, data
6583            )));
6584            // Fail if the CAS failed (viz, old != expd)
6585            stmt( IRStmt_Exit(
6586                      binop(Iop_CmpNE64,
6587                            widenUto64(ty, mkexpr(old)),
6588                            widenUto64(ty, mkexpr(expd))),
6589                      Ijk_Boring, nia, OFFB_PC
6590            ));
6591            // Otherwise we succeeded (!)
6592            putIReg64orZR(ss, mkU64(0));
6593         } else {
6594            IRTemp res = newTemp(Ity_I1);
6595            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
6596            /* IR semantics: res is 1 if store succeeds, 0 if it fails.
6597               Need to set rS to 1 on failure, 0 on success. */
6598            putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
6599                                               mkU64(1)));
6600         }
6601         DIP("st%sx%s %s, %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
6602             nameIRegOrZR(False, ss),
6603             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6604             abiinfo->guest__use_fallback_LLSC
6605                ? "(fallback implementation)" : "");
6606         return True;
6607      }
6608      /* else fall through */
6609   }
6610
6611   /* ------------------ LDA{R,RH,RB} ------------------ */
6612   /* ------------------ STL{R,RH,RB} ------------------ */
6613   /* 31 29     23  20      14    9 4
6614      sz 001000 110 11111 1 11111 n t   LDAR<sz> Rt, [Xn|SP]
6615      sz 001000 100 11111 1 11111 n t   STLR<sz> Rt, [Xn|SP]
6616   */
6617   if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
6618       && INSN(21,10) == BITS12(0,1,1,1,1,1,1,1,1,1,1,1)) {
6619      UInt szBlg2 = INSN(31,30);
6620      Bool isLD   = INSN(22,22) == 1;
6621      UInt nn     = INSN(9,5);
6622      UInt tt     = INSN(4,0);
6623
6624      vassert(szBlg2 < 4);
6625      UInt   szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
6626      IRType ty  = integerIRTypeOfSize(szB);
6627      const HChar* suffix[4] = { "rb", "rh", "r", "r" };
6628
6629      IRTemp ea = newTemp(Ity_I64);
6630      assign(ea, getIReg64orSP(nn));
6631      /* FIXME generate check that ea is szB-aligned */
6632
6633      if (isLD) {
6634         IRTemp res = newTemp(ty);
6635         assign(res, loadLE(ty, mkexpr(ea)));
6636         putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
6637         stmt(IRStmt_MBE(Imbe_Fence));
6638         DIP("lda%s %s, [%s]\n", suffix[szBlg2],
6639             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
6640      } else {
6641         stmt(IRStmt_MBE(Imbe_Fence));
6642         IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
6643         storeLE(mkexpr(ea), data);
6644         DIP("stl%s %s, [%s]\n", suffix[szBlg2],
6645             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
6646      }
6647      return True;
6648   }
6649
   /* The PRFM cases that follow possibly allow Rt values (the
      prefetch operation) which are not allowed by the documentation.
      This should be looked into. */
6653   /* ------------------ PRFM (immediate) ------------------ */
6654   /* 31           21    9 4
6655      11 111 00110 imm12 n t   PRFM pfrop=Rt, [Xn|SP, #pimm]
6656   */
6657   if (INSN(31,22) == BITS10(1,1,1,1,1,0,0,1,1,0)) {
6658      UInt imm12 = INSN(21,10);
6659      UInt nn    = INSN(9,5);
6660      UInt tt    = INSN(4,0);
6661      /* Generating any IR here is pointless, except for documentation
6662         purposes, as it will get optimised away later. */
6663      IRTemp ea = newTemp(Ity_I64);
6664      assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(imm12 * 8)));
6665      DIP("prfm prfop=%u, [%s, #%u]\n", tt, nameIReg64orSP(nn), imm12 * 8);
6666      return True;
6667   }
6668
6669   /* ------------------ PRFM (register) ------------------ */
6670   /* 31 29      22 20 15  12 11 9  4
6671      11 1110001 01 Rm opt S  10 Rn Rt    PRFM pfrop=Rt, [Xn|SP, R<m>{ext/sh}]
6672   */
6673   if (INSN(31,21) == BITS11(1,1,1,1,1,0,0,0,1,0,1)
6674       && INSN(11,10) == BITS2(1,0)) {
6675      HChar  dis_buf[64];
6676      UInt   tt = INSN(4,0);
6677      IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
6678      if (ea != IRTemp_INVALID) {
6679         /* No actual code to generate. */
6680         DIP("prfm prfop=%u, %s\n", tt, dis_buf);
6681         return True;
6682      }
6683   }
6684
6685   /* ------------------ PRFM (unscaled offset) ------------------ */
6686   /* 31 29      22 20   11 9  4
6687      11 1110001 00 imm9 00 Rn Rt    PRFM pfrop=Rt, [Xn|SP, #simm]
6688   */
6689   if (INSN(31,21) == BITS11(1,1, 1,1,1,0,0,0,1, 0,0)
6690       && INSN(11,10) == BITS2(0,0)) {
6691      ULong  imm9   = INSN(20,12);
6692      UInt   nn     = INSN(9,5);
6693      UInt   tt     = INSN(4,0);
6694      ULong  offset = sx_to_64(imm9, 9);
6695      IRTemp ea     = newTemp(Ity_I64);
6696      assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offset)));
6697      /* No actual code to generate. */
6698      DIP("prfum prfop=%u, [%s, #0x%llx]\n", tt, nameIReg64orSP(nn), offset);
6699      return True;
6700   }
6701
6702   vex_printf("ARM64 front end: load_store\n");
6703   return False;
6704#  undef INSN
6705}
6706
6707
6708/*------------------------------------------------------------*/
6709/*--- Control flow and misc instructions                   ---*/
6710/*------------------------------------------------------------*/
6711
6712static
6713Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
6714                          const VexArchInfo* archinfo,
6715                          const VexAbiInfo* abiinfo)
6716{
6717#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
6718
6719   /* ---------------------- B cond ----------------------- */
6720   /* 31        24    4 3
6721      0101010 0 imm19 0 cond */
6722   if (INSN(31,24) == BITS8(0,1,0,1,0,1,0,0) && INSN(4,4) == 0) {
6723      UInt  cond   = INSN(3,0);
6724      ULong uimm64 = INSN(23,5) << 2;
6725      Long  simm64 = (Long)sx_to_64(uimm64, 21);
6726      vassert(dres->whatNext    == Dis_Continue);
6727      vassert(dres->len         == 4);
6728      vassert(dres->continueAt  == 0);
6729      vassert(dres->jk_StopHere == Ijk_INVALID);
6730      stmt( IRStmt_Exit(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
6731                        Ijk_Boring,
6732                        IRConst_U64(guest_PC_curr_instr + simm64),
6733                        OFFB_PC) );
6734      putPC(mkU64(guest_PC_curr_instr + 4));
6735      dres->whatNext    = Dis_StopHere;
6736      dres->jk_StopHere = Ijk_Boring;
6737      DIP("b.%s 0x%llx\n", nameCC(cond), guest_PC_curr_instr + simm64);
6738      return True;
6739   }
6740
6741   /* -------------------- B{L} uncond -------------------- */
6742   if (INSN(30,26) == BITS5(0,0,1,0,1)) {
6743      /* 000101 imm26  B  (PC + sxTo64(imm26 << 2))
         100101 imm26  BL (PC + sxTo64(imm26 << 2))
6745      */
6746      UInt  bLink  = INSN(31,31);
6747      ULong uimm64 = INSN(25,0) << 2;
6748      Long  simm64 = (Long)sx_to_64(uimm64, 28);
6749      if (bLink) {
6750         putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
6751      }
6752      putPC(mkU64(guest_PC_curr_instr + simm64));
6753      dres->whatNext = Dis_StopHere;
6754      dres->jk_StopHere = Ijk_Call;
6755      DIP("b%s 0x%llx\n", bLink == 1 ? "l" : "",
6756                          guest_PC_curr_instr + simm64);
6757      return True;
6758   }
6759
6760   /* --------------------- B{L} reg --------------------- */
6761   /* 31      24 22 20    15     9  4
6762      1101011 00 10 11111 000000 nn 00000  RET  Rn
6763      1101011 00 01 11111 000000 nn 00000  CALL Rn
6764      1101011 00 00 11111 000000 nn 00000  JMP  Rn
6765   */
6766   if (INSN(31,23) == BITS9(1,1,0,1,0,1,1,0,0)
6767       && INSN(20,16) == BITS5(1,1,1,1,1)
6768       && INSN(15,10) == BITS6(0,0,0,0,0,0)
6769       && INSN(4,0) == BITS5(0,0,0,0,0)) {
6770      UInt branch_type = INSN(22,21);
6771      UInt nn          = INSN(9,5);
6772      if (branch_type == BITS2(1,0) /* RET */) {
6773         putPC(getIReg64orZR(nn));
6774         dres->whatNext = Dis_StopHere;
6775         dres->jk_StopHere = Ijk_Ret;
6776         DIP("ret %s\n", nameIReg64orZR(nn));
6777         return True;
6778      }
6779      if (branch_type == BITS2(0,1) /* CALL */) {
6780         IRTemp dst = newTemp(Ity_I64);
6781         assign(dst, getIReg64orZR(nn));
6782         putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
6783         putPC(mkexpr(dst));
6784         dres->whatNext = Dis_StopHere;
6785         dres->jk_StopHere = Ijk_Call;
6786         DIP("blr %s\n", nameIReg64orZR(nn));
6787         return True;
6788      }
6789      if (branch_type == BITS2(0,0) /* JMP */) {
6790         putPC(getIReg64orZR(nn));
6791         dres->whatNext = Dis_StopHere;
6792         dres->jk_StopHere = Ijk_Boring;
6793         DIP("jmp %s\n", nameIReg64orZR(nn));
6794         return True;
6795      }
6796   }
6797
6798   /* -------------------- CB{N}Z -------------------- */
6799   /* sf 011 010 1 imm19 Rt   CBNZ Xt|Wt, (PC + sxTo64(imm19 << 2))
6800      sf 011 010 0 imm19 Rt   CBZ  Xt|Wt, (PC + sxTo64(imm19 << 2))
6801   */
6802   if (INSN(30,25) == BITS6(0,1,1,0,1,0)) {
6803      Bool    is64   = INSN(31,31) == 1;
6804      Bool    bIfZ   = INSN(24,24) == 0;
6805      ULong   uimm64 = INSN(23,5) << 2;
6806      UInt    rT     = INSN(4,0);
6807      Long    simm64 = (Long)sx_to_64(uimm64, 21);
6808      IRExpr* cond   = NULL;
6809      if (is64) {
6810         cond = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
6811                      getIReg64orZR(rT), mkU64(0));
6812      } else {
6813         cond = binop(bIfZ ? Iop_CmpEQ32 : Iop_CmpNE32,
6814                      getIReg32orZR(rT), mkU32(0));
6815      }
6816      stmt( IRStmt_Exit(cond,
6817                        Ijk_Boring,
6818                        IRConst_U64(guest_PC_curr_instr + simm64),
6819                        OFFB_PC) );
6820      putPC(mkU64(guest_PC_curr_instr + 4));
6821      dres->whatNext    = Dis_StopHere;
6822      dres->jk_StopHere = Ijk_Boring;
6823      DIP("cb%sz %s, 0x%llx\n",
6824          bIfZ ? "" : "n", nameIRegOrZR(is64, rT),
6825          guest_PC_curr_instr + simm64);
6826      return True;
6827   }
6828
6829   /* -------------------- TB{N}Z -------------------- */
6830   /* 31 30      24 23  18  5 4
6831      b5 011 011 1  b40 imm14 t  TBNZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
6832      b5 011 011 0  b40 imm14 t  TBZ  Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
6833   */
6834   if (INSN(30,25) == BITS6(0,1,1,0,1,1)) {
6835      UInt    b5     = INSN(31,31);
6836      Bool    bIfZ   = INSN(24,24) == 0;
6837      UInt    b40    = INSN(23,19);
6838      UInt    imm14  = INSN(18,5);
6839      UInt    tt     = INSN(4,0);
6840      UInt    bitNo  = (b5 << 5) | b40;
6841      ULong   uimm64 = imm14 << 2;
      Long    simm64 = (Long)sx_to_64(uimm64, 16);
6843      IRExpr* cond
6844         = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
6845                 binop(Iop_And64,
6846                       binop(Iop_Shr64, getIReg64orZR(tt), mkU8(bitNo)),
6847                       mkU64(1)),
6848                 mkU64(0));
6849      stmt( IRStmt_Exit(cond,
6850                        Ijk_Boring,
6851                        IRConst_U64(guest_PC_curr_instr + simm64),
6852                        OFFB_PC) );
6853      putPC(mkU64(guest_PC_curr_instr + 4));
6854      dres->whatNext    = Dis_StopHere;
6855      dres->jk_StopHere = Ijk_Boring;
6856      DIP("tb%sz %s, #%u, 0x%llx\n",
6857          bIfZ ? "" : "n", nameIReg64orZR(tt), bitNo,
6858          guest_PC_curr_instr + simm64);
6859      return True;
6860   }
6861
6862   /* -------------------- SVC -------------------- */
6863   /* 11010100 000 imm16 000 01
6864      Don't bother with anything except the imm16==0 case.
6865   */
6866   if (INSN(31,0) == 0xD4000001) {
6867      putPC(mkU64(guest_PC_curr_instr + 4));
6868      dres->whatNext    = Dis_StopHere;
6869      dres->jk_StopHere = Ijk_Sys_syscall;
6870      DIP("svc #0\n");
6871      return True;
6872   }
6873
6874   /* ------------------ M{SR,RS} ------------------ */
6875   /* ---- Cases for TPIDR_EL0 ----
6876      0xD51BD0 010 Rt   MSR tpidr_el0, rT
6877      0xD53BD0 010 Rt   MRS rT, tpidr_el0
6878   */
6879   if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51BD040 /*MSR*/
6880       || (INSN(31,0) & 0xFFFFFFE0) == 0xD53BD040 /*MRS*/) {
6881      Bool toSys = INSN(21,21) == 0;
6882      UInt tt    = INSN(4,0);
6883      if (toSys) {
6884         stmt( IRStmt_Put( OFFB_TPIDR_EL0, getIReg64orZR(tt)) );
6885         DIP("msr tpidr_el0, %s\n", nameIReg64orZR(tt));
6886      } else {
6887         putIReg64orZR(tt, IRExpr_Get( OFFB_TPIDR_EL0, Ity_I64 ));
6888         DIP("mrs %s, tpidr_el0\n", nameIReg64orZR(tt));
6889      }
6890      return True;
6891   }
6892   /* ---- Cases for FPCR ----
6893      0xD51B44 000 Rt  MSR fpcr, rT
      0xD53B44 000 Rt  MRS rT, fpcr
6895   */
6896   if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4400 /*MSR*/
6897       || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4400 /*MRS*/) {
6898      Bool toSys = INSN(21,21) == 0;
6899      UInt tt    = INSN(4,0);
6900      if (toSys) {
6901         stmt( IRStmt_Put( OFFB_FPCR, getIReg32orZR(tt)) );
6902         DIP("msr fpcr, %s\n", nameIReg64orZR(tt));
6903      } else {
6904         putIReg32orZR(tt, IRExpr_Get(OFFB_FPCR, Ity_I32));
6905         DIP("mrs %s, fpcr\n", nameIReg64orZR(tt));
6906      }
6907      return True;
6908   }
6909   /* ---- Cases for FPSR ----
6910      0xD51B44 001 Rt  MSR fpsr, rT
      0xD53B44 001 Rt  MRS rT, fpsr
6912      The only part of this we model is FPSR.QC.  All other bits
6913      are ignored when writing to it and RAZ when reading from it.
6914   */
6915   if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4420 /*MSR*/
6916       || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4420 /*MRS*/) {
6917      Bool toSys = INSN(21,21) == 0;
6918      UInt tt    = INSN(4,0);
6919      if (toSys) {
6920         /* Just deal with FPSR.QC.  Make up a V128 value which is
6921            zero if Xt[27] is zero and any other value if Xt[27] is
6922            nonzero. */
6923         IRTemp qc64 = newTemp(Ity_I64);
6924         assign(qc64, binop(Iop_And64,
6925                            binop(Iop_Shr64, getIReg64orZR(tt), mkU8(27)),
6926                            mkU64(1)));
6927         IRExpr* qcV128 = binop(Iop_64HLtoV128, mkexpr(qc64), mkexpr(qc64));
6928         stmt( IRStmt_Put( OFFB_QCFLAG, qcV128 ) );
6929         DIP("msr fpsr, %s\n", nameIReg64orZR(tt));
6930      } else {
6931         /* Generate a value which is all zeroes except for bit 27,
6932            which must be zero if QCFLAG is all zeroes and one otherwise. */
6933         IRTemp qcV128 = newTempV128();
6934         assign(qcV128, IRExpr_Get( OFFB_QCFLAG, Ity_V128 ));
6935         IRTemp qc64 = newTemp(Ity_I64);
6936         assign(qc64, binop(Iop_Or64, unop(Iop_V128HIto64, mkexpr(qcV128)),
6937                                      unop(Iop_V128to64,   mkexpr(qcV128))));
6938         IRExpr* res = binop(Iop_Shl64,
6939                             unop(Iop_1Uto64,
6940                                  binop(Iop_CmpNE64, mkexpr(qc64), mkU64(0))),
6941                             mkU8(27));
6942         putIReg64orZR(tt, res);
6943         DIP("mrs %s, fpsr\n", nameIReg64orZR(tt));
6944      }
6945      return True;
6946   }
6947   /* ---- Cases for NZCV ----
      0xD51B42 000 Rt  MSR nzcv, rT
      0xD53B42 000 Rt  MRS rT, nzcv
6950      The only parts of NZCV that actually exist are bits 31:28, which
6951      are the N Z C and V bits themselves.  Hence the flags thunk provides
6952      all the state we need.
6953   */
6954   if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4200 /*MSR*/
6955       || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4200 /*MRS*/) {
6956      Bool  toSys = INSN(21,21) == 0;
6957      UInt  tt    = INSN(4,0);
6958      if (toSys) {
6959         IRTemp t = newTemp(Ity_I64);
6960         assign(t, binop(Iop_And64, getIReg64orZR(tt), mkU64(0xF0000000ULL)));
6961         setFlags_COPY(t);
6962         DIP("msr %s, nzcv\n", nameIReg32orZR(tt));
6963      } else {
6964         IRTemp res = newTemp(Ity_I64);
6965         assign(res, mk_arm64g_calculate_flags_nzcv());
6966         putIReg32orZR(tt, unop(Iop_64to32, mkexpr(res)));
6967         DIP("mrs %s, nzcv\n", nameIReg64orZR(tt));
6968      }
6969      return True;
6970   }
6971   /* ---- Cases for DCZID_EL0 ----
6972      Don't support arbitrary reads and writes to this register.  Just
6973      return the value 16, which indicates that the DC ZVA instruction
6974      is not permitted, so we don't have to emulate it.
6975      D5 3B 00 111 Rt  MRS rT, dczid_el0
6976   */
6977   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B00E0) {
6978      UInt tt = INSN(4,0);
6979      putIReg64orZR(tt, mkU64(1<<4));
6980      DIP("mrs %s, dczid_el0 (FAKED)\n", nameIReg64orZR(tt));
6981      return True;
6982   }
6983   /* ---- Cases for CTR_EL0 ----
6984      We just handle reads, and make up a value from the D and I line
6985      sizes in the VexArchInfo we are given, and patch in the following
6986      fields that the Foundation model gives ("natively"):
6987      CWG = 0b0100, ERG = 0b0100, L1Ip = 0b11
      D5 3B 00 001 Rt  MRS rT, ctr_el0
6989   */
6990   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B0020) {
6991      UInt tt = INSN(4,0);
      /* Need to generate a value from dMinLine_lg2_szB and
         iMinLine_lg2_szB.  The value in the register is in 32-bit
6994         units, so need to subtract 2 from the values in the
6995         VexArchInfo.  We can assume that the values here are valid --
6996         disInstr_ARM64 checks them -- so there's no need to deal with
6997         out-of-range cases. */
6998      vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
6999              && archinfo->arm64_dMinLine_lg2_szB <= 17
7000              && archinfo->arm64_iMinLine_lg2_szB >= 2
7001              && archinfo->arm64_iMinLine_lg2_szB <= 17);
7002      UInt val
7003         = 0x8440c000 | ((0xF & (archinfo->arm64_dMinLine_lg2_szB - 2)) << 16)
7004                      | ((0xF & (archinfo->arm64_iMinLine_lg2_szB - 2)) << 0);
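      /* Example (a sketch): 64-byte D and I lines give both
         *MinLine_lg2_szB values as 6, hence field values of 6 - 2 = 4
         (log2 of the line size in 32-bit words), and so
         val = 0x8440c000 | (4 << 16) | 4 = 0x8444c004. */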
7005      putIReg64orZR(tt, mkU64(val));
7006      DIP("mrs %s, ctr_el0\n", nameIReg64orZR(tt));
7007      return True;
7008   }
7009   /* ---- Cases for CNTVCT_EL0 ----
7010      This is a timestamp counter of some sort.  Support reads of it only
7011      by passing through to the host.
7012      D5 3B E0 010 Rt  MRS Xt, cntvct_el0
7013   */
7014   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE040) {
7015      UInt     tt   = INSN(4,0);
7016      IRTemp   val  = newTemp(Ity_I64);
7017      IRExpr** args = mkIRExprVec_0();
7018      IRDirty* d    = unsafeIRDirty_1_N (
7019                         val,
7020                         0/*regparms*/,
7021                         "arm64g_dirtyhelper_MRS_CNTVCT_EL0",
7022                         &arm64g_dirtyhelper_MRS_CNTVCT_EL0,
7023                         args
7024                      );
7025      /* execute the dirty call, dumping the result in val. */
7026      stmt( IRStmt_Dirty(d) );
7027      putIReg64orZR(tt, mkexpr(val));
7028      DIP("mrs %s, cntvct_el0\n", nameIReg64orZR(tt));
7029      return True;
7030   }
7031   /* ---- Cases for CNTFRQ_EL0 ----
7032      This is always RO at EL0, so it's safe to pass through to the host.
7033      D5 3B E0 000 Rt  MRS Xt, cntfrq_el0
7034   */
7035   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE000) {
7036      UInt     tt   = INSN(4,0);
7037      IRTemp   val  = newTemp(Ity_I64);
7038      IRExpr** args = mkIRExprVec_0();
7039      IRDirty* d    = unsafeIRDirty_1_N (
7040                         val,
7041                         0/*regparms*/,
7042                         "arm64g_dirtyhelper_MRS_CNTFRQ_EL0",
7043                         &arm64g_dirtyhelper_MRS_CNTFRQ_EL0,
7044                         args
7045                      );
7046      /* execute the dirty call, dumping the result in val. */
7047      stmt( IRStmt_Dirty(d) );
7048      putIReg64orZR(tt, mkexpr(val));
7049      DIP("mrs %s, cntfrq_el0\n", nameIReg64orZR(tt));
7050      return True;
7051   }
7052
7053   /* ------------------ IC_IVAU ------------------ */
7054   /* D5 0B 75 001 Rt  ic ivau, rT
7055   */
7056   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7520) {
7057      /* We will always be provided with a valid iMinLine value. */
7058      vassert(archinfo->arm64_iMinLine_lg2_szB >= 2
7059              && archinfo->arm64_iMinLine_lg2_szB <= 17);
7060      /* Round the requested address, in rT, down to the start of the
7061         containing block. */
7062      UInt   tt      = INSN(4,0);
7063      ULong  lineszB = 1ULL << archinfo->arm64_iMinLine_lg2_szB;
7064      IRTemp addr    = newTemp(Ity_I64);
7065      assign( addr, binop( Iop_And64,
7066                           getIReg64orZR(tt),
7067                           mkU64(~(lineszB - 1))) );
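      /* Example (a sketch): with iMinLine_lg2_szB = 6, lineszB is 64
         and the mask is ~63ULL, so an rT value of 0x100F rounds down
         to 0x1000. */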
7068      /* Set the invalidation range, request exit-and-invalidate, with
7069         continuation at the next instruction. */
7070      stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
7071      stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(lineszB)));
7072      /* be paranoid ... */
7073      stmt( IRStmt_MBE(Imbe_Fence) );
7074      putPC(mkU64( guest_PC_curr_instr + 4 ));
7075      dres->whatNext    = Dis_StopHere;
7076      dres->jk_StopHere = Ijk_InvalICache;
7077      DIP("ic ivau, %s\n", nameIReg64orZR(tt));
7078      return True;
7079   }
7080
7081   /* ------------------ DC_CVAU ------------------ */
7082   /* D5 0B 7B 001 Rt  dc cvau, rT
7083   */
7084   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7B20) {
7085      /* Exactly the same scheme as for IC IVAU, except we observe the
7086         dMinLine size, and request an Ijk_FlushDCache instead of
7087         Ijk_InvalICache. */
7088      /* We will always be provided with a valid dMinLine value. */
7089      vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
7090              && archinfo->arm64_dMinLine_lg2_szB <= 17);
7091      /* Round the requested address, in rT, down to the start of the
7092         containing block. */
7093      UInt   tt      = INSN(4,0);
7094      ULong  lineszB = 1ULL << archinfo->arm64_dMinLine_lg2_szB;
7095      IRTemp addr    = newTemp(Ity_I64);
7096      assign( addr, binop( Iop_And64,
7097                           getIReg64orZR(tt),
7098                           mkU64(~(lineszB - 1))) );
7099      /* Set the flush range, request exit-and-flush, with
7100         continuation at the next instruction. */
7101      stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
7102      stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(lineszB)));
7103      /* be paranoid ... */
7104      stmt( IRStmt_MBE(Imbe_Fence) );
7105      putPC(mkU64( guest_PC_curr_instr + 4 ));
7106      dres->whatNext    = Dis_StopHere;
7107      dres->jk_StopHere = Ijk_FlushDCache;
7108      DIP("dc cvau, %s\n", nameIReg64orZR(tt));
7109      return True;
7110   }
7111
7112   /* ------------------ ISB, DMB, DSB ------------------ */
7113   /* 31          21            11  7 6  4
7114      11010 10100 0 00 011 0011 CRm 1 01 11111  DMB opt
7115      11010 10100 0 00 011 0011 CRm 1 00 11111  DSB opt
7116      11010 10100 0 00 011 0011 CRm 1 10 11111  ISB opt
7117   */
7118   if (INSN(31,22) == BITS10(1,1,0,1,0,1,0,1,0,0)
7119       && INSN(21,12) == BITS10(0,0,0,0,1,1,0,0,1,1)
7120       && INSN(7,7) == 1
7121       && INSN(6,5) <= BITS2(1,0) && INSN(4,0) == BITS5(1,1,1,1,1)) {
7122      UInt opc = INSN(6,5);
7123      UInt CRm = INSN(11,8);
7124      vassert(opc <= 2 && CRm <= 15);
7125      stmt(IRStmt_MBE(Imbe_Fence));
7126      const HChar* opNames[3]
7127         = { "dsb", "dmb", "isb" };
7128      const HChar* howNames[16]
7129         = { "#0", "oshld", "oshst", "osh", "#4", "nshld", "nshst", "nsh",
7130             "#8", "ishld", "ishst", "ish", "#12", "ld", "st", "sy" };
7131      DIP("%s %s\n", opNames[opc], howNames[CRm]);
7132      return True;
7133   }
7134
7135   /* -------------------- NOP -------------------- */
7136   if (INSN(31,0) == 0xD503201F) {
7137      DIP("nop\n");
7138      return True;
7139   }
7140
7141   /* -------------------- BRK -------------------- */
7142   /* 31        23  20    4
7143      1101 0100 001 imm16 00000  BRK #imm16
7144   */
7145   if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,0)
7146       && INSN(23,21) == BITS3(0,0,1) && INSN(4,0) == BITS5(0,0,0,0,0)) {
7147      UInt imm16 = INSN(20,5);
7148      /* Request SIGTRAP and then restart of this insn. */
7149      putPC(mkU64(guest_PC_curr_instr + 0));
7150      dres->whatNext    = Dis_StopHere;
7151      dres->jk_StopHere = Ijk_SigTRAP;
7152      DIP("brk #%u\n", imm16);
7153      return True;
7154   }
7155
7156   /* ------------------- YIELD ------------------- */
7157   /* 31        23        15        7
7158      1101 0101 0000 0011 0010 0000 0011 1111
7159   */
7160   if (INSN(31,0) == 0xD503203F) {
7161      /* Request yield followed by continuation at the next insn. */
7162      putPC(mkU64(guest_PC_curr_instr + 4));
7163      dres->whatNext    = Dis_StopHere;
7164      dres->jk_StopHere = Ijk_Yield;
7165      DIP("yield\n");
7166      return True;
7167   }
7168
7169   /* -------------------- HINT ------------------- */
7170   /* 31        23        15   11   4 3
7171      1101 0101 0000 0011 0010 imm7 1 1111
7172      Catch otherwise unhandled HINT instructions - any
7173      like YIELD which are explicitly handled should go
7174      above this case.
7175   */
7176   if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,1)
7177       && INSN(23,16) == BITS8(0,0,0,0,0,0,1,1)
7178       && INSN(15,12) == BITS4(0,0,1,0)
7179       && INSN(4,0) == BITS5(1,1,1,1,1)) {
7180      UInt imm7 = INSN(11,5);
7181      DIP("hint #%u\n", imm7);
7182      return True;
7183   }
7184
7185   /* ------------------- CLREX ------------------ */
7186   /* 31        23        15   11 7
7187      1101 0101 0000 0011 0011 m  0101 1111  CLREX CRm
7188      CRm is apparently ignored.
7189   */
7190   if ((INSN(31,0) & 0xFFFFF0FF) == 0xD503305F) {
7191      UInt mm = INSN(11,8);
7192      /* AFAICS, this simply cancels a (all?) reservations made by a
7193         (any?) preceding LDREX(es).  Arrange to hand it through to
7194         the back end. */
7195      if (abiinfo->guest__use_fallback_LLSC) {
7196         stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) )); // "no transaction"
7197      } else {
7198         stmt( IRStmt_MBE(Imbe_CancelReservation) );
7199      }
7200      DIP("clrex #%u\n", mm);
7201      return True;
7202   }
7203
7204   vex_printf("ARM64 front end: branch_etc\n");
7205   return False;
7206#  undef INSN
7207}
7208
7209
7210/*------------------------------------------------------------*/
7211/*--- SIMD and FP instructions: helper functions           ---*/
7212/*------------------------------------------------------------*/
7213
7214/* Some constructors for interleave/deinterleave expressions. */
7215
7216static IRExpr* mk_CatEvenLanes64x2 ( IRTemp a10, IRTemp b10 ) {
7217   // returns a0 b0
7218   return binop(Iop_InterleaveLO64x2, mkexpr(a10), mkexpr(b10));
7219}
7220
7221static IRExpr* mk_CatOddLanes64x2 ( IRTemp a10, IRTemp b10 ) {
7222   // returns a1 b1
7223   return binop(Iop_InterleaveHI64x2, mkexpr(a10), mkexpr(b10));
7224}
7225
7226static IRExpr* mk_CatEvenLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
7227   // returns a2 a0 b2 b0
7228   return binop(Iop_CatEvenLanes32x4, mkexpr(a3210), mkexpr(b3210));
7229}
7230
7231static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
7232   // returns a3 a1 b3 b1
7233   return binop(Iop_CatOddLanes32x4, mkexpr(a3210), mkexpr(b3210));
7234}
7235
7236static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) {
7237   // returns a1 b1 a0 b0
7238   return binop(Iop_InterleaveLO32x4, mkexpr(a3210), mkexpr(b3210));
7239}
7240
7241static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) {
7242   // returns a3 b3 a2 b2
7243   return binop(Iop_InterleaveHI32x4, mkexpr(a3210), mkexpr(b3210));
7244}
7245
7246static IRExpr* mk_CatEvenLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7247   // returns a6 a4 a2 a0 b6 b4 b2 b0
7248   return binop(Iop_CatEvenLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
7249}
7250
7251static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7252   // returns a7 a5 a3 a1 b7 b5 b3 b1
7253   return binop(Iop_CatOddLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
7254}
7255
7256static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7257   // returns a3 b3 a2 b2 a1 b1 a0 b0
7258   return binop(Iop_InterleaveLO16x8, mkexpr(a76543210), mkexpr(b76543210));
7259}
7260
7261static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7262   // returns a7 b7 a6 b6 a5 b5 a4 b4
7263   return binop(Iop_InterleaveHI16x8, mkexpr(a76543210), mkexpr(b76543210));
7264}
7265
7266static IRExpr* mk_CatEvenLanes8x16 ( IRTemp aFEDCBA9876543210,
7267                                     IRTemp bFEDCBA9876543210 ) {
7268   // returns aE aC aA a8 a6 a4 a2 a0 bE bC bA b8 b6 b4 b2 b0
7269   return binop(Iop_CatEvenLanes8x16, mkexpr(aFEDCBA9876543210),
7270                                      mkexpr(bFEDCBA9876543210));
7271}
7272
7273static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210,
7274                                    IRTemp bFEDCBA9876543210 ) {
7275   // returns aF aD aB a9 a7 a5 a3 a1 bF bD bB b9 b7 b5 b3 b1
7276   return binop(Iop_CatOddLanes8x16, mkexpr(aFEDCBA9876543210),
7277                                     mkexpr(bFEDCBA9876543210));
7278}
7279
7280static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210,
7281                                     IRTemp bFEDCBA9876543210 ) {
7282   // returns a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
7283   return binop(Iop_InterleaveLO8x16, mkexpr(aFEDCBA9876543210),
7284                                      mkexpr(bFEDCBA9876543210));
7285}
7286
7287static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210,
7288                                     IRTemp bFEDCBA9876543210 ) {
7289   // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8
7290   return binop(Iop_InterleaveHI8x16, mkexpr(aFEDCBA9876543210),
7291                                      mkexpr(bFEDCBA9876543210));
7292}
7293
7294/* Generate N copies of |bit| in the bottom of a ULong. */
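/* For example, Replicate(1,3) == 0b111 == 7, and Replicate(0,N) == 0
   for any N. */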
7295static ULong Replicate ( ULong bit, Int N )
7296{
7297   vassert(bit <= 1 && N >= 1 && N < 64);
7298   if (bit == 0) {
7299      return 0;
   } else {
7301      /* Careful.  This won't work for N == 64. */
7302      return (1ULL << N) - 1;
7303   }
7304}
7305
7306static ULong Replicate32x2 ( ULong bits32 )
7307{
7308   vassert(0 == (bits32 & ~0xFFFFFFFFULL));
7309   return (bits32 << 32) | bits32;
7310}
7311
7312static ULong Replicate16x4 ( ULong bits16 )
7313{
7314   vassert(0 == (bits16 & ~0xFFFFULL));
7315   return Replicate32x2((bits16 << 16) | bits16);
7316}
7317
7318static ULong Replicate8x8 ( ULong bits8 )
7319{
7320   vassert(0 == (bits8 & ~0xFFULL));
7321   return Replicate16x4((bits8 << 8) | bits8);
7322}
7323
7324/* Expand the VFPExpandImm-style encoding in the bottom 8 bits of
7325   |imm8| to either a 32-bit value if N is 32 or a 64 bit value if N
7326   is 64.  In the former case, the upper 32 bits of the returned value
7327   are guaranteed to be zero. */
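/* Worked example (illustrative): VFPExpandImm(0x70, 32) has sign == 0,
   imm8<6> == 1 and imm8<5:0> == 0b110000, so E == 6, F == 25,
   exp == 0b011111 and frac == 48 << 19, giving
   (31 << 25) | (48 << 19) == 0x3F800000, which is 1.0 as an IEEE754
   single -- the architected FMOV-immediate encoding of 1.0. */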
7328static ULong VFPExpandImm ( ULong imm8, Int N )
7329{
7330   vassert(imm8 <= 0xFF);
7331   vassert(N == 32 || N == 64);
7332   Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2.
7333   Int F = N - E - 1;
7334   ULong imm8_6 = (imm8 >> 6) & 1;
7335   /* sign: 1 bit */
7336   /* exp:  E bits */
7337   /* frac: F bits */
7338   ULong sign = (imm8 >> 7) & 1;
7339   ULong exp  = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1);
7340   ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6);
7341   vassert(sign < (1ULL << 1));
7342   vassert(exp  < (1ULL << E));
7343   vassert(frac < (1ULL << F));
7344   vassert(1 + E + F == N);
7345   ULong res = (sign << (E+F)) | (exp << F) | frac;
7346   return res;
7347}
7348
7349/* Expand an AdvSIMDExpandImm-style encoding into a 64-bit value.
7350   This might fail, as indicated by the returned Bool.  Page 2530 of
7351   the manual. */
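/* For example, op == 0, cmode == 0b0010, imm8 == 0xAB takes the
   "cmode >> 1 == 1" arm below and produces
   Replicate32x2(0xAB00) == 0x0000AB000000AB00, with testimm8 == True
   (so imm8 == 0 would have been rejected). */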
7352static Bool AdvSIMDExpandImm ( /*OUT*/ULong* res,
7353                               UInt op, UInt cmode, UInt imm8 )
7354{
7355   vassert(op <= 1);
7356   vassert(cmode <= 15);
7357   vassert(imm8 <= 255);
7358
7359   *res = 0; /* will overwrite iff returning True */
7360
7361   ULong imm64    = 0;
7362   Bool  testimm8 = False;
7363
7364   switch (cmode >> 1) {
7365      case 0:
7366         testimm8 = False; imm64 = Replicate32x2(imm8); break;
7367      case 1:
7368         testimm8 = True; imm64 = Replicate32x2(imm8 << 8); break;
7369      case 2:
7370         testimm8 = True; imm64 = Replicate32x2(imm8 << 16); break;
7371      case 3:
7372         testimm8 = True; imm64 = Replicate32x2(imm8 << 24); break;
      case 4:
         testimm8 = False; imm64 = Replicate16x4(imm8); break;
      case 5:
         testimm8 = True; imm64 = Replicate16x4(imm8 << 8); break;
      case 6:
         testimm8 = True;
         if ((cmode & 1) == 0)
            imm64 = Replicate32x2((imm8 << 8) | 0xFF);
         else
            imm64 = Replicate32x2((imm8 << 16) | 0xFFFF);
         break;
      case 7:
         testimm8 = False;
         if ((cmode & 1) == 0 && op == 0)
            imm64 = Replicate8x8(imm8);
         if ((cmode & 1) == 0 && op == 1) {
            imm64 = 0;   imm64 |= (imm8 & 0x80) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x40) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x20) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x10) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x08) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x04) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x02) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x01) ? 0xFF : 0x00;
         }
7398         if ((cmode & 1) == 1 && op == 0) {
7399            ULong imm8_7  = (imm8 >> 7) & 1;
7400            ULong imm8_6  = (imm8 >> 6) & 1;
7401            ULong imm8_50 = imm8 & 63;
7402            ULong imm32 = (imm8_7                 << (1 + 5 + 6 + 19))
7403                          | ((imm8_6 ^ 1)         << (5 + 6 + 19))
7404                          | (Replicate(imm8_6, 5) << (6 + 19))
7405                          | (imm8_50              << 19);
7406            imm64 = Replicate32x2(imm32);
7407         }
7408         if ((cmode & 1) == 1 && op == 1) {
7409            // imm64 = imm8<7>:NOT(imm8<6>)
7410            //                :Replicate(imm8<6>,8):imm8<5:0>:Zeros(48);
7411            ULong imm8_7  = (imm8 >> 7) & 1;
7412            ULong imm8_6  = (imm8 >> 6) & 1;
7413            ULong imm8_50 = imm8 & 63;
7414            imm64 = (imm8_7 << 63) | ((imm8_6 ^ 1) << 62)
7415                    | (Replicate(imm8_6, 8) << 54)
7416                    | (imm8_50 << 48);
7417         }
7418         break;
      default:
         vassert(0);
7421   }
7422
7423   if (testimm8 && imm8 == 0)
7424      return False;
7425
7426   *res = imm64;
7427   return True;
7428}
7429
7430/* Help a bit for decoding laneage for vector operations that can be
7431   of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q
7432   and SZ bits, typically for vector floating point. */
7433static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI,  /*OUT*/IRType* tyF,
7434                               /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper,
7435                               /*OUT*/const HChar** arrSpec,
7436                               Bool bitQ, Bool bitSZ )
7437{
7438   vassert(bitQ == True || bitQ == False);
7439   vassert(bitSZ == True || bitSZ == False);
7440   if (bitQ && bitSZ) { // 2x64
7441      if (tyI)       *tyI       = Ity_I64;
7442      if (tyF)       *tyF       = Ity_F64;
7443      if (nLanes)    *nLanes    = 2;
7444      if (zeroUpper) *zeroUpper = False;
7445      if (arrSpec)   *arrSpec   = "2d";
7446      return True;
7447   }
7448   if (bitQ && !bitSZ) { // 4x32
7449      if (tyI)       *tyI       = Ity_I32;
7450      if (tyF)       *tyF       = Ity_F32;
7451      if (nLanes)    *nLanes    = 4;
7452      if (zeroUpper) *zeroUpper = False;
7453      if (arrSpec)   *arrSpec   = "4s";
7454      return True;
7455   }
7456   if (!bitQ && !bitSZ) { // 2x32
7457      if (tyI)       *tyI       = Ity_I32;
7458      if (tyF)       *tyF       = Ity_F32;
7459      if (nLanes)    *nLanes    = 2;
7460      if (zeroUpper) *zeroUpper = True;
7461      if (arrSpec)   *arrSpec   = "2s";
7462      return True;
7463   }
7464   // Else impliedly 1x64, which isn't allowed.
7465   return False;
7466}
7467
7468/* Helper for decoding laneage for shift-style vector operations
7469   that involve an immediate shift amount. */
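/* For example, immh == 0b0010, immb == 0b101 gives immhb == 21, hence
   16-bit lanes (szBlg2 == 1) and a shift amount of 32 - 21 == 11. */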
7470static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2,
7471                                    UInt immh, UInt immb )
7472{
7473   vassert(immh < (1<<4));
7474   vassert(immb < (1<<3));
7475   UInt immhb = (immh << 3) | immb;
7476   if (immh & 8) {
7477      if (shift)  *shift  = 128 - immhb;
7478      if (szBlg2) *szBlg2 = 3;
7479      return True;
7480   }
7481   if (immh & 4) {
7482      if (shift)  *shift  = 64 - immhb;
7483      if (szBlg2) *szBlg2 = 2;
7484      return True;
7485   }
7486   if (immh & 2) {
7487      if (shift)  *shift  = 32 - immhb;
7488      if (szBlg2) *szBlg2 = 1;
7489      return True;
7490   }
7491   if (immh & 1) {
7492      if (shift)  *shift  = 16 - immhb;
7493      if (szBlg2) *szBlg2 = 0;
7494      return True;
7495   }
7496   return False;
7497}
7498
7499/* Generate IR to fold all lanes of the V128 value in 'src' as
7500   characterised by the operator 'op', and return the result in the
7501   bottom bits of a V128, with all other bits set to zero. */
7502static IRTemp math_FOLDV ( IRTemp src, IROp op )
7503{
7504   /* The basic idea is to use repeated applications of Iop_CatEven*
7505      and Iop_CatOdd* operators to 'src' so as to clone each lane into
7506      a complete vector.  Then fold all those vectors with 'op' and
7507      zero out all but the least significant lane. */
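   /* For example, folding x3:x2:x1:x0 with Iop_Add32x4 first builds
      the four vectors x3:x3:x3:x3 .. x0:x0:x0:x0, adds them pairwise
      to get x3+x2 and x1+x0 in every lane, adds once more to get
      x3+x2+x1+x0 in every lane, and finally zeroes all lanes except
      lane 0. */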
7508   switch (op) {
7509      case Iop_Min8Sx16: case Iop_Min8Ux16:
7510      case Iop_Max8Sx16: case Iop_Max8Ux16: case Iop_Add8x16: {
7511         /* NB: temp naming here is misleading -- the naming is for 8
7512            lanes of 16 bit, whereas what is being operated on is 16
7513            lanes of 8 bits. */
7514         IRTemp x76543210 = src;
7515         IRTemp x76547654 = newTempV128();
7516         IRTemp x32103210 = newTempV128();
7517         assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
7518         assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
7519         IRTemp x76767676 = newTempV128();
7520         IRTemp x54545454 = newTempV128();
7521         IRTemp x32323232 = newTempV128();
7522         IRTemp x10101010 = newTempV128();
7523         assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
7524         assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
7525         assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
7526         assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
7527         IRTemp x77777777 = newTempV128();
7528         IRTemp x66666666 = newTempV128();
7529         IRTemp x55555555 = newTempV128();
7530         IRTemp x44444444 = newTempV128();
7531         IRTemp x33333333 = newTempV128();
7532         IRTemp x22222222 = newTempV128();
7533         IRTemp x11111111 = newTempV128();
7534         IRTemp x00000000 = newTempV128();
7535         assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
7536         assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
7537         assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
7538         assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
7539         assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
7540         assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
7541         assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
7542         assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
7543         /* Naming not misleading after here. */
7544         IRTemp xAllF = newTempV128();
7545         IRTemp xAllE = newTempV128();
7546         IRTemp xAllD = newTempV128();
7547         IRTemp xAllC = newTempV128();
7548         IRTemp xAllB = newTempV128();
7549         IRTemp xAllA = newTempV128();
7550         IRTemp xAll9 = newTempV128();
7551         IRTemp xAll8 = newTempV128();
7552         IRTemp xAll7 = newTempV128();
7553         IRTemp xAll6 = newTempV128();
7554         IRTemp xAll5 = newTempV128();
7555         IRTemp xAll4 = newTempV128();
7556         IRTemp xAll3 = newTempV128();
7557         IRTemp xAll2 = newTempV128();
7558         IRTemp xAll1 = newTempV128();
7559         IRTemp xAll0 = newTempV128();
7560         assign(xAllF, mk_CatOddLanes8x16 (x77777777, x77777777));
7561         assign(xAllE, mk_CatEvenLanes8x16(x77777777, x77777777));
7562         assign(xAllD, mk_CatOddLanes8x16 (x66666666, x66666666));
7563         assign(xAllC, mk_CatEvenLanes8x16(x66666666, x66666666));
7564         assign(xAllB, mk_CatOddLanes8x16 (x55555555, x55555555));
7565         assign(xAllA, mk_CatEvenLanes8x16(x55555555, x55555555));
7566         assign(xAll9, mk_CatOddLanes8x16 (x44444444, x44444444));
7567         assign(xAll8, mk_CatEvenLanes8x16(x44444444, x44444444));
7568         assign(xAll7, mk_CatOddLanes8x16 (x33333333, x33333333));
7569         assign(xAll6, mk_CatEvenLanes8x16(x33333333, x33333333));
7570         assign(xAll5, mk_CatOddLanes8x16 (x22222222, x22222222));
7571         assign(xAll4, mk_CatEvenLanes8x16(x22222222, x22222222));
7572         assign(xAll3, mk_CatOddLanes8x16 (x11111111, x11111111));
7573         assign(xAll2, mk_CatEvenLanes8x16(x11111111, x11111111));
7574         assign(xAll1, mk_CatOddLanes8x16 (x00000000, x00000000));
7575         assign(xAll0, mk_CatEvenLanes8x16(x00000000, x00000000));
7576         IRTemp maxFE = newTempV128();
7577         IRTemp maxDC = newTempV128();
7578         IRTemp maxBA = newTempV128();
7579         IRTemp max98 = newTempV128();
7580         IRTemp max76 = newTempV128();
7581         IRTemp max54 = newTempV128();
7582         IRTemp max32 = newTempV128();
7583         IRTemp max10 = newTempV128();
7584         assign(maxFE, binop(op, mkexpr(xAllF), mkexpr(xAllE)));
7585         assign(maxDC, binop(op, mkexpr(xAllD), mkexpr(xAllC)));
7586         assign(maxBA, binop(op, mkexpr(xAllB), mkexpr(xAllA)));
7587         assign(max98, binop(op, mkexpr(xAll9), mkexpr(xAll8)));
7588         assign(max76, binop(op, mkexpr(xAll7), mkexpr(xAll6)));
7589         assign(max54, binop(op, mkexpr(xAll5), mkexpr(xAll4)));
7590         assign(max32, binop(op, mkexpr(xAll3), mkexpr(xAll2)));
7591         assign(max10, binop(op, mkexpr(xAll1), mkexpr(xAll0)));
7592         IRTemp maxFEDC = newTempV128();
7593         IRTemp maxBA98 = newTempV128();
7594         IRTemp max7654 = newTempV128();
7595         IRTemp max3210 = newTempV128();
7596         assign(maxFEDC, binop(op, mkexpr(maxFE), mkexpr(maxDC)));
7597         assign(maxBA98, binop(op, mkexpr(maxBA), mkexpr(max98)));
7598         assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
7599         assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7600         IRTemp maxFEDCBA98 = newTempV128();
7601         IRTemp max76543210 = newTempV128();
7602         assign(maxFEDCBA98, binop(op, mkexpr(maxFEDC), mkexpr(maxBA98)));
7603         assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
7604         IRTemp maxAllLanes = newTempV128();
7605         assign(maxAllLanes, binop(op, mkexpr(maxFEDCBA98),
7606                                       mkexpr(max76543210)));
7607         IRTemp res = newTempV128();
7608         assign(res, unop(Iop_ZeroHI120ofV128, mkexpr(maxAllLanes)));
7609         return res;
7610      }
7611      case Iop_Min16Sx8: case Iop_Min16Ux8:
7612      case Iop_Max16Sx8: case Iop_Max16Ux8: case Iop_Add16x8: {
7613         IRTemp x76543210 = src;
7614         IRTemp x76547654 = newTempV128();
7615         IRTemp x32103210 = newTempV128();
7616         assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
7617         assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
7618         IRTemp x76767676 = newTempV128();
7619         IRTemp x54545454 = newTempV128();
7620         IRTemp x32323232 = newTempV128();
7621         IRTemp x10101010 = newTempV128();
7622         assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
7623         assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
7624         assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
7625         assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
7626         IRTemp x77777777 = newTempV128();
7627         IRTemp x66666666 = newTempV128();
7628         IRTemp x55555555 = newTempV128();
7629         IRTemp x44444444 = newTempV128();
7630         IRTemp x33333333 = newTempV128();
7631         IRTemp x22222222 = newTempV128();
7632         IRTemp x11111111 = newTempV128();
7633         IRTemp x00000000 = newTempV128();
7634         assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
7635         assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
7636         assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
7637         assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
7638         assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
7639         assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
7640         assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
7641         assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
7642         IRTemp max76 = newTempV128();
7643         IRTemp max54 = newTempV128();
7644         IRTemp max32 = newTempV128();
7645         IRTemp max10 = newTempV128();
7646         assign(max76, binop(op, mkexpr(x77777777), mkexpr(x66666666)));
7647         assign(max54, binop(op, mkexpr(x55555555), mkexpr(x44444444)));
7648         assign(max32, binop(op, mkexpr(x33333333), mkexpr(x22222222)));
7649         assign(max10, binop(op, mkexpr(x11111111), mkexpr(x00000000)));
7650         IRTemp max7654 = newTempV128();
7651         IRTemp max3210 = newTempV128();
7652         assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
7653         assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7654         IRTemp max76543210 = newTempV128();
7655         assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
7656         IRTemp res = newTempV128();
7657         assign(res, unop(Iop_ZeroHI112ofV128, mkexpr(max76543210)));
7658         return res;
7659      }
7660      case Iop_Max32Fx4: case Iop_Min32Fx4:
7661      case Iop_Min32Sx4: case Iop_Min32Ux4:
7662      case Iop_Max32Sx4: case Iop_Max32Ux4: case Iop_Add32x4: {
7663         IRTemp x3210 = src;
7664         IRTemp x3232 = newTempV128();
7665         IRTemp x1010 = newTempV128();
7666         assign(x3232, mk_CatOddLanes64x2 (x3210, x3210));
7667         assign(x1010, mk_CatEvenLanes64x2(x3210, x3210));
7668         IRTemp x3333 = newTempV128();
7669         IRTemp x2222 = newTempV128();
7670         IRTemp x1111 = newTempV128();
7671         IRTemp x0000 = newTempV128();
7672         assign(x3333, mk_CatOddLanes32x4 (x3232, x3232));
7673         assign(x2222, mk_CatEvenLanes32x4(x3232, x3232));
7674         assign(x1111, mk_CatOddLanes32x4 (x1010, x1010));
7675         assign(x0000, mk_CatEvenLanes32x4(x1010, x1010));
7676         IRTemp max32 = newTempV128();
7677         IRTemp max10 = newTempV128();
7678         assign(max32, binop(op, mkexpr(x3333), mkexpr(x2222)));
7679         assign(max10, binop(op, mkexpr(x1111), mkexpr(x0000)));
7680         IRTemp max3210 = newTempV128();
7681         assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7682         IRTemp res = newTempV128();
7683         assign(res, unop(Iop_ZeroHI96ofV128, mkexpr(max3210)));
7684         return res;
7685      }
7686      case Iop_Add64x2: {
7687         IRTemp x10 = src;
7688         IRTemp x00 = newTempV128();
7689         IRTemp x11 = newTempV128();
7690         assign(x11, binop(Iop_InterleaveHI64x2, mkexpr(x10), mkexpr(x10)));
7691         assign(x00, binop(Iop_InterleaveLO64x2, mkexpr(x10), mkexpr(x10)));
7692         IRTemp max10 = newTempV128();
7693         assign(max10, binop(op, mkexpr(x11), mkexpr(x00)));
7694         IRTemp res = newTempV128();
7695         assign(res, unop(Iop_ZeroHI64ofV128, mkexpr(max10)));
7696         return res;
7697      }
7698      default:
7699         vassert(0);
7700   }
7701}
7702
7703
7704/* Generate IR for TBL and TBX.  This deals with the 128 bit case
7705   only. */
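/* For example, with len == 0 (a single table register), a source byte
   of 0x03 selects byte 3 of tab[0], whereas a source byte of 0x13 is
   out of range and yields zero in the running result; the final step
   then substitutes the corresponding lane of |oor_values| (zero for
   TBL, the old destination for TBX). */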
7706static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src,
7707                             IRTemp oor_values )
7708{
   vassert(len <= 3); /* |len| is unsigned, so no lower-bound check needed */
7710
7711   /* Generate some useful constants as concisely as possible. */
7712   IRTemp half15 = newTemp(Ity_I64);
7713   assign(half15, mkU64(0x0F0F0F0F0F0F0F0FULL));
7714   IRTemp half16 = newTemp(Ity_I64);
7715   assign(half16, mkU64(0x1010101010101010ULL));
7716
7717   /* A zero vector */
7718   IRTemp allZero = newTempV128();
7719   assign(allZero, mkV128(0x0000));
7720   /* A vector containing 15 in each 8-bit lane */
7721   IRTemp all15 = newTempV128();
7722   assign(all15, binop(Iop_64HLtoV128, mkexpr(half15), mkexpr(half15)));
7723   /* A vector containing 16 in each 8-bit lane */
7724   IRTemp all16 = newTempV128();
7725   assign(all16, binop(Iop_64HLtoV128, mkexpr(half16), mkexpr(half16)));
7726   /* A vector containing 32 in each 8-bit lane */
7727   IRTemp all32 = newTempV128();
7728   assign(all32, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all16)));
7729   /* A vector containing 48 in each 8-bit lane */
7730   IRTemp all48 = newTempV128();
7731   assign(all48, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all32)));
7732   /* A vector containing 64 in each 8-bit lane */
7733   IRTemp all64 = newTempV128();
7734   assign(all64, binop(Iop_Add8x16, mkexpr(all32), mkexpr(all32)));
7735
7736   /* Group the 16/32/48/64 vectors so as to be indexable. */
7737   IRTemp allXX[4] = { all16, all32, all48, all64 };
7738
7739   /* Compute the result for each table vector, with zeroes in places
7740      where the index values are out of range, and OR them into the
7741      running vector. */
7742   IRTemp running_result = newTempV128();
7743   assign(running_result, mkV128(0));
7744
7745   UInt tabent;
7746   for (tabent = 0; tabent <= len; tabent++) {
      vassert(tabent < 4);
7748      IRTemp bias = newTempV128();
7749      assign(bias,
7750             mkexpr(tabent == 0 ? allZero : allXX[tabent-1]));
7751      IRTemp biased_indices = newTempV128();
7752      assign(biased_indices,
7753             binop(Iop_Sub8x16, mkexpr(src), mkexpr(bias)));
7754      IRTemp valid_mask = newTempV128();
7755      assign(valid_mask,
7756             binop(Iop_CmpGT8Ux16, mkexpr(all16), mkexpr(biased_indices)));
7757      IRTemp safe_biased_indices = newTempV128();
7758      assign(safe_biased_indices,
7759             binop(Iop_AndV128, mkexpr(biased_indices), mkexpr(all15)));
7760      IRTemp results_or_junk = newTempV128();
7761      assign(results_or_junk,
7762             binop(Iop_Perm8x16, mkexpr(tab[tabent]),
7763                                 mkexpr(safe_biased_indices)));
7764      IRTemp results_or_zero = newTempV128();
7765      assign(results_or_zero,
7766             binop(Iop_AndV128, mkexpr(results_or_junk), mkexpr(valid_mask)));
7767      /* And OR that into the running result. */
7768      IRTemp tmp = newTempV128();
7769      assign(tmp, binop(Iop_OrV128, mkexpr(results_or_zero),
7770                        mkexpr(running_result)));
7771      running_result = tmp;
7772   }
7773
7774   /* So now running_result holds the overall result where the indices
7775      are in range, and zero in out-of-range lanes.  Now we need to
7776      compute an overall validity mask and use this to copy in the
7777      lanes in the oor_values for out of range indices.  This is
7778      unnecessary for TBL but will get folded out by iropt, so we lean
7779      on that and generate the same code for TBL and TBX here. */
7780   IRTemp overall_valid_mask = newTempV128();
7781   assign(overall_valid_mask,
7782          binop(Iop_CmpGT8Ux16, mkexpr(allXX[len]), mkexpr(src)));
7783   IRTemp result = newTempV128();
7784   assign(result,
7785          binop(Iop_OrV128,
7786                mkexpr(running_result),
7787                binop(Iop_AndV128,
7788                      mkexpr(oor_values),
7789                      unop(Iop_NotV128, mkexpr(overall_valid_mask)))));
7790   return result;
7791}
7792
7793
7794/* Let |argL| and |argR| be V128 values, and let |opI64x2toV128| be
7795   an op which takes two I64s and produces a V128.  That is, a widening
7796   operator.  Generate IR which applies |opI64x2toV128| to either the
7797   lower (if |is2| is False) or upper (if |is2| is True) halves of
7798   |argL| and |argR|, and return the value in a new IRTemp.
7799*/
7800static
7801IRTemp math_BINARY_WIDENING_V128 ( Bool is2, IROp opI64x2toV128,
7802                                   IRExpr* argL, IRExpr* argR )
7803{
7804   IRTemp res   = newTempV128();
7805   IROp   slice = is2 ? Iop_V128HIto64 : Iop_V128to64;
7806   assign(res, binop(opI64x2toV128, unop(slice, argL),
7807                                    unop(slice, argR)));
7808   return res;
7809}
7810
7811
7812/* Generate signed/unsigned absolute difference vector IR. */
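/* For example, for signed byte lanes with argL == 3 and argR == -6,
   |msk| is all ones (3 > -6), so the result lane is
   argL - argR == 9 == |3 - (-6)|. */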
7813static
7814IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE )
7815{
7816   vassert(size <= 3);
7817   IRTemp argL = newTempV128();
7818   IRTemp argR = newTempV128();
7819   IRTemp msk  = newTempV128();
7820   IRTemp res  = newTempV128();
7821   assign(argL, argLE);
7822   assign(argR, argRE);
7823   assign(msk, binop(isU ? mkVecCMPGTU(size) : mkVecCMPGTS(size),
7824                     mkexpr(argL), mkexpr(argR)));
7825   assign(res,
7826          binop(Iop_OrV128,
7827                binop(Iop_AndV128,
7828                      binop(mkVecSUB(size), mkexpr(argL), mkexpr(argR)),
7829                      mkexpr(msk)),
7830                binop(Iop_AndV128,
7831                      binop(mkVecSUB(size), mkexpr(argR), mkexpr(argL)),
7832                      unop(Iop_NotV128, mkexpr(msk)))));
7833   return res;
7834}
7835
7836
7837/* Generate IR that takes a V128 and sign- or zero-widens
7838   either the lower or upper set of lanes to twice-as-wide,
7839   resulting in a new V128 value. */
7840static
7841IRTemp math_WIDEN_LO_OR_HI_LANES ( Bool zWiden, Bool fromUpperHalf,
7842                                   UInt sizeNarrow, IRExpr* srcE )
7843{
7844   IRTemp src = newTempV128();
7845   IRTemp res = newTempV128();
7846   assign(src, srcE);
7847   switch (sizeNarrow) {
7848      case X10:
7849         assign(res,
7850                binop(zWiden ? Iop_ShrN64x2 : Iop_SarN64x2,
7851                      binop(fromUpperHalf ? Iop_InterleaveHI32x4
7852                                          : Iop_InterleaveLO32x4,
7853                            mkexpr(src),
7854                            mkexpr(src)),
7855                      mkU8(32)));
7856         break;
7857      case X01:
7858         assign(res,
7859                binop(zWiden ? Iop_ShrN32x4 : Iop_SarN32x4,
7860                      binop(fromUpperHalf ? Iop_InterleaveHI16x8
7861                                          : Iop_InterleaveLO16x8,
7862                            mkexpr(src),
7863                            mkexpr(src)),
7864                      mkU8(16)));
7865         break;
7866      case X00:
7867         assign(res,
7868                binop(zWiden ? Iop_ShrN16x8 : Iop_SarN16x8,
7869                      binop(fromUpperHalf ? Iop_InterleaveHI8x16
7870                                          : Iop_InterleaveLO8x16,
7871                            mkexpr(src),
7872                            mkexpr(src)),
7873                      mkU8(8)));
7874         break;
7875      default:
7876         vassert(0);
7877   }
7878   return res;
7879}
7880
7881
7882/* Generate IR that takes a V128 and sign- or zero-widens
7883   either the even or odd lanes to twice-as-wide,
7884   resulting in a new V128 value. */
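/* For example, sign-extending the even 16-bit lanes to 32 bits
   (zWiden == False, fromOdd == False, sizeNarrow == X01) shifts each
   32-bit lane left by 16, moving the even 16-bit lane to the top, and
   then shifts it arithmetically back down by 16, leaving the
   sign-extended value. */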
7885static
7886IRTemp math_WIDEN_EVEN_OR_ODD_LANES ( Bool zWiden, Bool fromOdd,
7887                                      UInt sizeNarrow, IRExpr* srcE )
7888{
7889   IRTemp src   = newTempV128();
7890   IRTemp res   = newTempV128();
7891   IROp   opSAR = mkVecSARN(sizeNarrow+1);
7892   IROp   opSHR = mkVecSHRN(sizeNarrow+1);
7893   IROp   opSHL = mkVecSHLN(sizeNarrow+1);
7894   IROp   opSxR = zWiden ? opSHR : opSAR;
7895   UInt   amt   = 0;
7896   switch (sizeNarrow) {
7897      case X10: amt = 32; break;
7898      case X01: amt = 16; break;
7899      case X00: amt = 8;  break;
7900      default: vassert(0);
7901   }
7902   assign(src, srcE);
7903   if (fromOdd) {
7904      assign(res, binop(opSxR, mkexpr(src), mkU8(amt)));
7905   } else {
7906      assign(res, binop(opSxR, binop(opSHL, mkexpr(src), mkU8(amt)),
7907                               mkU8(amt)));
7908   }
7909   return res;
7910}
7911
7912
7913/* Generate IR that takes two V128s and narrows (takes lower half)
7914   of each lane, producing a single V128 value. */
7915static
7916IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow )
7917{
7918   IRTemp res = newTempV128();
7919   assign(res, binop(mkVecCATEVENLANES(sizeNarrow),
7920                     mkexpr(argHi), mkexpr(argLo)));
7921   return res;
7922}
7923
7924
7925/* Return a temp which holds the vector dup of the lane of width
7926   (1 << size) obtained from src[laneNo]. */
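/* For example, size == 2 (S), laneNo == 3 gives ix == 12: the loop
   below first duplicates the upper 64 bits (Iop_InterleaveHI64x2) and
   then keeps the odd 32-bit lanes (Iop_CatOddLanes32x4), leaving
   lane 3 in all four positions. */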
7927static
7928IRTemp math_DUP_VEC_ELEM ( IRExpr* src, UInt size, UInt laneNo )
7929{
7930   vassert(size <= 3);
7931   /* Normalise |laneNo| so it is of the form
7932      x000 for D, xx00 for S, xxx0 for H, and xxxx for B.
7933      This puts the bits we want to inspect at constant offsets
7934      regardless of the value of |size|.
7935   */
7936   UInt ix = laneNo << size;
7937   vassert(ix <= 15);
7938   IROp ops[4] = { Iop_INVALID, Iop_INVALID, Iop_INVALID, Iop_INVALID };
7939   switch (size) {
7940      case 0: /* B */
7941         ops[0] = (ix & 1) ? Iop_CatOddLanes8x16 : Iop_CatEvenLanes8x16;
7942         /* fallthrough */
7943      case 1: /* H */
7944         ops[1] = (ix & 2) ? Iop_CatOddLanes16x8 : Iop_CatEvenLanes16x8;
7945         /* fallthrough */
7946      case 2: /* S */
7947         ops[2] = (ix & 4) ? Iop_CatOddLanes32x4 : Iop_CatEvenLanes32x4;
7948         /* fallthrough */
7949      case 3: /* D */
7950         ops[3] = (ix & 8) ? Iop_InterleaveHI64x2 : Iop_InterleaveLO64x2;
7951         break;
7952      default:
7953         vassert(0);
7954   }
7955   IRTemp res = newTempV128();
7956   assign(res, src);
7957   Int i;
7958   for (i = 3; i >= 0; i--) {
7959      if (ops[i] == Iop_INVALID)
7960         break;
7961      IRTemp tmp = newTempV128();
7962      assign(tmp, binop(ops[i], mkexpr(res), mkexpr(res)));
7963      res = tmp;
7964   }
7965   return res;
7966}
7967
7968
7969/* Let |srcV| be a V128 value, and let |imm5| be a lane-and-size
7970   selector encoded as shown below.  Return a new V128 holding the
7971   selected lane from |srcV| dup'd out to V128, and also return the
7972   lane number, log2 of the lane size in bytes, and width-character via
7973   *laneNo, *laneSzLg2 and *laneCh respectively.  It may be that imm5
7974   is an invalid selector, in which case return
7975   IRTemp_INVALID, 0, 0 and '?' respectively.
7976
7977   imm5 = xxxx1   signifies .b[xxxx]
7978        = xxx10   .h[xxx]
7979        = xx100   .s[xx]
7980        = x1000   .d[x]
7981        otherwise invalid
7982*/
7983static
7984IRTemp handle_DUP_VEC_ELEM ( /*OUT*/UInt* laneNo,
7985                             /*OUT*/UInt* laneSzLg2, /*OUT*/HChar* laneCh,
7986                             IRExpr* srcV, UInt imm5 )
7987{
7988   *laneNo    = 0;
7989   *laneSzLg2 = 0;
7990   *laneCh    = '?';
7991
7992   if (imm5 & 1) {
7993      *laneNo    = (imm5 >> 1) & 15;
7994      *laneSzLg2 = 0;
7995      *laneCh    = 'b';
7996   }
7997   else if (imm5 & 2) {
7998      *laneNo    = (imm5 >> 2) & 7;
7999      *laneSzLg2 = 1;
8000      *laneCh    = 'h';
8001   }
8002   else if (imm5 & 4) {
8003      *laneNo    = (imm5 >> 3) & 3;
8004      *laneSzLg2 = 2;
8005      *laneCh    = 's';
8006   }
8007   else if (imm5 & 8) {
8008      *laneNo    = (imm5 >> 4) & 1;
8009      *laneSzLg2 = 3;
8010      *laneCh    = 'd';
8011   }
8012   else {
8013      /* invalid */
8014      return IRTemp_INVALID;
8015   }
8016
8017   return math_DUP_VEC_ELEM(srcV, *laneSzLg2, *laneNo);
8018}
8019
8020
8021/* Clone |imm| to every lane of a V128, with lane size log2 of |size|. */
8022static
8023IRTemp math_VEC_DUP_IMM ( UInt size, ULong imm )
8024{
8025   IRType ty  = Ity_INVALID;
8026   IRTemp rcS = IRTemp_INVALID;
8027   switch (size) {
8028      case X01:
8029         vassert(imm <= 0xFFFFULL);
8030         ty  = Ity_I16;
8031         rcS = newTemp(ty); assign(rcS, mkU16( (UShort)imm ));
8032         break;
8033      case X10:
8034         vassert(imm <= 0xFFFFFFFFULL);
8035         ty  = Ity_I32;
8036         rcS = newTemp(ty); assign(rcS, mkU32( (UInt)imm ));
8037         break;
8038      case X11:
8039         ty  = Ity_I64;
8040         rcS = newTemp(ty); assign(rcS, mkU64(imm)); break;
8041      default:
8042         vassert(0);
8043   }
8044   IRTemp rcV = math_DUP_TO_V128(rcS, ty);
8045   return rcV;
8046}
8047
8048
8049/* Let |new64| be a V128 in which only the lower 64 bits are interesting,
8050   and the upper can contain any value -- it is ignored.  If |is2| is False,
8051   generate IR to put |new64| in the lower half of vector reg |dd| and zero
8052   the upper half.  If |is2| is True, generate IR to put |new64| in the upper
8053   half of vector reg |dd| and leave the lower half unchanged.  This
8054   simulates the behaviour of the "foo/foo2" instructions in which the
8055   destination is half the width of sources, for example addhn/addhn2.
8056*/
8057static
8058void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
8059{
8060   if (is2) {
      /* Keep the old lower half of Vdd, zero its upper half, and
         then OR |new64| into that upper half. */
8063      IRTemp t_zero_oldLO = newTempV128();
8064      assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
8065      IRTemp t_newHI_zero = newTempV128();
8066      assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
8067                                                       mkV128(0x0000)));
8068      IRTemp res = newTempV128();
8069      assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
8070                                    mkexpr(t_newHI_zero)));
8071      putQReg128(dd, mkexpr(res));
8072   } else {
8073      /* This is simple. */
8074      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
8075   }
8076}
8077
8078
8079/* Compute vector SQABS at lane size |size| for |srcE|, returning
8080   the q result in |*qabs| and the normal result in |*nabs|. */
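/* For example, at byte size an input lane of 0x80 (-128) produces
   nabs == 0x80 (the negation wraps) but qabs == 0x7F (the negation
   saturates); the difference between the two is what later sets
   QCFLAG. */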
8081static
8082void math_SQABS ( /*OUT*/IRTemp* qabs, /*OUT*/IRTemp* nabs,
8083                  IRExpr* srcE, UInt size )
8084{
   IRTemp src, mask, maskn, nsub, qsub;
   src = mask = maskn = nsub = qsub = IRTemp_INVALID;
   newTempsV128_7(&src, &mask, &maskn, &nsub, &qsub, nabs, qabs);
   assign(src,   srcE);
   assign(mask,  binop(mkVecCMPGTS(size),  mkV128(0x0000), mkexpr(src)));
   assign(maskn, unop(Iop_NotV128, mkexpr(mask)));
   assign(nsub,  binop(mkVecSUB(size),   mkV128(0x0000), mkexpr(src)));
   assign(qsub,  binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
   assign(*nabs, binop(Iop_OrV128,
                       binop(Iop_AndV128, mkexpr(nsub), mkexpr(mask)),
                       binop(Iop_AndV128, mkexpr(src),  mkexpr(maskn))));
   assign(*qabs, binop(Iop_OrV128,
                       binop(Iop_AndV128, mkexpr(qsub), mkexpr(mask)),
                       binop(Iop_AndV128, mkexpr(src),  mkexpr(maskn))));
8099}
8100
8101
8102/* Compute vector SQNEG at lane size |size| for |srcE|, returning
8103   the q result in |*qneg| and the normal result in |*nneg|. */
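/* Likewise, at byte size, negating an input lane of 0x80 (-128) gives
   nneg == 0x80 (wrapped) but qneg == 0x7F (saturated). */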
8104static
8105void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
8106                  IRExpr* srcE, UInt size )
8107{
   IRTemp src = IRTemp_INVALID;
   newTempsV128_3(&src, nneg, qneg);
   assign(src,   srcE);
   assign(*nneg, binop(mkVecSUB(size),   mkV128(0x0000), mkexpr(src)));
   assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
8113}
8114
8115
8116/* Zero all except the least significant lane of |srcE|, where |size|
8117   indicates the lane size in the usual way. */
8118static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE )
8119{
8120   vassert(size < 4);
8121   IRTemp t = newTempV128();
8122   assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
8123   return t;
8124}
8125
8126
8127/* Generate IR to compute vector widening MULL from either the lower
8128   (is2==False) or upper (is2==True) halves of vecN and vecM.  The
8129   widening multiplies are unsigned when isU==True and signed when
8130   isU==False.  |size| is the narrow lane size indication.  Optionally,
8131   the product may be added to or subtracted from vecD, at the wide lane
8132   size.  This happens when |mas| is 'a' (add) or 's' (sub).  When |mas|
8133   is 'm' (only multiply) then the accumulate part does not happen, and
8134   |vecD| is expected to == IRTemp_INVALID.
8135
   Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
   are allowed.  The result is returned in a new IRTemp via |*res|. */
8139static
8140void math_MULL_ACC ( /*OUT*/IRTemp* res,
8141                     Bool is2, Bool isU, UInt size, HChar mas,
8142                     IRTemp vecN, IRTemp vecM, IRTemp vecD )
8143{
8144   vassert(res && *res == IRTemp_INVALID);
8145   vassert(size <= 2);
8146   vassert(mas == 'm' || mas == 'a' || mas == 's');
8147   if (mas == 'm') vassert(vecD == IRTemp_INVALID);
8148   IROp   mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
8149   IROp   accOp = (mas == 'a') ? mkVecADD(size+1)
8150                  : (mas == 's' ? mkVecSUB(size+1)
8151                  : Iop_INVALID);
8152   IRTemp mul   = math_BINARY_WIDENING_V128(is2, mulOp,
8153                                            mkexpr(vecN), mkexpr(vecM));
8154   *res = newTempV128();
8155   assign(*res, mas == 'm' ? mkexpr(mul)
8156                           : binop(accOp, mkexpr(vecD), mkexpr(mul)));
8157}
8158
8159
8160/* Same as math_MULL_ACC, except the multiply is signed widening,
8161   the multiplied value is then doubled, before being added to or
8162   subtracted from the accumulated value.  And everything is
8163   saturated.  In all cases, saturation residuals are returned
8164   via (sat1q, sat1n), and in the accumulate cases,
8165   via (sat2q, sat2n) too.  All results are returned in new temporaries.
8166   In the no-accumulate case, *sat2q and *sat2n are never instantiated,
8167   so the caller can tell this has happened. */
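/* For example, at size X01 (h), input lanes of 0x8000 * 0x8000 widen
   to 0x40000000; the saturating doubling gives sat1q == 0x7FFFFFFF
   while sat1n wraps to 0x80000000, so comparing the pair reveals the
   saturation. */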
8168static
8169void math_SQDMULL_ACC ( /*OUT*/IRTemp* res,
8170                        /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
8171                        /*OUT*/IRTemp* sat2q, /*OUT*/IRTemp* sat2n,
8172                        Bool is2, UInt size, HChar mas,
8173                        IRTemp vecN, IRTemp vecM, IRTemp vecD )
8174{
8175   vassert(size <= 2);
8176   vassert(mas == 'm' || mas == 'a' || mas == 's');
8177   /* Compute
         sat1q = vecN.D[is2] *sq vecM.D[is2] *q 2
         sat1n = vecN.D[is2] *s  vecM.D[is2] *  2
8180      IOW take either the low or high halves of vecN and vecM, signed widen,
8181      multiply, double that, and signedly saturate.  Also compute the same
8182      but without saturation.
8183   */
8184   vassert(sat2q && *sat2q == IRTemp_INVALID);
8185   vassert(sat2n && *sat2n == IRTemp_INVALID);
8186   newTempsV128_3(sat1q, sat1n, res);
8187   IRTemp tq = math_BINARY_WIDENING_V128(is2, mkVecQDMULLS(size),
8188                                         mkexpr(vecN), mkexpr(vecM));
8189   IRTemp tn = math_BINARY_WIDENING_V128(is2, mkVecMULLS(size),
8190                                         mkexpr(vecN), mkexpr(vecM));
8191   assign(*sat1q, mkexpr(tq));
8192   assign(*sat1n, binop(mkVecADD(size+1), mkexpr(tn), mkexpr(tn)));
8193
8194   /* If there is no accumulation, the final result is sat1q,
8195      and there's no assignment to sat2q or sat2n. */
8196   if (mas == 'm') {
8197      assign(*res, mkexpr(*sat1q));
8198      return;
8199   }
8200
8201   /* Compute
8202         sat2q  = vecD +sq/-sq sat1q
8203         sat2n  = vecD +/-     sat1n
8204         result = sat2q
8205   */
8206   newTempsV128_2(sat2q, sat2n);
8207   assign(*sat2q, binop(mas == 'a' ? mkVecQADDS(size+1) : mkVecQSUBS(size+1),
8208                        mkexpr(vecD), mkexpr(*sat1q)));
8209   assign(*sat2n, binop(mas == 'a' ? mkVecADD(size+1) : mkVecSUB(size+1),
8210                        mkexpr(vecD), mkexpr(*sat1n)));
8211   assign(*res, mkexpr(*sat2q));
8212}
8213
8214
8215/* Generate IR for widening signed vector multiplies.  The operands
8216   have their lane width signedly widened, and they are then multiplied
8217   at the wider width, returning results in two new IRTemps. */
8218static
8219void math_MULLS ( /*OUT*/IRTemp* resHI, /*OUT*/IRTemp* resLO,
8220                  UInt sizeNarrow, IRTemp argL, IRTemp argR )
8221{
8222   vassert(sizeNarrow <= 2);
8223   newTempsV128_2(resHI, resLO);
8224   IRTemp argLhi = newTemp(Ity_I64);
8225   IRTemp argLlo = newTemp(Ity_I64);
8226   IRTemp argRhi = newTemp(Ity_I64);
8227   IRTemp argRlo = newTemp(Ity_I64);
8228   assign(argLhi, unop(Iop_V128HIto64, mkexpr(argL)));
8229   assign(argLlo, unop(Iop_V128to64,   mkexpr(argL)));
8230   assign(argRhi, unop(Iop_V128HIto64, mkexpr(argR)));
8231   assign(argRlo, unop(Iop_V128to64,   mkexpr(argR)));
8232   IROp opMulls = mkVecMULLS(sizeNarrow);
8233   assign(*resHI, binop(opMulls, mkexpr(argLhi), mkexpr(argRhi)));
8234   assign(*resLO, binop(opMulls, mkexpr(argLlo), mkexpr(argRlo)));
8235}
8236
8237
8238/* Generate IR for SQDMULH and SQRDMULH: signedly wideningly multiply,
8239   double that, possibly add a rounding constant (R variants), and take
8240   the high half. */
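/* Illustratively, for the 'h' (X01) variant each lane of |sat1q| is in
   effect sat((2*n*m + (isR ? 0x8000 : 0)) >> 16), and |sat1n| computes
   the same value without the final saturation. */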
8241static
8242void math_SQDMULH ( /*OUT*/IRTemp* res,
8243                    /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
8244                    Bool isR, UInt size, IRTemp vN, IRTemp vM )
8245{
8246   vassert(size == X01 || size == X10); /* s or h only */
8247
8248   newTempsV128_3(res, sat1q, sat1n);
8249
8250   IRTemp mullsHI = IRTemp_INVALID, mullsLO = IRTemp_INVALID;
8251   math_MULLS(&mullsHI, &mullsLO, size, vN, vM);
8252
   IROp addWide = mkVecADD(size+1);
8254
8255   if (isR) {
8256      assign(*sat1q, binop(mkVecQRDMULHIS(size), mkexpr(vN), mkexpr(vM)));
8257
8258      Int    rcShift    = size == X01 ? 15 : 31;
8259      IRTemp roundConst = math_VEC_DUP_IMM(size+1, 1ULL << rcShift);
8260      assign(*sat1n,
8261             binop(mkVecCATODDLANES(size),
8262                   binop(addWide,
8263                         binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
8264                         mkexpr(roundConst)),
8265                   binop(addWide,
8266                         binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO)),
8267                         mkexpr(roundConst))));
8268   } else {
8269      assign(*sat1q, binop(mkVecQDMULHIS(size), mkexpr(vN), mkexpr(vM)));
8270
8271      assign(*sat1n,
8272             binop(mkVecCATODDLANES(size),
8273                   binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
8274                   binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO))));
8275   }
8276
8277   assign(*res, mkexpr(*sat1q));
8278}
8279
8280
8281/* Generate IR for SQSHL, UQSHL, SQSHLU by imm.  Put the result in
8282   a new temp in *res, and the Q difference pair in new temps in
8283   *qDiff1 and *qDiff2 respectively.  |nm| denotes which of the
8284   three operations it is. */
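/* For example, "uqshl" at byte size with shift == 3 and a source lane
   of 0x35 shifts out the top bits 0b001: |qDiff1| == src >> 5 == 1 is
   nonzero, indicating that 0x35 << 3 == 0x1A8 did not fit in 8 bits
   and hence that the result saturated. */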
8285static
8286void math_QSHL_IMM ( /*OUT*/IRTemp* res,
8287                     /*OUT*/IRTemp* qDiff1, /*OUT*/IRTemp* qDiff2,
8288                     IRTemp src, UInt size, UInt shift, const HChar* nm )
8289{
8290   vassert(size <= 3);
8291   UInt laneBits = 8 << size;
8292   vassert(shift < laneBits);
8293   newTempsV128_3(res, qDiff1, qDiff2);
8294   IRTemp z128 = newTempV128();
8295   assign(z128, mkV128(0x0000));
8296
8297   /* UQSHL */
8298   if (vex_streq(nm, "uqshl")) {
8299      IROp qop = mkVecQSHLNSATUU(size);
8300      assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8301      if (shift == 0) {
8302         /* No shift means no saturation. */
8303         assign(*qDiff1, mkexpr(z128));
8304         assign(*qDiff2, mkexpr(z128));
8305      } else {
8306         /* Saturation has occurred if any of the shifted-out bits are
8307            nonzero.  We get the shifted-out bits by right-shifting the
8308            original value. */
8309         UInt rshift = laneBits - shift;
8310         vassert(rshift >= 1 && rshift < laneBits);
8311         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8312         assign(*qDiff2, mkexpr(z128));
8313      }
8314      return;
8315   }
8316
8317   /* SQSHL */
8318   if (vex_streq(nm, "sqshl")) {
8319      IROp qop = mkVecQSHLNSATSS(size);
8320      assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8321      if (shift == 0) {
8322         /* No shift means no saturation. */
8323         assign(*qDiff1, mkexpr(z128));
8324         assign(*qDiff2, mkexpr(z128));
8325      } else {
8326         /* Saturation has occurred if any of the shifted-out bits are
8327            different from the top bit of the original value. */
8328         UInt rshift = laneBits - 1 - shift;
         vassert(rshift < laneBits-1); /* |rshift| is unsigned */
8330         /* qDiff1 is the shifted out bits, and the top bit of the original
8331            value, preceded by zeroes. */
8332         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8333         /* qDiff2 is the top bit of the original value, cloned the
8334            correct number of times. */
8335         assign(*qDiff2, binop(mkVecSHRN(size),
8336                               binop(mkVecSARN(size), mkexpr(src),
8337                                                      mkU8(laneBits-1)),
8338                               mkU8(rshift)));
8339         /* This also succeeds in comparing the top bit of the original
8340            value to itself, which is a bit stupid, but not wrong. */
8341      }
8342      return;
8343   }
8344
8345   /* SQSHLU */
8346   if (vex_streq(nm, "sqshlu")) {
8347      IROp qop = mkVecQSHLNSATSU(size);
8348      assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8349      if (shift == 0) {
8350         /* If there's no shift, saturation depends on the top bit
8351            of the source. */
8352         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(laneBits-1)));
8353         assign(*qDiff2, mkexpr(z128));
8354      } else {
8355         /* Saturation has occurred if any of the shifted-out bits are
8356            nonzero.  We get the shifted-out bits by right-shifting the
8357            original value. */
8358         UInt rshift = laneBits - shift;
8359         vassert(rshift >= 1 && rshift < laneBits);
8360         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8361         assign(*qDiff2, mkexpr(z128));
8362      }
8363      return;
8364   }
8365
8366   vassert(0);
8367}
8368
8369
8370/* Generate IR to do SRHADD and URHADD. */
8371static
8372IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb )
8373{
8374   /* Generate this:
8375      (A >> 1) + (B >> 1) + (((A & 1) + (B & 1) + 1) >> 1)
8376   */
8377   vassert(size <= 3);
8378   IROp opSHR = isU ? mkVecSHRN(size) : mkVecSARN(size);
8379   IROp opADD = mkVecADD(size);
8380   /* The only tricky bit is to generate the correct vector 1 constant. */
8381   const ULong ones64[4]
8382      = { 0x0101010101010101ULL, 0x0001000100010001ULL,
8383          0x0000000100000001ULL, 0x0000000000000001ULL };
8384   IRTemp imm64 = newTemp(Ity_I64);
8385   assign(imm64, mkU64(ones64[size]));
8386   IRTemp vecOne = newTempV128();
8387   assign(vecOne, binop(Iop_64HLtoV128, mkexpr(imm64), mkexpr(imm64)));
8388   IRTemp scaOne = newTemp(Ity_I8);
8389   assign(scaOne, mkU8(1));
8390   IRTemp res = newTempV128();
8391   assign(res,
8392          binop(opADD,
8393                binop(opSHR, mkexpr(aa), mkexpr(scaOne)),
8394                binop(opADD,
8395                      binop(opSHR, mkexpr(bb), mkexpr(scaOne)),
8396                      binop(opSHR,
8397                            binop(opADD,
8398                                  binop(opADD,
8399                                        binop(Iop_AndV128, mkexpr(aa),
8400                                                           mkexpr(vecOne)),
8401                                        binop(Iop_AndV128, mkexpr(bb),
8402                                                           mkexpr(vecOne))
8403                                  ),
8404                                  mkexpr(vecOne)
8405                            ),
8406                            mkexpr(scaOne)
8407                      )
8408                )
8409          )
8410   );
8411   return res;
8412}
8413
8414
8415/* QCFLAG tracks the SIMD sticky saturation status.  Update the status
8416   thusly: if, after application of |opZHI| to both |qres| and |nres|,
8417   they have the same value, leave QCFLAG unchanged.  Otherwise, set it
8418   (implicitly) to 1.  |opZHI| may only be one of the Iop_ZeroHIxxofV128
8419   operators, or Iop_INVALID, in which case |qres| and |nres| are used
   unmodified.  The presence of |opZHI| means this function can be used
   to
8421   generate QCFLAG update code for both scalar and vector SIMD operations.
8422*/
8423static
8424void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
8425{
8426   IRTemp diff      = newTempV128();
8427   IRTemp oldQCFLAG = newTempV128();
8428   IRTemp newQCFLAG = newTempV128();
8429   if (opZHI == Iop_INVALID) {
8430      assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
8431   } else {
8432      vassert(opZHI == Iop_ZeroHI64ofV128
8433              || opZHI == Iop_ZeroHI96ofV128 || opZHI == Iop_ZeroHI112ofV128);
8434      assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
8435   }
8436   assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
8437   assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
8438   stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
8439}
8440
8441
8442/* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
8443   are used unmodified, hence suitable for QCFLAG updates for whole-vector
8444   operations. */
8445static
8446void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
8447{
8448   updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
8449}
8450
8451
8452/* Generate IR to rearrange two vector values in a way which is useful
8453   for doing S/D add-pair etc operations.  There are 3 cases:
8454
8455   2d:  [m1 m0] [n1 n0]  -->  [m1 n1] [m0 n0]
8456
8457   4s:  [m3 m2 m1 m0] [n3 n2 n1 n0]  -->  [m3 m1 n3 n1] [m2 m0 n2 n0]
8458
   2s:  [m3 m2 m1 m0] [n3 n2 n1 n0]  -->  [0 0 m1 n1] [0 0 m0 n0]
8460
8461   The cases are distinguished as follows:
8462   isD == True,  bitQ == 1  =>  2d
8463   isD == False, bitQ == 1  =>  4s
8464   isD == False, bitQ == 0  =>  2s
8465*/
8466static
8467void math_REARRANGE_FOR_FLOATING_PAIRWISE (
8468        /*OUT*/IRTemp* rearrL, /*OUT*/IRTemp* rearrR,
8469        IRTemp vecM, IRTemp vecN, Bool isD, UInt bitQ
8470     )
8471{
8472   vassert(rearrL && *rearrL == IRTemp_INVALID);
8473   vassert(rearrR && *rearrR == IRTemp_INVALID);
8474   *rearrL = newTempV128();
8475   *rearrR = newTempV128();
8476   if (isD) {
8477      // 2d case
8478      vassert(bitQ == 1);
8479      assign(*rearrL, binop(Iop_InterleaveHI64x2, mkexpr(vecM), mkexpr(vecN)));
8480      assign(*rearrR, binop(Iop_InterleaveLO64x2, mkexpr(vecM), mkexpr(vecN)));
8481   }
8482   else if (!isD && bitQ == 1) {
8483      // 4s case
8484      assign(*rearrL, binop(Iop_CatOddLanes32x4,  mkexpr(vecM), mkexpr(vecN)));
8485      assign(*rearrR, binop(Iop_CatEvenLanes32x4, mkexpr(vecM), mkexpr(vecN)));
8486   } else {
8487      // 2s case
8488      vassert(!isD && bitQ == 0);
8489      IRTemp m1n1m0n0 = newTempV128();
8490      IRTemp m0n0m1n1 = newTempV128();
8491      assign(m1n1m0n0, binop(Iop_InterleaveLO32x4,
8492                             mkexpr(vecM), mkexpr(vecN)));
8493      assign(m0n0m1n1, triop(Iop_SliceV128,
8494                             mkexpr(m1n1m0n0), mkexpr(m1n1m0n0), mkU8(8)));
8495      assign(*rearrL, unop(Iop_ZeroHI64ofV128, mkexpr(m1n1m0n0)));
8496      assign(*rearrR, unop(Iop_ZeroHI64ofV128, mkexpr(m0n0m1n1)));
8497   }
8498}
8499
8500
8501/* Returns 2.0 ^ (-n) for n in 1 .. 64 */
8502static Double two_to_the_minus ( Int n )
8503{
8504   if (n == 1) return 0.5;
8505   vassert(n >= 2 && n <= 64);
8506   Int half = n / 2;
8507   return two_to_the_minus(half) * two_to_the_minus(n - half);
8508}
8509
8510
8511/* Returns 2.0 ^ n for n in 1 .. 64 */
8512static Double two_to_the_plus ( Int n )
8513{
8514   if (n == 1) return 2.0;
8515   vassert(n >= 2 && n <= 64);
8516   Int half = n / 2;
8517   return two_to_the_plus(half) * two_to_the_plus(n - half);
8518}
8519
8520
8521/*------------------------------------------------------------*/
8522/*--- SIMD and FP instructions                             ---*/
8523/*------------------------------------------------------------*/
8524
8525static
8526Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
8527{
8528   /* 31  29     23  21 20 15 14   10 9 4
8529      0 q 101110 op2 0  m  0  imm4 0  n d
8530      Decode fields: op2
8531   */
8532#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
8533   if (INSN(31,31) != 0
8534       || INSN(29,24) != BITS6(1,0,1,1,1,0)
8535       || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(10,10) != 0) {
8536      return False;
8537   }
8538   UInt bitQ = INSN(30,30);
8539   UInt op2  = INSN(23,22);
8540   UInt mm   = INSN(20,16);
8541   UInt imm4 = INSN(14,11);
8542   UInt nn   = INSN(9,5);
8543   UInt dd   = INSN(4,0);
8544
8545   if (op2 == BITS2(0,0)) {
8546      /* -------- 00: EXT 16b_16b_16b, 8b_8b_8b -------- */
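      /* EXT extracts 16 bytes starting at byte |imm4| of the pair
         Vm:Vn.  AFAICS Iop_SliceV128(hi, lo, n) yields bytes
         n+15 .. n of the 256-bit concatenation hi:lo, which matches
         that with Vm as |hi| and Vn as |lo|. */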
8547      IRTemp sHi = newTempV128();
8548      IRTemp sLo = newTempV128();
8549      IRTemp res = newTempV128();
8550      assign(sHi, getQReg128(mm));
8551      assign(sLo, getQReg128(nn));
8552      if (bitQ == 1) {
8553         if (imm4 == 0) {
8554            assign(res, mkexpr(sLo));
8555         } else {
8556            vassert(imm4 >= 1 && imm4 <= 15);
8557            assign(res, triop(Iop_SliceV128,
8558                              mkexpr(sHi), mkexpr(sLo), mkU8(imm4)));
8559         }
8560         putQReg128(dd, mkexpr(res));
8561         DIP("ext v%u.16b, v%u.16b, v%u.16b, #%u\n", dd, nn, mm, imm4);
8562      } else {
8563         if (imm4 >= 8) return False;
8564         if (imm4 == 0) {
8565            assign(res, mkexpr(sLo));
8566         } else {
8567            vassert(imm4 >= 1 && imm4 <= 7);
8568            IRTemp hi64lo64 = newTempV128();
8569            assign(hi64lo64, binop(Iop_InterleaveLO64x2,
8570                                   mkexpr(sHi), mkexpr(sLo)));
8571            assign(res, triop(Iop_SliceV128,
8572                              mkexpr(hi64lo64), mkexpr(hi64lo64), mkU8(imm4)));
8573         }
8574         putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
8575         DIP("ext v%u.8b, v%u.8b, v%u.8b, #%u\n", dd, nn, mm, imm4);
8576      }
8577      return True;
8578   }
8579
8580   return False;
8581#  undef INSN
8582}
8583
8584
8585static
8586Bool dis_AdvSIMD_TBL_TBX(/*MB_OUT*/DisResult* dres, UInt insn)
8587{
8588   /* 31  29     23  21 20 15 14  12 11 9 4
8589      0 q 001110 op2 0  m  0  len op 00 n d
8590      Decode fields: op2,len,op
8591   */
8592#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
8593   if (INSN(31,31) != 0
8594       || INSN(29,24) != BITS6(0,0,1,1,1,0)
8595       || INSN(21,21) != 0
8596       || INSN(15,15) != 0
8597       || INSN(11,10) != BITS2(0,0)) {
8598      return False;
8599   }
8600   UInt bitQ  = INSN(30,30);
8601   UInt op2   = INSN(23,22);
8602   UInt mm    = INSN(20,16);
8603   UInt len   = INSN(14,13);
8604   UInt bitOP = INSN(12,12);
8605   UInt nn    = INSN(9,5);
8606   UInt dd    = INSN(4,0);
8607
8608   if (op2 == X00) {
8609      /* -------- 00,xx,0 TBL, xx register table -------- */
8610      /* -------- 00,xx,1 TBX, xx register table -------- */
8611      /* 31  28        20 15 14  12  9 4
8612         0q0 01110 000 m  0  len 000 n d  TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
8613         0q0 01110 000 m  0  len 100 n d  TBX Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
8614         where Ta = 16b(q=1) or 8b(q=0)
8615      */
8616      Bool isTBX = bitOP == 1;
8617      /* The out-of-range values to use. */
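      /* A TBL result byte whose index is out of range is zero,
         whereas TBX leaves the corresponding destination byte
         unchanged; hence the choice of fill values here. */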
8618      IRTemp oor_values = newTempV128();
8619      assign(oor_values, isTBX ? getQReg128(dd) : mkV128(0));
8620      /* src value */
8621      IRTemp src = newTempV128();
8622      assign(src, getQReg128(mm));
8623      /* The table values */
8624      IRTemp tab[4];
8625      UInt   i;
8626      for (i = 0; i <= len; i++) {
8627         vassert(i < 4);
8628         tab[i] = newTempV128();
8629         assign(tab[i], getQReg128((nn + i) % 32));
8630      }
8631      IRTemp res = math_TBL_TBX(tab, len, src, oor_values);
8632      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* Ta = bitQ == 1 ? "16b" : "8b";
8634      const HChar* nm = isTBX ? "tbx" : "tbl";
8635      DIP("%s %s.%s, {v%u.16b .. v%u.16b}, %s.%s\n",
8636          nm, nameQReg128(dd), Ta, nn, (nn + len) % 32, nameQReg128(mm), Ta);
8637      return True;
8638   }
8639
8641   return False;
8642#  undef INSN
8643}
8644
8645
8646static
8647Bool dis_AdvSIMD_ZIP_UZP_TRN(/*MB_OUT*/DisResult* dres, UInt insn)
8648{
8649   /* 31  29     23   21 20 15 14     11 9 4
8650      0 q 001110 size 0  m  0  opcode 10 n d
8651      Decode fields: opcode
8652   */
8653#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
8654   if (INSN(31,31) != 0
8655       || INSN(29,24) != BITS6(0,0,1,1,1,0)
8656       || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(11,10) != BITS2(1,0)) {
8657      return False;
8658   }
8659   UInt bitQ   = INSN(30,30);
8660   UInt size   = INSN(23,22);
8661   UInt mm     = INSN(20,16);
8662   UInt opcode = INSN(14,12);
8663   UInt nn     = INSN(9,5);
8664   UInt dd     = INSN(4,0);
8665
8666   if (opcode == BITS3(0,0,1) || opcode == BITS3(1,0,1)) {
8667      /* -------- 001 UZP1 std7_std7_std7 -------- */
8668      /* -------- 101 UZP2 std7_std7_std7 -------- */
8669      if (bitQ == 0 && size == X11) return False; // implied 1d case
8670      Bool   isUZP1 = opcode == BITS3(0,0,1);
8671      IROp   op     = isUZP1 ? mkVecCATEVENLANES(size)
8672                             : mkVecCATODDLANES(size);
8673      IRTemp preL = newTempV128();
8674      IRTemp preR = newTempV128();
8675      IRTemp res  = newTempV128();
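      /* In the 64-bit (Q=0) case, pack the two significant halves
         m(lo64):n(lo64) into a single 128-bit value and use it as
         both operands, so the even/odd-lane concatenation delivers
         the required result in the lower 64 bits. */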
8676      if (bitQ == 0) {
8677         assign(preL, binop(Iop_InterleaveLO64x2, getQReg128(mm),
8678                                                  getQReg128(nn)));
8679         assign(preR, mkexpr(preL));
8680      } else {
8681         assign(preL, getQReg128(mm));
8682         assign(preR, getQReg128(nn));
8683      }
8684      assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
8685      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8686      const HChar* nm  = isUZP1 ? "uzp1" : "uzp2";
8687      const HChar* arr = nameArr_Q_SZ(bitQ, size);
8688      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
8689          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
8690      return True;
8691   }
8692
8693   if (opcode == BITS3(0,1,0) || opcode == BITS3(1,1,0)) {
8694      /* -------- 010 TRN1 std7_std7_std7 -------- */
8695      /* -------- 110 TRN2 std7_std7_std7 -------- */
8696      if (bitQ == 0 && size == X11) return False; // implied 1d case
8697      Bool   isTRN1 = opcode == BITS3(0,1,0);
8698      IROp   op1    = isTRN1 ? mkVecCATEVENLANES(size)
8699                             : mkVecCATODDLANES(size);
8700      IROp op2 = mkVecINTERLEAVEHI(size);
8701      IRTemp srcM = newTempV128();
8702      IRTemp srcN = newTempV128();
8703      IRTemp res  = newTempV128();
8704      assign(srcM, getQReg128(mm));
8705      assign(srcN, getQReg128(nn));
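      /* Concatenate each source's even (TRN1) or odd (TRN2) lanes
         with themselves, so that those lanes appear in both halves,
         then interleave the two results to obtain the transposed
         arrangement. */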
8706      assign(res, binop(op2, binop(op1, mkexpr(srcM), mkexpr(srcM)),
8707                             binop(op1, mkexpr(srcN), mkexpr(srcN))));
8708      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8709      const HChar* nm  = isTRN1 ? "trn1" : "trn2";
8710      const HChar* arr = nameArr_Q_SZ(bitQ, size);
8711      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
8712          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
8713      return True;
8714   }
8715
8716   if (opcode == BITS3(0,1,1) || opcode == BITS3(1,1,1)) {
8717      /* -------- 011 ZIP1 std7_std7_std7 -------- */
8718      /* -------- 111 ZIP2 std7_std7_std7 -------- */
8719      if (bitQ == 0 && size == X11) return False; // implied 1d case
8720      Bool   isZIP1 = opcode == BITS3(0,1,1);
8721      IROp   op     = isZIP1 ? mkVecINTERLEAVELO(size)
8722                             : mkVecINTERLEAVEHI(size);
8723      IRTemp preL = newTempV128();
8724      IRTemp preR = newTempV128();
8725      IRTemp res  = newTempV128();
8726      if (bitQ == 0 && !isZIP1) {
8727         IRTemp z128 = newTempV128();
8728         assign(z128, mkV128(0x0000));
         // preL = Vm shifted left 32 bits
         // preR = Vn shifted left 32 bits
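         // Shifting both operands up by 32 bits moves the upper (odd)
         // lanes of the original 64-bit values into the top halves of
         // the temporaries, which is where the interleave-HI op takes
         // its lanes from.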
8731         assign(preL, triop(Iop_SliceV128,
8732                            getQReg128(mm), mkexpr(z128), mkU8(12)));
8733         assign(preR, triop(Iop_SliceV128,
8734                            getQReg128(nn), mkexpr(z128), mkU8(12)));
8735
8736      } else {
8737         assign(preL, getQReg128(mm));
8738         assign(preR, getQReg128(nn));
8739      }
8740      assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
8741      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8742      const HChar* nm  = isZIP1 ? "zip1" : "zip2";
8743      const HChar* arr = nameArr_Q_SZ(bitQ, size);
8744      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
8745          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
8746      return True;
8747   }
8748
8749   return False;
8750#  undef INSN
8751}
8752
8753
8754static
8755Bool dis_AdvSIMD_across_lanes(/*MB_OUT*/DisResult* dres, UInt insn)
8756{
8757   /* 31    28    23   21    16     11 9 4
8758      0 q u 01110 size 11000 opcode 10 n d
8759      Decode fields: u,size,opcode
8760   */
8761#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
8762   if (INSN(31,31) != 0
8763       || INSN(28,24) != BITS5(0,1,1,1,0)
8764       || INSN(21,17) != BITS5(1,1,0,0,0) || INSN(11,10) != BITS2(1,0)) {
8765      return False;
8766   }
8767   UInt bitQ   = INSN(30,30);
8768   UInt bitU   = INSN(29,29);
8769   UInt size   = INSN(23,22);
8770   UInt opcode = INSN(16,12);
8771   UInt nn     = INSN(9,5);
8772   UInt dd     = INSN(4,0);
8773
8774   if (opcode == BITS5(0,0,0,1,1)) {
8775      /* -------- 0,xx,00011 SADDLV -------- */
8776      /* -------- 1,xx,00011 UADDLV -------- */
8777      /* size is the narrow size */
8778      if (size == X11 || (size == X10 && bitQ == 0)) return False;
8779      Bool   isU = bitU == 1;
8780      IRTemp src = newTempV128();
8781      assign(src, getQReg128(nn));
8782      /* The basic plan is to widen the lower half, and if Q = 1,
8783         the upper half too.  Add them together (if Q = 1), and in
8784         either case fold with add at twice the lane width.
8785      */
8786      IRExpr* widened
8787         = mkexpr(math_WIDEN_LO_OR_HI_LANES(
8788                     isU, False/*!fromUpperHalf*/, size, mkexpr(src)));
8789      if (bitQ == 1) {
8790         widened
8791            = binop(mkVecADD(size+1),
8792                    widened,
8793                    mkexpr(math_WIDEN_LO_OR_HI_LANES(
8794                              isU, True/*fromUpperHalf*/, size, mkexpr(src)))
8795              );
8796      }
8797      /* Now fold. */
8798      IRTemp tWi = newTempV128();
8799      assign(tWi, widened);
8800      IRTemp res = math_FOLDV(tWi, mkVecADD(size+1));
8801      putQReg128(dd, mkexpr(res));
8802      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar  ch  = "bhsd"[size+1]; // the widened lane size
8804      DIP("%s %s.%c, %s.%s\n", isU ? "uaddlv" : "saddlv",
8805          nameQReg128(dd), ch, nameQReg128(nn), arr);
8806      return True;
8807   }
8808
8809   UInt ix = 0;
8810   /**/ if (opcode == BITS5(0,1,0,1,0)) { ix = bitU == 0 ? 1 : 2; }
8811   else if (opcode == BITS5(1,1,0,1,0)) { ix = bitU == 0 ? 3 : 4; }
8812   else if (opcode == BITS5(1,1,0,1,1) && bitU == 0) { ix = 5; }
8813   /**/
8814   if (ix != 0) {
8815      /* -------- 0,xx,01010: SMAXV -------- (1) */
8816      /* -------- 1,xx,01010: UMAXV -------- (2) */
8817      /* -------- 0,xx,11010: SMINV -------- (3) */
8818      /* -------- 1,xx,11010: UMINV -------- (4) */
8819      /* -------- 0,xx,11011: ADDV  -------- (5) */
8820      vassert(ix >= 1 && ix <= 5);
8821      if (size == X11) return False; // 1d,2d cases not allowed
8822      if (size == X10 && bitQ == 0) return False; // 2s case not allowed
8823      const IROp opMAXS[3]
8824         = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4 };
8825      const IROp opMAXU[3]
8826         = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4 };
8827      const IROp opMINS[3]
8828         = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4 };
8829      const IROp opMINU[3]
8830         = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4 };
8831      const IROp opADD[3]
8832         = { Iop_Add8x16,  Iop_Add16x8,  Iop_Add32x4 };
8833      vassert(size < 3);
8834      IROp op = Iop_INVALID;
8835      const HChar* nm = NULL;
8836      switch (ix) {
8837         case 1: op = opMAXS[size]; nm = "smaxv"; break;
8838         case 2: op = opMAXU[size]; nm = "umaxv"; break;
8839         case 3: op = opMINS[size]; nm = "sminv"; break;
8840         case 4: op = opMINU[size]; nm = "uminv"; break;
8841         case 5: op = opADD[size];  nm = "addv";  break;
8842         default: vassert(0);
8843      }
8844      vassert(op != Iop_INVALID && nm != NULL);
8845      IRTemp tN1 = newTempV128();
8846      assign(tN1, getQReg128(nn));
8847      /* If Q == 0, we're just folding lanes in the lower half of
8848         the value.  In which case, copy the lower half of the
8849         source into the upper half, so we can then treat it the
8850         same as the full width case.  Except for the addition case,
8851         in which we have to zero out the upper half. */
8852      IRTemp tN2 = newTempV128();
8853      assign(tN2, bitQ == 0
8854                     ? (ix == 5 ? unop(Iop_ZeroHI64ofV128, mkexpr(tN1))
8855                                : mk_CatEvenLanes64x2(tN1,tN1))
8856                     : mkexpr(tN1));
8857      IRTemp res = math_FOLDV(tN2, op);
8858      if (res == IRTemp_INVALID)
8859         return False; /* means math_FOLDV
8860                          doesn't handle this case yet */
8861      putQReg128(dd, mkexpr(res));
8862      const IRType tys[3] = { Ity_I8, Ity_I16, Ity_I32 };
8863      IRType laneTy = tys[size];
8864      const HChar* arr = nameArr_Q_SZ(bitQ, size);
8865      DIP("%s %s, %s.%s\n", nm,
8866          nameQRegLO(dd, laneTy), nameQReg128(nn), arr);
8867      return True;
8868   }
8869
8870   if ((size == X00 || size == X10)
8871       && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
      /* -------- 0,00,01100: FMAXNMV s_4s -------- */
      /* -------- 0,10,01100: FMINNMV s_4s -------- */
8874      /* -------- 1,00,01111: FMAXV   s_4s -------- */
8875      /* -------- 1,10,01111: FMINV   s_4s -------- */
8876      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
8877      if (bitQ == 0) return False; // Only 4s is allowed
8878      Bool   isMIN = (size & 2) == 2;
8879      Bool   isNM  = opcode == BITS5(0,1,1,0,0);
8880      IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(2);
8881      IRTemp src = newTempV128();
8882      assign(src, getQReg128(nn));
8883      IRTemp res = math_FOLDV(src, opMXX);
8884      putQReg128(dd, mkexpr(res));
8885      DIP("%s%sv s%u, %u.4s\n",
8886          isMIN ? "fmin" : "fmax", isNM ? "nm" : "", dd, nn);
8887      return True;
8888   }
8889
8891   return False;
8892#  undef INSN
8893}
8894
8895
8896static
8897Bool dis_AdvSIMD_copy(/*MB_OUT*/DisResult* dres, UInt insn)
8898{
8899   /* 31     28       20   15 14   10 9 4
8900      0 q op 01110000 imm5 0  imm4 1  n d
8901      Decode fields: q,op,imm4
8902   */
8903#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
8904   if (INSN(31,31) != 0
8905       || INSN(28,21) != BITS8(0,1,1,1,0,0,0,0)
8906       || INSN(15,15) != 0 || INSN(10,10) != 1) {
8907      return False;
8908   }
8909   UInt bitQ  = INSN(30,30);
8910   UInt bitOP = INSN(29,29);
8911   UInt imm5  = INSN(20,16);
8912   UInt imm4  = INSN(14,11);
8913   UInt nn    = INSN(9,5);
8914   UInt dd    = INSN(4,0);
8915
8916   /* -------- x,0,0000: DUP (element, vector) -------- */
8917   /* 31  28       20   15     9 4
8918      0q0 01110000 imm5 000001 n d  DUP Vd.T, Vn.Ts[index]
8919   */
8920   if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
8921      UInt   laneNo    = 0;
8922      UInt   laneSzLg2 = 0;
8923      HChar  laneCh    = '?';
8924      IRTemp res       = handle_DUP_VEC_ELEM(&laneNo, &laneSzLg2, &laneCh,
8925                                             getQReg128(nn), imm5);
8926      if (res == IRTemp_INVALID)
8927         return False;
8928      if (bitQ == 0 && laneSzLg2 == X11)
8929         return False; /* .1d case */
8930      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8931      const HChar* arT = nameArr_Q_SZ(bitQ, laneSzLg2);
8932      DIP("dup %s.%s, %s.%c[%u]\n",
8933           nameQReg128(dd), arT, nameQReg128(nn), laneCh, laneNo);
8934      return True;
8935   }
8936
8937   /* -------- x,0,0001: DUP (general, vector) -------- */
8938   /* 31  28       20   15       9 4
8939      0q0 01110000 imm5 0 0001 1 n d  DUP Vd.T, Rn
8940      Q=0 writes 64, Q=1 writes 128
      imm5: xxxx1  8B(q=0)      or 16B(q=1),     R=W
8942            xxx10  4H(q=0)      or 8H(q=1),      R=W
8943            xx100  2S(q=0)      or 4S(q=1),      R=W
8944            x1000  Invalid(q=0) or 2D(q=1),      R=X
8945            x0000  Invalid(q=0) or Invalid(q=1)
8946      Require op=0, imm4=0001
8947   */
8948   if (bitOP == 0 && imm4 == BITS4(0,0,0,1)) {
8949      Bool   isQ = bitQ == 1;
8950      IRTemp w0  = newTemp(Ity_I64);
8951      const HChar* arT = "??";
8952      IRType laneTy = Ity_INVALID;
8953      if (imm5 & 1) {
8954         arT    = isQ ? "16b" : "8b";
8955         laneTy = Ity_I8;
8956         assign(w0, unop(Iop_8Uto64, unop(Iop_64to8, getIReg64orZR(nn))));
8957      }
8958      else if (imm5 & 2) {
8959         arT    = isQ ? "8h" : "4h";
8960         laneTy = Ity_I16;
8961         assign(w0, unop(Iop_16Uto64, unop(Iop_64to16, getIReg64orZR(nn))));
8962      }
8963      else if (imm5 & 4) {
8964         arT    = isQ ? "4s" : "2s";
8965         laneTy = Ity_I32;
8966         assign(w0, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
8967      }
8968      else if ((imm5 & 8) && isQ) {
8969         arT    = "2d";
8970         laneTy = Ity_I64;
8971         assign(w0, getIReg64orZR(nn));
8972      }
8973      else {
8974         /* invalid; leave laneTy unchanged. */
8975      }
8976      /* */
8977      if (laneTy != Ity_INVALID) {
8978         IRTemp w1 = math_DUP_TO_64(w0, laneTy);
8979         putQReg128(dd, binop(Iop_64HLtoV128,
8980                              isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1)));
8981         DIP("dup %s.%s, %s\n",
8982             nameQReg128(dd), arT, nameIRegOrZR(laneTy == Ity_I64, nn));
8983         return True;
8984      }
8985      /* invalid */
8986      return False;
8987   }
8988
8989   /* -------- 1,0,0011: INS (general) -------- */
8990   /* 31  28       20   15     9 4
8991      010 01110000 imm5 000111 n d  INS Vd.Ts[ix], Rn
8992      where Ts,ix = case imm5 of xxxx1 -> B, xxxx
8993                                 xxx10 -> H, xxx
8994                                 xx100 -> S, xx
8995                                 x1000 -> D, x
8996   */
8997   if (bitQ == 1 && bitOP == 0 && imm4 == BITS4(0,0,1,1)) {
8998      HChar   ts     = '?';
8999      UInt    laneNo = 16;
9000      IRExpr* src    = NULL;
9001      if (imm5 & 1) {
9002         src    = unop(Iop_64to8, getIReg64orZR(nn));
9003         laneNo = (imm5 >> 1) & 15;
9004         ts     = 'b';
9005      }
9006      else if (imm5 & 2) {
9007         src    = unop(Iop_64to16, getIReg64orZR(nn));
9008         laneNo = (imm5 >> 2) & 7;
9009         ts     = 'h';
9010      }
9011      else if (imm5 & 4) {
9012         src    = unop(Iop_64to32, getIReg64orZR(nn));
9013         laneNo = (imm5 >> 3) & 3;
9014         ts     = 's';
9015      }
9016      else if (imm5 & 8) {
9017         src    = getIReg64orZR(nn);
9018         laneNo = (imm5 >> 4) & 1;
9019         ts     = 'd';
9020      }
9021      /* */
9022      if (src) {
9023         vassert(laneNo < 16);
9024         putQRegLane(dd, laneNo, src);
9025         DIP("ins %s.%c[%u], %s\n",
9026             nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn));
9027         return True;
9028      }
9029      /* invalid */
9030      return False;
9031   }
9032
9033   /* -------- x,0,0101: SMOV -------- */
9034   /* -------- x,0,0111: UMOV -------- */
9035   /* 31  28        20   15     9 4
9036      0q0 01110 000 imm5 001111 n d  UMOV Xd/Wd, Vn.Ts[index]
9037      0q0 01110 000 imm5 001011 n d  SMOV Xd/Wd, Vn.Ts[index]
9038      dest is Xd when q==1, Wd when q==0
9039      UMOV:
9040         Ts,index,ops = case q:imm5 of
9041                          0:xxxx1 -> B, xxxx, 8Uto64
9042                          1:xxxx1 -> invalid
9043                          0:xxx10 -> H, xxx,  16Uto64
9044                          1:xxx10 -> invalid
9045                          0:xx100 -> S, xx,   32Uto64
9046                          1:xx100 -> invalid
9047                          1:x1000 -> D, x,    copy64
9048                          other   -> invalid
9049      SMOV:
9050         Ts,index,ops = case q:imm5 of
9051                          0:xxxx1 -> B, xxxx, (32Uto64 . 8Sto32)
9052                          1:xxxx1 -> B, xxxx, 8Sto64
9053                          0:xxx10 -> H, xxx,  (32Uto64 . 16Sto32)
9054                          1:xxx10 -> H, xxx,  16Sto64
9055                          0:xx100 -> invalid
9056                          1:xx100 -> S, xx,   32Sto64
9057                          1:x1000 -> invalid
9058                          other   -> invalid
9059   */
9060   if (bitOP == 0 && (imm4 == BITS4(0,1,0,1) || imm4 == BITS4(0,1,1,1))) {
9061      Bool isU  = (imm4 & 2) == 2;
9062      const HChar* arTs = "??";
9063      UInt    laneNo = 16; /* invalid */
9064      // Setting 'res' to non-NULL determines valid/invalid
9065      IRExpr* res    = NULL;
9066      if (!bitQ && (imm5 & 1)) { // 0:xxxx1
9067         laneNo = (imm5 >> 1) & 15;
9068         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
9069         res = isU ? unop(Iop_8Uto64, lane)
9070                   : unop(Iop_32Uto64, unop(Iop_8Sto32, lane));
9071         arTs = "b";
9072      }
9073      else if (bitQ && (imm5 & 1)) { // 1:xxxx1
9074         laneNo = (imm5 >> 1) & 15;
9075         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
9076         res = isU ? NULL
9077                   : unop(Iop_8Sto64, lane);
9078         arTs = "b";
9079      }
9080      else if (!bitQ && (imm5 & 2)) { // 0:xxx10
9081         laneNo = (imm5 >> 2) & 7;
9082         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
9083         res = isU ? unop(Iop_16Uto64, lane)
9084                   : unop(Iop_32Uto64, unop(Iop_16Sto32, lane));
9085         arTs = "h";
9086      }
9087      else if (bitQ && (imm5 & 2)) { // 1:xxx10
9088         laneNo = (imm5 >> 2) & 7;
9089         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
9090         res = isU ? NULL
9091                   : unop(Iop_16Sto64, lane);
9092         arTs = "h";
9093      }
9094      else if (!bitQ && (imm5 & 4)) { // 0:xx100
9095         laneNo = (imm5 >> 3) & 3;
9096         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
9097         res = isU ? unop(Iop_32Uto64, lane)
9098                   : NULL;
9099         arTs = "s";
9100      }
      else if (bitQ && (imm5 & 4)) { // 1:xx100
9102         laneNo = (imm5 >> 3) & 3;
9103         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
9104         res = isU ? NULL
9105                   : unop(Iop_32Sto64, lane);
9106         arTs = "s";
9107      }
9108      else if (bitQ && (imm5 & 8)) { // 1:x1000
9109         laneNo = (imm5 >> 4) & 1;
9110         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I64);
9111         res = isU ? lane
9112                   : NULL;
9113         arTs = "d";
9114      }
9115      /* */
9116      if (res) {
9117         vassert(laneNo < 16);
9118         putIReg64orZR(dd, res);
9119         DIP("%cmov %s, %s.%s[%u]\n", isU ? 'u' : 's',
9120             nameIRegOrZR(bitQ == 1, dd),
9121             nameQReg128(nn), arTs, laneNo);
9122         return True;
9123      }
9124      /* invalid */
9125      return False;
9126   }
9127
9128   /* -------- 1,1,xxxx: INS (element) -------- */
9129   /* 31  28       20     14   9 4
9130      011 01110000 imm5 0 imm4 n d  INS Vd.Ts[ix1], Vn.Ts[ix2]
9131      where Ts,ix1,ix2
9132               = case imm5 of xxxx1 -> B, xxxx, imm4[3:0]
9133                              xxx10 -> H, xxx,  imm4[3:1]
9134                              xx100 -> S, xx,   imm4[3:2]
9135                              x1000 -> D, x,    imm4[3:3]
9136   */
9137   if (bitQ == 1 && bitOP == 1) {
9138      HChar   ts  = '?';
9139      IRType  ity = Ity_INVALID;
9140      UInt    ix1 = 16;
9141      UInt    ix2 = 16;
9142      if (imm5 & 1) {
9143         ts  = 'b';
9144         ity = Ity_I8;
9145         ix1 = (imm5 >> 1) & 15;
9146         ix2 = (imm4 >> 0) & 15;
9147      }
9148      else if (imm5 & 2) {
9149         ts  = 'h';
9150         ity = Ity_I16;
9151         ix1 = (imm5 >> 2) & 7;
9152         ix2 = (imm4 >> 1) & 7;
9153      }
9154      else if (imm5 & 4) {
9155         ts  = 's';
9156         ity = Ity_I32;
9157         ix1 = (imm5 >> 3) & 3;
9158         ix2 = (imm4 >> 2) & 3;
9159      }
9160      else if (imm5 & 8) {
9161         ts  = 'd';
9162         ity = Ity_I64;
9163         ix1 = (imm5 >> 4) & 1;
9164         ix2 = (imm4 >> 3) & 1;
9165      }
9166      /* */
9167      if (ity != Ity_INVALID) {
9168         vassert(ix1 < 16);
9169         vassert(ix2 < 16);
9170         putQRegLane(dd, ix1, getQRegLane(nn, ix2, ity));
9171         DIP("ins %s.%c[%u], %s.%c[%u]\n",
9172             nameQReg128(dd), ts, ix1, nameQReg128(nn), ts, ix2);
9173         return True;
9174      }
9175      /* invalid */
9176      return False;
9177   }
9178
9179   return False;
9180#  undef INSN
9181}
9182
9183
9184static
9185Bool dis_AdvSIMD_modified_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
9186{
9187   /* 31    28          18  15    11 9     4
9188      0q op 01111 00000 abc cmode 01 defgh d
9189      Decode fields: q,op,cmode
9190      Bit 11 is really "o2", but it is always zero.
9191   */
9192#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
9193   if (INSN(31,31) != 0
9194       || INSN(28,19) != BITS10(0,1,1,1,1,0,0,0,0,0)
9195       || INSN(11,10) != BITS2(0,1)) {
9196      return False;
9197   }
9198   UInt bitQ     = INSN(30,30);
9199   UInt bitOP    = INSN(29,29);
9200   UInt cmode    = INSN(15,12);
9201   UInt abcdefgh = (INSN(18,16) << 5) | INSN(9,5);
9202   UInt dd       = INSN(4,0);
9203
9204   ULong imm64lo  = 0;
9205   UInt  op_cmode = (bitOP << 4) | cmode;
9206   Bool  ok       = False;
9207   Bool  isORR    = False;
9208   Bool  isBIC    = False;
9209   Bool  isMOV    = False;
9210   Bool  isMVN    = False;
9211   Bool  isFMOV   = False;
9212   switch (op_cmode) {
9213      /* -------- x,0,0000 MOVI 32-bit shifted imm -------- */
9214      /* -------- x,0,0010 MOVI 32-bit shifted imm -------- */
9215      /* -------- x,0,0100 MOVI 32-bit shifted imm -------- */
9216      /* -------- x,0,0110 MOVI 32-bit shifted imm -------- */
9217      case BITS5(0,0,0,0,0): case BITS5(0,0,0,1,0):
9218      case BITS5(0,0,1,0,0): case BITS5(0,0,1,1,0): // 0:0xx0
9219         ok = True; isMOV = True; break;
9220
9221      /* -------- x,0,0001 ORR (vector, immediate) 32-bit -------- */
9222      /* -------- x,0,0011 ORR (vector, immediate) 32-bit -------- */
9223      /* -------- x,0,0101 ORR (vector, immediate) 32-bit -------- */
9224      /* -------- x,0,0111 ORR (vector, immediate) 32-bit -------- */
9225      case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,1):
9226      case BITS5(0,0,1,0,1): case BITS5(0,0,1,1,1): // 0:0xx1
9227         ok = True; isORR = True; break;
9228
9229      /* -------- x,0,1000 MOVI 16-bit shifted imm -------- */
9230      /* -------- x,0,1010 MOVI 16-bit shifted imm -------- */
9231      case BITS5(0,1,0,0,0): case BITS5(0,1,0,1,0): // 0:10x0
9232         ok = True; isMOV = True; break;
9233
9234      /* -------- x,0,1001 ORR (vector, immediate) 16-bit -------- */
9235      /* -------- x,0,1011 ORR (vector, immediate) 16-bit -------- */
9236      case BITS5(0,1,0,0,1): case BITS5(0,1,0,1,1): // 0:10x1
9237         ok = True; isORR = True; break;
9238
9239      /* -------- x,0,1100 MOVI 32-bit shifting ones -------- */
9240      /* -------- x,0,1101 MOVI 32-bit shifting ones -------- */
9241      case BITS5(0,1,1,0,0): case BITS5(0,1,1,0,1): // 0:110x
9242         ok = True; isMOV = True; break;
9243
9244      /* -------- x,0,1110 MOVI 8-bit -------- */
9245      case BITS5(0,1,1,1,0):
9246         ok = True; isMOV = True; break;
9247
9248      /* -------- x,0,1111 FMOV (vector, immediate, F32) -------- */
9249      case BITS5(0,1,1,1,1): // 0:1111
9250         ok = True; isFMOV = True; break;
9251
9252      /* -------- x,1,0000 MVNI 32-bit shifted imm -------- */
9253      /* -------- x,1,0010 MVNI 32-bit shifted imm  -------- */
9254      /* -------- x,1,0100 MVNI 32-bit shifted imm  -------- */
9255      /* -------- x,1,0110 MVNI 32-bit shifted imm  -------- */
9256      case BITS5(1,0,0,0,0): case BITS5(1,0,0,1,0):
9257      case BITS5(1,0,1,0,0): case BITS5(1,0,1,1,0): // 1:0xx0
9258         ok = True; isMVN = True; break;
9259
9260      /* -------- x,1,0001 BIC (vector, immediate) 32-bit -------- */
9261      /* -------- x,1,0011 BIC (vector, immediate) 32-bit -------- */
9262      /* -------- x,1,0101 BIC (vector, immediate) 32-bit -------- */
9263      /* -------- x,1,0111 BIC (vector, immediate) 32-bit -------- */
9264      case BITS5(1,0,0,0,1): case BITS5(1,0,0,1,1):
9265      case BITS5(1,0,1,0,1): case BITS5(1,0,1,1,1): // 1:0xx1
9266         ok = True; isBIC = True; break;
9267
9268      /* -------- x,1,1000 MVNI 16-bit shifted imm -------- */
9269      /* -------- x,1,1010 MVNI 16-bit shifted imm -------- */
9270      case BITS5(1,1,0,0,0): case BITS5(1,1,0,1,0): // 1:10x0
9271         ok = True; isMVN = True; break;
9272
9273      /* -------- x,1,1001 BIC (vector, immediate) 16-bit -------- */
9274      /* -------- x,1,1011 BIC (vector, immediate) 16-bit -------- */
9275      case BITS5(1,1,0,0,1): case BITS5(1,1,0,1,1): // 1:10x1
9276         ok = True; isBIC = True; break;
9277
9278      /* -------- x,1,1100 MVNI 32-bit shifting ones -------- */
9279      /* -------- x,1,1101 MVNI 32-bit shifting ones -------- */
9280      case BITS5(1,1,1,0,0): case BITS5(1,1,1,0,1): // 1:110x
9281         ok = True; isMVN = True; break;
9282
9283      /* -------- 0,1,1110 MOVI 64-bit scalar -------- */
9284      /* -------- 1,1,1110 MOVI 64-bit vector -------- */
9285      case BITS5(1,1,1,1,0):
9286         ok = True; isMOV = True; break;
9287
9288      /* -------- 1,1,1111 FMOV (vector, immediate, F64) -------- */
9289      case BITS5(1,1,1,1,1): // 1:1111
9290         ok = bitQ == 1; isFMOV = True; break;
9291
9292      default:
9293        break;
9294   }
9295   if (ok) {
9296      vassert(1 == (isMOV ? 1 : 0) + (isMVN ? 1 : 0)
9297                   + (isORR ? 1 : 0) + (isBIC ? 1 : 0) + (isFMOV ? 1 : 0));
9298      ok = AdvSIMDExpandImm(&imm64lo, bitOP, cmode, abcdefgh);
9299   }
9300   if (ok) {
9301      if (isORR || isBIC) {
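         /* BIC is AND with the complement of the immediate, so XOR
            the expanded immediate with all-ones in that case. */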
9302         ULong inv
9303            = isORR ? 0ULL : ~0ULL;
9304         IRExpr* immV128
9305            = binop(Iop_64HLtoV128, mkU64(inv ^ imm64lo), mkU64(inv ^ imm64lo));
9306         IRExpr* res
9307            = binop(isORR ? Iop_OrV128 : Iop_AndV128, getQReg128(dd), immV128);
9308         const HChar* nm = isORR ? "orr" : "bic";
9309         if (bitQ == 0) {
9310            putQReg128(dd, unop(Iop_ZeroHI64ofV128, res));
9311            DIP("%s %s.1d, %016llx\n", nm, nameQReg128(dd), imm64lo);
9312         } else {
9313            putQReg128(dd, res);
9314            DIP("%s %s.2d, #0x%016llx'%016llx\n", nm,
9315                nameQReg128(dd), imm64lo, imm64lo);
9316         }
9317      }
9318      else if (isMOV || isMVN || isFMOV) {
9319         if (isMVN) imm64lo = ~imm64lo;
9320         ULong   imm64hi = bitQ == 0  ? 0  :  imm64lo;
9321         IRExpr* immV128 = binop(Iop_64HLtoV128, mkU64(imm64hi),
9322                                                 mkU64(imm64lo));
9323         putQReg128(dd, immV128);
9324         DIP("mov %s, #0x%016llx'%016llx\n", nameQReg128(dd), imm64hi, imm64lo);
9325      }
9326      return True;
9327   }
9328   /* else fall through */
9329
9330   return False;
9331#  undef INSN
9332}
9333
9334
9335static
9336Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn)
9337{
9338   /* 31    28       20   15 14   10 9 4
9339      01 op 11110000 imm5 0  imm4 1  n d
9340      Decode fields: op,imm4
9341   */
9342#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
9343   if (INSN(31,30) != BITS2(0,1)
9344       || INSN(28,21) != BITS8(1,1,1,1,0,0,0,0)
9345       || INSN(15,15) != 0 || INSN(10,10) != 1) {
9346      return False;
9347   }
9348   UInt bitOP = INSN(29,29);
9349   UInt imm5  = INSN(20,16);
9350   UInt imm4  = INSN(14,11);
9351   UInt nn    = INSN(9,5);
9352   UInt dd    = INSN(4,0);
9353
9354   if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
9355      /* -------- 0,0000 DUP (element, scalar) -------- */
9356      IRTemp w0     = newTemp(Ity_I64);
9357      const HChar* arTs = "??";
9358      IRType laneTy = Ity_INVALID;
9359      UInt   laneNo = 16; /* invalid */
9360      if (imm5 & 1) {
9361         arTs   = "b";
9362         laneNo = (imm5 >> 1) & 15;
9363         laneTy = Ity_I8;
9364         assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy)));
9365      }
9366      else if (imm5 & 2) {
9367         arTs   = "h";
9368         laneNo = (imm5 >> 2) & 7;
9369         laneTy = Ity_I16;
9370         assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy)));
9371      }
9372      else if (imm5 & 4) {
9373         arTs   = "s";
9374         laneNo = (imm5 >> 3) & 3;
9375         laneTy = Ity_I32;
9376         assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy)));
9377      }
9378      else if (imm5 & 8) {
9379         arTs   = "d";
9380         laneNo = (imm5 >> 4) & 1;
9381         laneTy = Ity_I64;
9382         assign(w0, getQRegLane(nn, laneNo, laneTy));
9383      }
9384      else {
9385         /* invalid; leave laneTy unchanged. */
9386      }
9387      /* */
9388      if (laneTy != Ity_INVALID) {
9389         vassert(laneNo < 16);
9390         putQReg128(dd, binop(Iop_64HLtoV128, mkU64(0), mkexpr(w0)));
9391         DIP("dup %s, %s.%s[%u]\n",
9392             nameQRegLO(dd, laneTy), nameQReg128(nn), arTs, laneNo);
9393         return True;
9394      }
9395      /* else fall through */
9396   }
9397
9398   return False;
9399#  undef INSN
9400}
9401
9402
9403static
9404Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn)
9405{
9406   /* 31   28    23 21    16     11 9 4
9407      01 u 11110 sz 11000 opcode 10 n d
9408      Decode fields: u,sz,opcode
9409   */
9410#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
9411   if (INSN(31,30) != BITS2(0,1)
9412       || INSN(28,24) != BITS5(1,1,1,1,0)
9413       || INSN(21,17) != BITS5(1,1,0,0,0)
9414       || INSN(11,10) != BITS2(1,0)) {
9415      return False;
9416   }
9417   UInt bitU   = INSN(29,29);
9418   UInt sz     = INSN(23,22);
9419   UInt opcode = INSN(16,12);
9420   UInt nn     = INSN(9,5);
9421   UInt dd     = INSN(4,0);
9422
9423   if (bitU == 0 && sz == X11 && opcode == BITS5(1,1,0,1,1)) {
9424      /* -------- 0,11,11011 ADDP d_2d -------- */
9425      IRTemp xy = newTempV128();
9426      IRTemp xx = newTempV128();
9427      assign(xy, getQReg128(nn));
9428      assign(xx, binop(Iop_InterleaveHI64x2, mkexpr(xy), mkexpr(xy)));
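      /* xx holds Vn's upper 64-bit lane in both halves, so lane 0 of
         xy + xx is the pairwise sum lo64(Vn) + hi64(Vn). */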
9429      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
9430                          binop(Iop_Add64x2, mkexpr(xy), mkexpr(xx))));
9431      DIP("addp d%u, %s.2d\n", dd, nameQReg128(nn));
9432      return True;
9433   }
9434
9435   if (bitU == 1 && sz <= X01 && opcode == BITS5(0,1,1,0,1)) {
      /* -------- 1,00,01101 FADDP s_2s -------- */
      /* -------- 1,01,01101 FADDP d_2d -------- */
9438      Bool   isD   = sz == X01;
9439      IROp   opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
9440      IROp   opADD = mkVecADDF(isD ? 3 : 2);
9441      IRTemp src   = newTempV128();
9442      IRTemp argL  = newTempV128();
9443      IRTemp argR  = newTempV128();
9444      assign(src, getQReg128(nn));
9445      assign(argL, unop(opZHI, mkexpr(src)));
9446      assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
9447                                                    mkU8(isD ? 8 : 4))));
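      /* argL is lane 0 of Vn and argR is lane 1 slid down to lane 0;
         their FP sum is the pairwise result. */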
9448      putQReg128(dd, unop(opZHI,
9449                          triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
9450                                              mkexpr(argL), mkexpr(argR))));
9451      DIP(isD ? "faddp d%u, v%u.2d\n" : "faddp s%u, v%u.2s\n", dd, nn);
9452      return True;
9453   }
9454
9455   if (bitU == 1
9456       && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
9457      /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */
9458      /* -------- 1,1x,01100 FMINNMP d_2d, s_2s -------- */
9459      /* -------- 1,0x,01111 FMAXP   d_2d, s_2s -------- */
9460      /* -------- 1,1x,01111 FMINP   d_2d, s_2s -------- */
9461      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
9462      Bool   isD   = (sz & 1) == 1;
9463      Bool   isMIN = (sz & 2) == 2;
9464      Bool   isNM  = opcode == BITS5(0,1,1,0,0);
9465      IROp   opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
9466      IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
9467      IRTemp src   = newTempV128();
9468      IRTemp argL  = newTempV128();
9469      IRTemp argR  = newTempV128();
9470      assign(src, getQReg128(nn));
9471      assign(argL, unop(opZHI, mkexpr(src)));
9472      assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
9473                                                    mkU8(isD ? 8 : 4))));
9474      putQReg128(dd, unop(opZHI,
9475                          binop(opMXX, mkexpr(argL), mkexpr(argR))));
9476      HChar c = isD ? 'd' : 's';
9477      DIP("%s%sp %c%u, v%u.2%c\n",
9478           isMIN ? "fmin" : "fmax", isNM ? "nm" : "", c, dd, nn, c);
9479      return True;
9480   }
9481
9482   return False;
9483#  undef INSN
9484}
9485
9486
9487static
9488Bool dis_AdvSIMD_scalar_shift_by_imm(/*MB_OUT*/DisResult* dres, UInt insn)
9489{
9490   /* 31   28     22   18   15     10 9 4
9491      01 u 111110 immh immb opcode 1  n d
9492      Decode fields: u,immh,opcode
9493   */
9494#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
9495   if (INSN(31,30) != BITS2(0,1)
9496       || INSN(28,23) != BITS6(1,1,1,1,1,0) || INSN(10,10) != 1) {
9497      return False;
9498   }
9499   UInt bitU   = INSN(29,29);
9500   UInt immh   = INSN(22,19);
9501   UInt immb   = INSN(18,16);
9502   UInt opcode = INSN(15,11);
9503   UInt nn     = INSN(9,5);
9504   UInt dd     = INSN(4,0);
9505   UInt immhb  = (immh << 3) | immb;
9506
9507   if ((immh & 8) == 8
9508       && (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0))) {
9509      /* -------- 0,1xxx,00000 SSHR d_d_#imm -------- */
9510      /* -------- 1,1xxx,00000 USHR d_d_#imm -------- */
9511      /* -------- 0,1xxx,00010 SSRA d_d_#imm -------- */
9512      /* -------- 1,1xxx,00010 USRA d_d_#imm -------- */
9513      Bool isU   = bitU == 1;
9514      Bool isAcc = opcode == BITS5(0,0,0,1,0);
9515      UInt sh    = 128 - immhb;
9516      vassert(sh >= 1 && sh <= 64);
9517      IROp    op  = isU ? Iop_ShrN64x2 : Iop_SarN64x2;
9518      IRExpr* src = getQReg128(nn);
9519      IRTemp  shf = newTempV128();
9520      IRTemp  res = newTempV128();
9521      if (sh == 64 && isU) {
9522         assign(shf, mkV128(0x0000));
9523      } else {
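         /* A shift of 64 isn't representable by the IR shift ops.
            For the signed case, shifting by 63 gives the same result
            (all copies of the sign bit), so nudge the amount down by
            one; the unsigned case (result zero) was handled above. */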
9524         UInt nudge = 0;
9525         if (sh == 64) {
9526            vassert(!isU);
9527            nudge = 1;
9528         }
9529         assign(shf, binop(op, src, mkU8(sh - nudge)));
9530      }
9531      assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
9532                        : mkexpr(shf));
9533      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9534      const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
9535                              : (isU ? "ushr" : "sshr");
9536      DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
9537      return True;
9538   }
9539
9540   if ((immh & 8) == 8
9541       && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0))) {
9542      /* -------- 0,1xxx,00100 SRSHR d_d_#imm -------- */
9543      /* -------- 1,1xxx,00100 URSHR d_d_#imm -------- */
9544      /* -------- 0,1xxx,00110 SRSRA d_d_#imm -------- */
9545      /* -------- 1,1xxx,00110 URSRA d_d_#imm -------- */
9546      Bool isU   = bitU == 1;
9547      Bool isAcc = opcode == BITS5(0,0,1,1,0);
9548      UInt sh    = 128 - immhb;
9549      vassert(sh >= 1 && sh <= 64);
9550      IROp    op  = isU ? Iop_Rsh64Ux2 : Iop_Rsh64Sx2;
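      /* The rounding-shift ops take per-lane signed shift amounts, a
         negative amount meaning shift right; hence -sh is splatted
         across the shift-amount vector below. */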
9552      IRExpr* src  = getQReg128(nn);
9553      IRTemp  imm8 = newTemp(Ity_I8);
9554      assign(imm8, mkU8((UChar)(-sh)));
9555      IRExpr* amt  = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
9556      IRTemp  shf  = newTempV128();
9557      IRTemp  res  = newTempV128();
9558      assign(shf, binop(op, src, amt));
9559      assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
9560                        : mkexpr(shf));
9561      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9562      const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
9563                              : (isU ? "urshr" : "srshr");
9564      DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
9565      return True;
9566   }
9567
9568   if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,0,0)) {
9569      /* -------- 1,1xxx,01000 SRI d_d_#imm -------- */
9570      UInt sh = 128 - immhb;
9571      vassert(sh >= 1 && sh <= 64);
9572      if (sh == 64) {
9573         putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
9574      } else {
9575         /* sh is in range 1 .. 63 */
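         /* nmask covers the top 'sh' bits: the destination bits that
            SRI must preserve. */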
9576         ULong   nmask  = (ULong)(((Long)0x8000000000000000ULL) >> (sh-1));
9577         IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
9578         IRTemp  res    = newTempV128();
9579         assign(res, binop(Iop_OrV128,
9580                           binop(Iop_AndV128, getQReg128(dd), nmaskV),
9581                           binop(Iop_ShrN64x2, getQReg128(nn), mkU8(sh))));
9582         putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9583      }
9584      DIP("sri d%u, d%u, #%u\n", dd, nn, sh);
9585      return True;
9586   }
9587
9588   if (bitU == 0 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
9589      /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
9590      UInt sh = immhb - 64;
9591      vassert(sh >= 0 && sh < 64);
9592      putQReg128(dd,
9593                 unop(Iop_ZeroHI64ofV128,
9594                      sh == 0 ? getQReg128(nn)
9595                              : binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
9596      DIP("shl d%u, d%u, #%u\n", dd, nn, sh);
9597      return True;
9598   }
9599
9600   if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
9601      /* -------- 1,1xxx,01010 SLI d_d_#imm -------- */
9602      UInt sh = immhb - 64;
9603      vassert(sh >= 0 && sh < 64);
9604      if (sh == 0) {
9605         putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(nn)));
9606      } else {
9607         /* sh is in range 1 .. 63 */
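         /* nmask covers the bottom 'sh' bits: the destination bits
            that SLI must preserve. */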
9608         ULong   nmask  = (1ULL << sh) - 1;
9609         IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
9610         IRTemp  res    = newTempV128();
9611         assign(res, binop(Iop_OrV128,
9612                           binop(Iop_AndV128, getQReg128(dd), nmaskV),
9613                           binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
9614         putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9615      }
9616      DIP("sli d%u, d%u, #%u\n", dd, nn, sh);
9617      return True;
9618   }
9619
9620   if (opcode == BITS5(0,1,1,1,0)
9621       || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
9622      /* -------- 0,01110  SQSHL  #imm -------- */
9623      /* -------- 1,01110  UQSHL  #imm -------- */
9624      /* -------- 1,01100  SQSHLU #imm -------- */
9625      UInt size  = 0;
9626      UInt shift = 0;
9627      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
9628      if (!ok) return False;
9629      vassert(size >= 0 && size <= 3);
9630      /* The shift encoding has opposite sign for the leftwards case.
9631         Adjust shift to compensate. */
9632      UInt lanebits = 8 << size;
9633      shift = lanebits - shift;
9634      vassert(shift >= 0 && shift < lanebits);
9635      const HChar* nm = NULL;
9636      /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
9637      else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
9638      else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
9639      else vassert(0);
9640      IRTemp qDiff1 = IRTemp_INVALID;
9641      IRTemp qDiff2 = IRTemp_INVALID;
      IRTemp res    = IRTemp_INVALID;
9643      IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn));
9644      /* This relies on the fact that the zeroed out lanes generate zeroed
9645         result lanes and don't saturate, so there's no point in trimming
9646         the resulting res, qDiff1 or qDiff2 values. */
9647      math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
9648      putQReg128(dd, mkexpr(res));
9649      updateQCFLAGwithDifference(qDiff1, qDiff2);
9650      const HChar arr = "bhsd"[size];
9651      DIP("%s %c%u, %c%u, #%u\n", nm, arr, dd, arr, nn, shift);
9652      return True;
9653   }
9654
9655   if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
9656       || (bitU == 1
9657           && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
9658      /* -------- 0,10010   SQSHRN #imm -------- */
9659      /* -------- 1,10010   UQSHRN #imm -------- */
9660      /* -------- 0,10011  SQRSHRN #imm -------- */
9661      /* -------- 1,10011  UQRSHRN #imm -------- */
9662      /* -------- 1,10000  SQSHRUN #imm -------- */
9663      /* -------- 1,10001 SQRSHRUN #imm -------- */
9664      UInt size  = 0;
9665      UInt shift = 0;
9666      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
9667      if (!ok || size == X11) return False;
9668      vassert(size >= X00 && size <= X10);
9669      vassert(shift >= 1 && shift <= (8 << size));
9670      const HChar* nm = "??";
9671      IROp op = Iop_INVALID;
9672      /* Decide on the name and the operation. */
9673      /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
9674         nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
9675      }
9676      else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
9677         nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
9678      }
9679      else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
9680         nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
9681      }
9682      else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
9683         nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
9684      }
9685      else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
9686         nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
9687      }
9688      else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
9689         nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
9690      }
9691      else vassert(0);
9692      /* Compute the result (Q, shifted value) pair. */
9693      IRTemp src128 = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size+1, getQReg128(nn));
9694      IRTemp pair   = newTempV128();
9695      assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
9696      /* Update the result reg */
9697      IRTemp res64in128 = newTempV128();
9698      assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
9699      putQReg128(dd, mkexpr(res64in128));
9700      /* Update the Q flag. */
9701      IRTemp q64q64 = newTempV128();
9702      assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
9703      IRTemp z128 = newTempV128();
9704      assign(z128, mkV128(0x0000));
9705      updateQCFLAGwithDifference(q64q64, z128);
9706      /* */
9707      const HChar arrNarrow = "bhsd"[size];
9708      const HChar arrWide   = "bhsd"[size+1];
9709      DIP("%s %c%u, %c%u, #%u\n", nm, arrNarrow, dd, arrWide, nn, shift);
9710      return True;
9711   }
9712
9713   if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,0,0)) {
9714      /* -------- 0,!=00xx,11100 SCVTF d_d_imm, s_s_imm -------- */
9715      /* -------- 1,!=00xx,11100 UCVTF d_d_imm, s_s_imm -------- */
9716      UInt size  = 0;
9717      UInt fbits = 0;
9718      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
9719      /* The following holds because immh is never zero. */
9720      vassert(ok);
9721      /* The following holds because immh >= 0100. */
9722      vassert(size == X10 || size == X11);
9723      Bool isD = size == X11;
9724      Bool isU = bitU == 1;
9725      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
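      /* Convert the integer lane to FP, then scale by 2^-fbits to
         account for the fixed-point fraction. */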
9726      Double  scale  = two_to_the_minus(fbits);
9727      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
                           : IRExpr_Const(IRConst_F32( (Float)scale ));
9729      IROp    opMUL  = isD ? Iop_MulF64 : Iop_MulF32;
9730      IROp    opCVT  = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
9731                           : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
9732      IRType tyF = isD ? Ity_F64 : Ity_F32;
9733      IRType tyI = isD ? Ity_I64 : Ity_I32;
9734      IRTemp src = newTemp(tyI);
9735      IRTemp res = newTemp(tyF);
9736      IRTemp rm  = mk_get_IR_rounding_mode();
9737      assign(src, getQRegLane(nn, 0, tyI));
9738      assign(res, triop(opMUL, mkexpr(rm),
9739                               binop(opCVT, mkexpr(rm), mkexpr(src)), scaleE));
9740      putQRegLane(dd, 0, mkexpr(res));
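      /* Zero the rest of the register: bits 32:63 in the single
         precision case, and bits 64:127 in both cases. */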
9741      if (!isD) {
9742         putQRegLane(dd, 1, mkU32(0));
9743      }
9744      putQRegLane(dd, 1, mkU64(0));
9745      const HChar ch = isD ? 'd' : 's';
9746      DIP("%s %c%u, %c%u, #%u\n", isU ? "ucvtf" : "scvtf",
9747          ch, dd, ch, nn, fbits);
9748      return True;
9749   }
9750
9751   if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,1,1)) {
9752      /* -------- 0,!=00xx,11111 FCVTZS d_d_imm, s_s_imm -------- */
9753      /* -------- 1,!=00xx,11111 FCVTZU d_d_imm, s_s_imm -------- */
9754      UInt size  = 0;
9755      UInt fbits = 0;
9756      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
9757      /* The following holds because immh is never zero. */
9758      vassert(ok);
9759      /* The following holds because immh >= 0100. */
9760      vassert(size == X10 || size == X11);
9761      Bool isD = size == X11;
9762      Bool isU = bitU == 1;
9763      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
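      /* Scale the value by 2^fbits, then convert it to integer with
         round-to-zero. */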
9764      Double  scale  = two_to_the_plus(fbits);
9765      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
9766                           : IRExpr_Const(IRConst_F32( (Float)scale ));
9767      IROp    opMUL  = isD ? Iop_MulF64 : Iop_MulF32;
9768      IROp    opCVT  = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
9769                           : (isD ? Iop_F64toI64S : Iop_F32toI32S);
9770      IRType tyF = isD ? Ity_F64 : Ity_F32;
9771      IRType tyI = isD ? Ity_I64 : Ity_I32;
9772      IRTemp src = newTemp(tyF);
9773      IRTemp res = newTemp(tyI);
9774      IRTemp rm  = newTemp(Ity_I32);
9775      assign(src, getQRegLane(nn, 0, tyF));
9776      assign(rm,  mkU32(Irrm_ZERO));
9777      assign(res, binop(opCVT, mkexpr(rm),
9778                               triop(opMUL, mkexpr(rm), mkexpr(src), scaleE)));
9779      putQRegLane(dd, 0, mkexpr(res));
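      /* Zero the rest of the register: bits 32:63 in the single
         precision case, and bits 64:127 in both cases. */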
9780      if (!isD) {
9781         putQRegLane(dd, 1, mkU32(0));
9782      }
9783      putQRegLane(dd, 1, mkU64(0));
9784      const HChar ch = isD ? 'd' : 's';
9785      DIP("%s %c%u, %c%u, #%u\n", isU ? "fcvtzu" : "fcvtzs",
9786          ch, dd, ch, nn, fbits);
9787      return True;
9788   }
9789
9790#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
9791   return False;
9792#  undef INSN
9793}
9794
9795
9796static
9797Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
9798{
9799   /* 31 29 28    23   21 20 15     11 9 4
9800      01 U  11110 size 1  m  opcode 00 n d
9801      Decode fields: u,opcode
9802   */
9803#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
9804   if (INSN(31,30) != BITS2(0,1)
9805       || INSN(28,24) != BITS5(1,1,1,1,0)
9806       || INSN(21,21) != 1
9807       || INSN(11,10) != BITS2(0,0)) {
9808      return False;
9809   }
9810   UInt bitU   = INSN(29,29);
9811   UInt size   = INSN(23,22);
9812   UInt mm     = INSN(20,16);
9813   UInt opcode = INSN(15,12);
9814   UInt nn     = INSN(9,5);
9815   UInt dd     = INSN(4,0);
9816   vassert(size < 4);
9817
9818   if (bitU == 0
9819       && (opcode == BITS4(1,1,0,1)
9820           || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
9821      /* -------- 0,1101  SQDMULL -------- */ // 0 (ks)
9822      /* -------- 0,1001  SQDMLAL -------- */ // 1
9823      /* -------- 0,1011  SQDMLSL -------- */ // 2
9824      /* Widens, and size refers to the narrowed lanes. */
9825      UInt ks = 3;
9826      switch (opcode) {
9827         case BITS4(1,1,0,1): ks = 0; break;
9828         case BITS4(1,0,0,1): ks = 1; break;
9829         case BITS4(1,0,1,1): ks = 2; break;
9830         default: vassert(0);
9831      }
9832      vassert(ks >= 0 && ks <= 2);
9833      if (size == X00 || size == X11) return False;
9834      vassert(size <= 2);
9835      IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
9836      vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
9837      newTempsV128_3(&vecN, &vecM, &vecD);
9838      assign(vecN, getQReg128(nn));
9839      assign(vecM, getQReg128(mm));
9840      assign(vecD, getQReg128(dd));
9841      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
9842                       False/*!is2*/, size, "mas"[ks],
9843                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
9844      IROp opZHI = mkVecZEROHIxxOFV128(size+1);
9845      putQReg128(dd, unop(opZHI, mkexpr(res)));
9846      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
9847      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
9848      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
9849         updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
9850      }
9851      const HChar* nm        = ks == 0 ? "sqdmull"
9852                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
9853      const HChar  arrNarrow = "bhsd"[size];
9854      const HChar  arrWide   = "bhsd"[size+1];
9855      DIP("%s %c%u, %c%u, %c%u\n",
9856          nm, arrWide, dd, arrNarrow, nn, arrNarrow, mm);
9857      return True;
9858   }
9859
9860   return False;
9861#  undef INSN
9862}
9863
9864
9865static
9866Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
9867{
9868   /* 31 29 28    23   21 20 15     10 9 4
9869      01 U  11110 size 1  m  opcode 1  n d
9870      Decode fields: u,size,opcode
9871   */
9872#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
9873   if (INSN(31,30) != BITS2(0,1)
9874       || INSN(28,24) != BITS5(1,1,1,1,0)
9875       || INSN(21,21) != 1
9876       || INSN(10,10) != 1) {
9877      return False;
9878   }
9879   UInt bitU   = INSN(29,29);
9880   UInt size   = INSN(23,22);
9881   UInt mm     = INSN(20,16);
9882   UInt opcode = INSN(15,11);
9883   UInt nn     = INSN(9,5);
9884   UInt dd     = INSN(4,0);
9885   vassert(size < 4);
9886
9887   if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
9888      /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
9889      /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
9890      /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
9891      /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
9892      Bool isADD = opcode == BITS5(0,0,0,0,1);
9893      Bool isU   = bitU == 1;
9894      IROp qop   = Iop_INVALID;
9895      IROp nop   = Iop_INVALID;
9896      if (isADD) {
9897         qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
9898         nop = mkVecADD(size);
9899      } else {
9900         qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
9901         nop = mkVecSUB(size);
9902      }
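      /* Compute both the saturating and the non-saturating result,
         and set QC if they differ in the lane of interest. */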
9903      IRTemp argL = newTempV128();
9904      IRTemp argR = newTempV128();
9905      IRTemp qres = newTempV128();
9906      IRTemp nres = newTempV128();
9907      assign(argL, getQReg128(nn));
9908      assign(argR, getQReg128(mm));
9909      assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
9910                             size, binop(qop, mkexpr(argL), mkexpr(argR)))));
9911      assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
9912                             size, binop(nop, mkexpr(argL), mkexpr(argR)))));
9913      putQReg128(dd, mkexpr(qres));
9914      updateQCFLAGwithDifference(qres, nres);
9915      const HChar* nm  = isADD ? (isU ? "uqadd" : "sqadd")
9916                               : (isU ? "uqsub" : "sqsub");
9917      const HChar  arr = "bhsd"[size];
9918      DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
9919      return True;
9920   }
9921
9922   if (size == X11 && opcode == BITS5(0,0,1,1,0)) {
9923      /* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
9924      /* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
9925      Bool    isGT = bitU == 0;
9926      IRExpr* argL = getQReg128(nn);
9927      IRExpr* argR = getQReg128(mm);
9928      IRTemp  res  = newTempV128();
9929      assign(res,
9930             isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
9931                  : binop(Iop_CmpGT64Ux2, argL, argR));
9932      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9933      DIP("%s %s, %s, %s\n",isGT ? "cmgt" : "cmhi",
9934          nameQRegLO(dd, Ity_I64),
9935          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
9936      return True;
9937   }
9938
9939   if (size == X11 && opcode == BITS5(0,0,1,1,1)) {
9940      /* -------- 0,11,00111 CMGE d_d_d -------- */ // >=s
9941      /* -------- 1,11,00111 CMHS d_d_d -------- */ // >=u
9942      Bool    isGE = bitU == 0;
9943      IRExpr* argL = getQReg128(nn);
9944      IRExpr* argR = getQReg128(mm);
9945      IRTemp  res  = newTempV128();
9946      assign(res,
9947             isGE ? unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL))
9948                  : unop(Iop_NotV128, binop(Iop_CmpGT64Ux2, argR, argL)));
9949      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9950      DIP("%s %s, %s, %s\n", isGE ? "cmge" : "cmhs",
9951          nameQRegLO(dd, Ity_I64),
9952          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
9953      return True;
9954   }
9955
9956   if (size == X11 && (opcode == BITS5(0,1,0,0,0)
9957                       || opcode == BITS5(0,1,0,1,0))) {
9958      /* -------- 0,xx,01000 SSHL  d_d_d -------- */
9959      /* -------- 0,xx,01010 SRSHL d_d_d -------- */
9960      /* -------- 1,xx,01000 USHL  d_d_d -------- */
9961      /* -------- 1,xx,01010 URSHL d_d_d -------- */
9962      Bool isU = bitU == 1;
9963      Bool isR = opcode == BITS5(0,1,0,1,0);
9964      IROp op  = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
9965                     : (isU ? mkVecSHU(size)  : mkVecSHS(size));
9966      IRTemp res = newTempV128();
9967      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
9968      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9969      const HChar* nm  = isR ? (isU ? "urshl" : "srshl")
9970                             : (isU ? "ushl"  : "sshl");
9971      DIP("%s %s, %s, %s\n", nm,
9972          nameQRegLO(dd, Ity_I64),
9973          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
9974      return True;
9975   }
9976
9977   if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
9978      /* -------- 0,xx,01001 SQSHL  std4_std4_std4 -------- */
9979      /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
9980      /* -------- 1,xx,01001 UQSHL  std4_std4_std4 -------- */
9981      /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */
9982      Bool isU = bitU == 1;
9983      Bool isR = opcode == BITS5(0,1,0,1,1);
9984      IROp op  = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
9985                     : (isU ? mkVecQANDUQSH(size)  : mkVecQANDSQSH(size));
9986      /* This is a bit tricky.  Since we're only interested in the lowest
9987         lane of the result, we zero out all the rest in the operands, so
9988         as to ensure that other lanes don't pollute the returned Q value.
9989         This works because it means, for the lanes we don't care about, we
9990         are shifting zero by zero, which can never saturate. */
9991      IRTemp res256 = newTemp(Ity_V256);
9992      IRTemp resSH  = newTempV128();
9993      IRTemp resQ   = newTempV128();
9994      IRTemp zero   = newTempV128();
9995      assign(
9996         res256,
9997         binop(op,
9998               mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn))),
9999               mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(mm)))));
10000      assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
10001      assign(resQ,  unop(Iop_V256toV128_1, mkexpr(res256)));
10002      assign(zero,  mkV128(0x0000));
10003      putQReg128(dd, mkexpr(resSH));
10004      updateQCFLAGwithDifference(resQ, zero);
10005      const HChar* nm  = isR ? (isU ? "uqrshl" : "sqrshl")
10006                             : (isU ? "uqshl"  : "sqshl");
10007      const HChar  arr = "bhsd"[size];
10008      DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10009      return True;
10010   }
10011
10012   if (size == X11 && opcode == BITS5(1,0,0,0,0)) {
10013      /* -------- 0,11,10000 ADD d_d_d -------- */
10014      /* -------- 1,11,10000 SUB d_d_d -------- */
10015      Bool   isSUB = bitU == 1;
10016      IRTemp res   = newTemp(Ity_I64);
10017      assign(res, binop(isSUB ? Iop_Sub64 : Iop_Add64,
10018                        getQRegLane(nn, 0, Ity_I64),
10019                        getQRegLane(mm, 0, Ity_I64)));
10020      putQRegLane(dd, 0, mkexpr(res));
10021      putQRegLane(dd, 1, mkU64(0));
10022      DIP("%s %s, %s, %s\n", isSUB ? "sub" : "add",
10023          nameQRegLO(dd, Ity_I64),
10024          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10025      return True;
10026   }
10027
10028   if (size == X11 && opcode == BITS5(1,0,0,0,1)) {
10029      /* -------- 0,11,10001 CMTST d_d_d -------- */ // &, != 0
10030      /* -------- 1,11,10001 CMEQ  d_d_d -------- */ // ==
10031      Bool    isEQ = bitU == 1;
10032      IRExpr* argL = getQReg128(nn);
10033      IRExpr* argR = getQReg128(mm);
10034      IRTemp  res  = newTempV128();
10035      assign(res,
10036             isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
10037                  : unop(Iop_NotV128, binop(Iop_CmpEQ64x2,
10038                                            binop(Iop_AndV128, argL, argR),
10039                                            mkV128(0x0000))));
10040      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10041      DIP("%s %s, %s, %s\n", isEQ ? "cmeq" : "cmtst",
10042          nameQRegLO(dd, Ity_I64),
10043          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10044      return True;
10045   }
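   /* Reference sketch (not part of the decoder): per-lane semantics of the
      two comparisons above, for 64-bit lanes; helper names are illustrative.

         #include <stdint.h>
         uint64_t ref_cmeq64  ( uint64_t a, uint64_t b ) {
            return a == b ? ~0ULL : 0;          // CMEQ: all-ones on equality
         }
         uint64_t ref_cmtst64 ( uint64_t a, uint64_t b ) {
            return (a & b) != 0 ? ~0ULL : 0;    // CMTST: any bits in common
         }

      CMTST is expressed above as NOT(CMEQ(a & b, 0)), which is the same
      predicate. */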

   if (opcode == BITS5(1,0,1,1,0)) {
      /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
      /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
      if (size == X00 || size == X11) return False;
      Bool isR = bitU == 1;
      IRTemp res, sat1q, sat1n, vN, vM;
      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
      newTempsV128_2(&vN, &vM);
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));
      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
      putQReg128(dd,
                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
      updateQCFLAGwithDifference(
         math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1q)),
         math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1n)));
      const HChar  arr = "bhsd"[size];
      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
      DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
      return True;
   }
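   /* Worked example (illustrative): for the 16-bit case, SQDMULH computes
      sat( (2 * n * m) >> 16 ).  The only saturating input pair is
      n == m == 0x8000 (-32768): 2 * (-32768) * (-32768) == 2^31 does not
      fit in 32 signed bits, so the doubled product saturates and the
      result is 0x7FFF with QC set.  SQRDMULH additionally adds the
      rounding constant 2^15 before taking the top half, i.e.
      sat( (2*n*m + 0x8000) >> 16 ). */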

   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
      /* -------- 1,1x,11010 FABD d_d_d, s_s_s -------- */
      IRType ity = size == X11 ? Ity_F64 : Ity_F32;
      IRTemp res = newTemp(ity);
      assign(res, unop(mkABSF(ity),
                       triop(mkSUBF(ity),
                             mkexpr(mk_get_IR_rounding_mode()),
                             getQRegLO(nn,ity), getQRegLO(mm,ity))));
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      DIP("fabd %s, %s, %s\n",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }

   if (bitU == 0 && size <= X01 && opcode == BITS5(1,1,0,1,1)) {
      /* -------- 0,0x,11011 FMULX d_d_d, s_s_s -------- */
      // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
      IRType ity = size == X01 ? Ity_F64 : Ity_F32;
      IRTemp res = newTemp(ity);
      assign(res, triop(mkMULF(ity),
                        mkexpr(mk_get_IR_rounding_mode()),
                        getQRegLO(nn,ity), getQRegLO(mm,ity)));
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      DIP("fmulx %s, %s, %s\n",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }
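   /* Note on the kludge above: architecturally, FMULX differs from FMUL
      only for the (zero, infinity) operand pairs, for which FMUL produces
      the default NaN whereas FMULX produces 2.0 with the XOR of the
      operand signs.  A proper fix would special-case exactly those inputs
      and keep the plain multiply for everything else. */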

   if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 0,0x,11100 FCMEQ d_d_d, s_s_s -------- */
      /* -------- 1,0x,11100 FCMGE d_d_d, s_s_s -------- */
      Bool   isD   = size == X01;
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      Bool   isGE  = bitU == 1;
      IROp   opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
                          : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
      IRTemp res   = newTempV128();
      assign(res, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
                       : binop(opCMP, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));
      DIP("%s %s, %s, %s\n", isGE ? "fcmge" : "fcmeq",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }

   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 1,1x,11100 FCMGT d_d_d, s_s_s -------- */
      Bool   isD   = size == X11;
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      IROp   opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
      IRTemp res   = newTempV128();
      assign(res, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));
      DIP("%s %s, %s, %s\n", "fcmgt",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }

   if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 1,0x,11101 FACGE d_d_d, s_s_s -------- */
      /* -------- 1,1x,11101 FACGT d_d_d, s_s_s -------- */
      Bool   isD   = (size & 1) == 1;
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      Bool   isGT  = (size & 2) == 2;
      IROp   opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
                          : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
      IROp   opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
      IRTemp res   = newTempV128();
      assign(res, binop(opCMP, unop(opABS, getQReg128(mm)),
                               unop(opABS, getQReg128(nn)))); // swapd
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));
      DIP("%s %s, %s, %s\n", isGT ? "facgt" : "facge",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }
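   /* Worked equation (illustrative): the absolute-value compares above
      implement, per lane,

         FACGE:  |n| >= |m|   computed as   |m| <= |n|
         FACGT:  |n| >  |m|   computed as   |m| <  |n|

      hence the operand swap ("swapd") feeding CmpLE/CmpLT. */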

   if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,0x,11111: FRECPS  d_d_d, s_s_s -------- */
      /* -------- 0,1x,11111: FRSQRTS d_d_d, s_s_s -------- */
      Bool isSQRT = (size & 2) == 2;
      Bool isD    = (size & 1) == 1;
      IROp op     = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
                           : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
      IRTemp res = newTempV128();
      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));
      HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, %c%u\n", isSQRT ? "frsqrts" : "frecps",
          c, dd, c, nn, c, mm);
      return True;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 29 28    23   21    16     11 9 4
      01 U  11110 size 10000 opcode 10 n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,17) != BITS5(1,0,0,0,0)
       || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);

   if (opcode == BITS5(0,0,0,1,1)) {
      /* -------- 0,xx,00011: SUQADD std4_std4 -------- */
      /* -------- 1,xx,00011: USQADD std4_std4 -------- */
      /* These are a bit tricky (to say the least).  See comments on
         the vector variants (in dis_AdvSIMD_two_reg_misc) below for
         details. */
      Bool   isUSQADD = bitU == 1;
      IROp   qop  = isUSQADD ? mkVecQADDEXTSUSATUU(size)
                             : mkVecQADDEXTUSSATSS(size);
      IROp   nop  = mkVecADD(size);
      IRTemp argL = newTempV128();
      IRTemp argR = newTempV128();
      assign(argL, getQReg128(nn));
      assign(argR, getQReg128(dd));
      IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                       size, binop(qop, mkexpr(argL), mkexpr(argR)));
      IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                       size, binop(nop, mkexpr(argL), mkexpr(argR)));
      putQReg128(dd, mkexpr(qres));
      updateQCFLAGwithDifference(qres, nres);
      const HChar arr = "bhsd"[size];
      DIP("%s %c%u, %c%u\n", isUSQADD ? "usqadd" : "suqadd", arr, dd, arr, nn);
      return True;
   }

   if (opcode == BITS5(0,0,1,1,1)) {
      /* -------- 0,xx,00111 SQABS std4_std4 -------- */
      /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
      Bool isNEG = bitU == 1;
      IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
      (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
                                         getQReg128(nn), size );
      IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(qresFW));
      IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(nresFW));
      putQReg128(dd, mkexpr(qres));
      updateQCFLAGwithDifference(qres, nres);
      const HChar arr = "bhsd"[size];
      DIP("%s %c%u, %c%u\n", isNEG ? "sqneg" : "sqabs", arr, dd, arr, nn);
      return True;
   }

   if (size == X11 && opcode == BITS5(0,1,0,0,0)) {
      /* -------- 0,11,01000: CMGT d_d_#0 -------- */ // >s 0
      /* -------- 1,11,01000: CMGE d_d_#0 -------- */ // >=s 0
      Bool    isGT = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = mkV128(0x0000);
      IRTemp  res  = newTempV128();
      assign(res, isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
                       : unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL)));
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      DIP("cm%s d%u, d%u, #0\n", isGT ? "gt" : "ge", dd, nn);
      return True;
   }

   if (size == X11 && opcode == BITS5(0,1,0,0,1)) {
      /* -------- 0,11,01001: CMEQ d_d_#0 -------- */ // == 0
      /* -------- 1,11,01001: CMLE d_d_#0 -------- */ // <=s 0
      Bool    isEQ = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = mkV128(0x0000);
      IRTemp  res  = newTempV128();
      assign(res, isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
                       : unop(Iop_NotV128,
                              binop(Iop_CmpGT64Sx2, argL, argR)));
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      DIP("cm%s d%u, d%u, #0\n", isEQ ? "eq" : "le", dd, nn);
      return True;
   }

   if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,0)) {
      /* -------- 0,11,01010: CMLT d_d_#0 -------- */ // <s 0
      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
                          binop(Iop_CmpGT64Sx2, mkV128(0x0000),
                                                getQReg128(nn))));
      DIP("cm%s d%u, d%u, #0\n", "lt", dd, nn);
      return True;
   }

   if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
      /* -------- 0,11,01011 ABS d_d -------- */
      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
                          unop(Iop_Abs64x2, getQReg128(nn))));
      DIP("abs d%u, d%u\n", dd, nn);
      return True;
   }

   if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
      /* -------- 1,11,01011 NEG d_d -------- */
      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
                          binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn))));
      DIP("neg d%u, d%u\n", dd, nn);
      return True;
   }

   UInt ix = 0; /*INVALID*/
   if (size >= X10) {
      switch (opcode) {
         case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
         case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
         case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
         default: break;
      }
   }
   if (ix > 0) {
      /* -------- 0,1x,01100 FCMGT d_d_#0.0, s_s_#0.0 (ix 1) -------- */
      /* -------- 0,1x,01101 FCMEQ d_d_#0.0, s_s_#0.0 (ix 2) -------- */
      /* -------- 0,1x,01110 FCMLT d_d_#0.0, s_s_#0.0 (ix 3) -------- */
      /* -------- 1,1x,01100 FCMGE d_d_#0.0, s_s_#0.0 (ix 4) -------- */
      /* -------- 1,1x,01101 FCMLE d_d_#0.0, s_s_#0.0 (ix 5) -------- */
      Bool   isD     = size == X11;
      IRType ity     = isD ? Ity_F64 : Ity_F32;
      IROp   opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
      IROp   opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
      IROp   opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
      IROp   opCmp   = Iop_INVALID;
      Bool   swap    = False;
      const HChar* nm = "??";
      switch (ix) {
         case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
         case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
         case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
         case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
         case 5: nm = "fcmle"; opCmp = opCmpLE; break;
         default: vassert(0);
      }
      IRExpr* zero = mkV128(0x0000);
      IRTemp res = newTempV128();
      assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
                       : binop(opCmp, getQReg128(nn), zero));
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));

      DIP("%s %s, %s, #0.0\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
      return True;
   }

   if (opcode == BITS5(1,0,1,0,0)
       || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
      /* -------- 0,xx,10100: SQXTN -------- */
      /* -------- 1,xx,10100: UQXTN -------- */
      /* -------- 1,xx,10010: SQXTUN -------- */
      if (size == X11) return False;
      vassert(size < 3);
      IROp  opN    = Iop_INVALID;
      Bool  zWiden = True;
      const HChar* nm = "??";
      /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
         opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
      }
      else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
         opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
         opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
      }
      else vassert(0);
      IRTemp src  = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                       size+1, getQReg128(nn));
      IRTemp resN = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                       size, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
      putQReg128(dd, mkexpr(resN));
      /* Re-widening the narrowed result maps zero lanes back to zero, and
         the non-lowest lanes of src were zeroed above, so when the two are
         compared the non-participating lanes make no contribution to the
         Q flag state. */
      IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
                                              size, mkexpr(resN));
      updateQCFLAGwithDifference(src, resW);
      const HChar arrNarrow = "bhsd"[size];
      const HChar arrWide   = "bhsd"[size+1];
      DIP("%s %c%u, %c%u\n", nm, arrNarrow, dd, arrWide, nn);
      return True;
   }
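   /* Worked example (illustrative): SQXTUN narrows signed lanes to the
      unsigned range of the half-width type.  For the s -> h (32 -> 16 bit)
      case, with a hypothetical helper sqxtun16:

         sqxtun16(0x00012345) == 0xFFFF   // > 65535: clamps high, QC set
         sqxtun16(0xFFFFFFFF) == 0x0000   // negative: clamps to 0, QC set
         sqxtun16(0x00001234) == 0x1234   // in range: QC unchanged

      The re-widen-and-compare trick above sets QC exactly when the
      narrowed value fails to round-trip. */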

   if (opcode == BITS5(1,0,1,1,0) && bitU == 1 && size == X01) {
      /* -------- 1,01,10110 FCVTXN s_d -------- */
      /* Using Irrm_NEAREST here isn't right.  The docs say "round to
         odd" but I don't know what that really means. */
      putQRegLO(dd,
                binop(Iop_F64toF32, mkU32(Irrm_NEAREST),
                                    getQRegLO(nn, Ity_F64)));
      putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
      putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
      DIP("fcvtxn s%u, d%u\n", dd, nn);
      return True;
   }

   ix = 0; /*INVALID*/
   switch (opcode) {
      case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
      case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
      case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
      default: break;
   }
   if (ix > 0) {
      /* -------- 0,0x,11010 FCVTNS d_d, s_s (ix 1) -------- */
      /* -------- 0,0x,11011 FCVTMS d_d, s_s (ix 2) -------- */
      /* -------- 0,0x,11100 FCVTAS d_d, s_s (ix 3) -------- */
      /* -------- 0,1x,11010 FCVTPS d_d, s_s (ix 4) -------- */
      /* -------- 0,1x,11011 FCVTZS d_d, s_s (ix 5) -------- */
      /* -------- 1,0x,11010 FCVTNU d_d, s_s (ix 1) -------- */
      /* -------- 1,0x,11011 FCVTMU d_d, s_s (ix 2) -------- */
      /* -------- 1,0x,11100 FCVTAU d_d, s_s (ix 3) -------- */
      /* -------- 1,1x,11010 FCVTPU d_d, s_s (ix 4) -------- */
      /* -------- 1,1x,11011 FCVTZU d_d, s_s (ix 5) -------- */
      Bool           isD  = (size & 1) == 1;
      IRType         tyF  = isD ? Ity_F64 : Ity_F32;
      IRType         tyI  = isD ? Ity_I64 : Ity_I32;
      IRRoundingMode irrm = 8; /*impossible*/
      HChar          ch   = '?';
      switch (ix) {
         case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
         case 2: ch = 'm'; irrm = Irrm_NegINF;  break;
         case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
         case 4: ch = 'p'; irrm = Irrm_PosINF;  break;
         case 5: ch = 'z'; irrm = Irrm_ZERO;    break;
         default: vassert(0);
      }
      IROp cvt = Iop_INVALID;
      if (bitU == 1) {
         cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
      } else {
         cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
      }
      IRTemp src = newTemp(tyF);
      IRTemp res = newTemp(tyI);
      assign(src, getQRegLane(nn, 0, tyF));
      assign(res, binop(cvt, mkU32(irrm), mkexpr(src)));
      putQRegLane(dd, 0, mkexpr(res)); /* bits 31-0 or 63-0 */
      if (!isD) {
         putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
      }
      putQRegLane(dd, 1, mkU64(0));    /* bits 127-64 */
      HChar sOrD = isD ? 'd' : 's';
      DIP("fcvt%c%c %c%u, %c%u\n", ch, bitU == 1 ? 'u' : 's',
          sOrD, dd, sOrD, nn);
      return True;
   }
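   /* Worked example (illustrative): the middle suffix letter selects the
      rounding mode used for the conversion, e.g. for an input of -1.5:

         fcvtms (toward -inf)  -> -2
         fcvtps (toward +inf)  -> -1
         fcvtzs (toward zero)  -> -1
         fcvtns (nearest/even) -> -2

      FCVTAS differs from FCVTNS only on ties, rounding away from zero:
      fcvtas(2.5) == 3 but fcvtns(2.5) == 2.  Irrm_NEAREST ties to even,
      which is why case 3 is flagged "kludge?" above. */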

   if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 0,0x,11101: SCVTF d_d, s_s -------- */
      /* -------- 1,0x,11101: UCVTF d_d, s_s -------- */
      Bool   isU = bitU == 1;
      Bool   isD = (size & 1) == 1;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      IROp   iop = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
                       : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
      IRTemp rm  = mk_get_IR_rounding_mode();
      putQRegLO(dd, binop(iop, mkexpr(rm), getQRegLO(nn, tyI)));
      if (!isD) {
         putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
      }
      putQRegLane(dd, 1, mkU64(0));    /* bits 127-64 */
      HChar c = isD ? 'd' : 's';
      DIP("%ccvtf %c%u, %c%u\n", isU ? 'u' : 's', c, dd, c, nn);
      return True;
   }

   if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 0,1x,11101: FRECPE  d_d, s_s -------- */
      /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
      Bool isSQRT = bitU == 1;
      Bool isD    = (size & 1) == 1;
      IROp op     = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
                           : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
      IRTemp resV = newTempV128();
      assign(resV, unop(op, getQReg128(nn)));
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(resV))));
      HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u\n", isSQRT ? "frsqrte" : "frecpe", c, dd, c, nn);
      return True;
   }

   if (bitU == 0 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,1x,11111: FRECPX  d_d, s_s -------- */
      Bool   isD = (size & 1) == 1;
      IRType ty  = isD ? Ity_F64 : Ity_F32;
      IROp   op  = isD ? Iop_RecpExpF64 : Iop_RecpExpF32;
      IRTemp res = newTemp(ty);
      IRTemp rm  = mk_get_IR_rounding_mode();
      assign(res, binop(op, mkexpr(rm), getQRegLane(nn, 0, ty)));
      putQReg128(dd, mkV128(0x0000));
      putQRegLane(dd, 0, mkexpr(res));
      HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u\n", "frecpx", c, dd, c, nn);
      return True;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   28    23   21 20 19 15     11   9 4
      01 U 11111 size L  M  m  opcode H  0 n d
      Decode fields are: u,size,opcode
      M is really part of the mm register number.  Individual
      cases need to inspect L and H though.
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) != 0) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt bitL   = INSN(21,21);
   UInt bitM   = INSN(20,20);
   UInt mmLO4  = INSN(19,16);
   UInt opcode = INSN(15,12);
   UInt bitH   = INSN(11,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);
   vassert(bitH < 2 && bitM < 2 && bitL < 2);

   if (bitU == 0 && size >= X10
       && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
      /* -------- 0,1x,0001 FMLA d_d_d[], s_s_s[] -------- */
      /* -------- 0,1x,0101 FMLS d_d_d[], s_s_s[] -------- */
      Bool isD   = (size & 1) == 1;
      Bool isSUB = opcode == BITS4(0,1,0,1);
      UInt index;
      if      (!isD)             index = (bitH << 1) | bitL;
      else if (isD && bitL == 0) index = bitH;
      else return False; // sz:L == x11 => unallocated encoding
      vassert(index < (isD ? 2 : 4));
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      IRTemp elem  = newTemp(ity);
      UInt   mm    = (bitM << 4) | mmLO4;
      assign(elem, getQRegLane(mm, index, ity));
      IRTemp dupd  = math_DUP_TO_V128(elem, ity);
      IROp   opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
      IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
      IRTemp rm    = mk_get_IR_rounding_mode();
      IRTemp t1    = newTempV128();
      IRTemp t2    = newTempV128();
      // FIXME: double rounding; use FMA primops instead
      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
      assign(t2, triop(isSUB ? opSUB : opADD,
                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
      putQReg128(dd,
                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
                                                         mkexpr(t2))));
      const HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
          c, dd, c, nn, nameQReg128(mm), c, index);
      return True;
   }

   if (size >= X10 && opcode == BITS4(1,0,0,1)) {
      /* -------- 0,1x,1001 FMUL  d_d_d[], s_s_s[] -------- */
      /* -------- 1,1x,1001 FMULX d_d_d[], s_s_s[] -------- */
      Bool isD    = (size & 1) == 1;
      Bool isMULX = bitU == 1;
      UInt index;
      if      (!isD)             index = (bitH << 1) | bitL;
      else if (isD && bitL == 0) index = bitH;
      else return False; // sz:L == x11 => unallocated encoding
      vassert(index < (isD ? 2 : 4));
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      IRTemp elem  = newTemp(ity);
      UInt   mm    = (bitM << 4) | mmLO4;
      assign(elem, getQRegLane(mm, index, ity));
      IRTemp dupd  = math_DUP_TO_V128(elem, ity);
      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
      IRTemp rm    = mk_get_IR_rounding_mode();
      IRTemp t1    = newTempV128();
      // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
      putQReg128(dd,
                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
                                                         mkexpr(t1))));
      const HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, %s.%c[%u]\n", isMULX ? "fmulx" : "fmul",
          c, dd, c, nn, nameQReg128(mm), c, index);
      return True;
   }

   if (bitU == 0
       && (opcode == BITS4(1,0,1,1)
           || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
      /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
      /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
      /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
      /* Widens, and size refers to the narrowed lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,0,1,1): ks = 0; break;
         case BITS4(0,0,1,1): ks = 1; break;
         case BITS4(0,1,1,1): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      UInt mm  = 32; // invalid
      UInt ix  = 16; // invalid
      switch (size) {
         case X00:
            return False; // h_b_b[] case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q_d_d[] case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
      vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
      newTempsV128_2(&vecN, &vecD);
      assign(vecN, getQReg128(nn));
      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      assign(vecD, getQReg128(dd));
      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
                       False/*!is2*/, size, "mas"[ks],
                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      IROp opZHI = mkVecZEROHIxxOFV128(size+1);
      putQReg128(dd, unop(opZHI, mkexpr(res)));
      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
         updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
      }
      const HChar* nm        = ks == 0 ? "sqdmull"
                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
      const HChar  arrNarrow = "bhsd"[size];
      const HChar  arrWide   = "bhsd"[size+1];
      DIP("%s %c%u, %c%u, v%u.%c[%u]\n",
          nm, arrWide, dd, arrNarrow, nn, mm, arrNarrow, ix);
      return True;
   }

   if (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1)) {
      /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
      /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
      UInt mm  = 32; // invalid
      UInt ix  = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      Bool isR = opcode == BITS4(1,1,0,1);
      IRTemp res, sat1q, sat1n, vN, vM;
      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
      vN = newTempV128();
      assign(vN, getQReg128(nn));
      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
      IROp opZHI = mkVecZEROHIxxOFV128(size);
      putQReg128(dd, unop(opZHI, mkexpr(res)));
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
      HChar ch         = size == X01 ? 'h' : 's';
      DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, (Int)mm, ch, ix);
      return True;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31    28     22   18   15     10 9 4
      0 q u 011110 immh immb opcode 1  n d
      Decode fields: u,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,23) != BITS6(0,1,1,1,1,0) || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt immh   = INSN(22,19);
   UInt immb   = INSN(18,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0)) {
      /* -------- 0,00000 SSHR std7_std7_#imm -------- */
      /* -------- 1,00000 USHR std7_std7_#imm -------- */
      /* -------- 0,00010 SSRA std7_std7_#imm -------- */
      /* -------- 1,00010 USRA std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, SHR:8-xxx
                         001x:xxx -> H, SHR:16-xxxx
                         01xx:xxx -> S, SHR:32-xxxxx
                         1xxx:xxx -> D, SHR:64-xxxxxx
                         other    -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool isU   = bitU == 1;
      Bool isAcc = opcode == BITS5(0,0,0,1,0);
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      UInt lanebits = 8 << size;
      vassert(shift >= 1 && shift <= lanebits);
      IROp    op  = isU ? mkVecSHRN(size) : mkVecSARN(size);
      IRExpr* src = getQReg128(nn);
      IRTemp  shf = newTempV128();
      IRTemp  res = newTempV128();
      if (shift == lanebits && isU) {
         assign(shf, mkV128(0x0000));
      } else {
         UInt nudge = 0;
         if (shift == lanebits) {
            vassert(!isU);
            nudge = 1;
         }
         assign(shf, binop(op, src, mkU8(shift - nudge)));
      }
      assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
                        : mkexpr(shf));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
                              : (isU ? "ushr" : "sshr");
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
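   /* Reference sketch (not part of the decoder): per-lane semantics of the
      accumulating forms above, signed 32-bit case; the helper name and
      <stdint.h> types are illustrative only.

         #include <stdint.h>
         int32_t ref_ssra32 ( int32_t d, int32_t n, unsigned sh ) {
            // sh is 1..32; the decoder nudges 32 down to 31, which is
            // equivalent for an arithmetic shift (lane fills with sign bits)
            if (sh > 31) sh = 31;
            return d + (n >> sh);
         }

      USRA is the same with unsigned types, except that a shift by the
      full lane width yields zero -- the shift==lanebits-and-isU case
      handled specially above. */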

   if (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0)) {
      /* -------- 0,00100 SRSHR std7_std7_#imm -------- */
      /* -------- 1,00100 URSHR std7_std7_#imm -------- */
      /* -------- 0,00110 SRSRA std7_std7_#imm -------- */
      /* -------- 1,00110 URSRA std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, SHR:8-xxx
                         001x:xxx -> H, SHR:16-xxxx
                         01xx:xxx -> S, SHR:32-xxxxx
                         1xxx:xxx -> D, SHR:64-xxxxxx
                         other    -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool isU   = bitU == 1;
      Bool isAcc = opcode == BITS5(0,0,1,1,0);
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      UInt lanebits = 8 << size;
      vassert(shift >= 1 && shift <= lanebits);
      IROp    op   = isU ? mkVecRSHU(size) : mkVecRSHS(size);
      IRExpr* src  = getQReg128(nn);
      IRTemp  imm8 = newTemp(Ity_I8);
      assign(imm8, mkU8((UChar)(-shift)));
      IRExpr* amt  = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
      IRTemp  shf  = newTempV128();
      IRTemp  res  = newTempV128();
      assign(shf, binop(op, src, amt));
      assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
                        : mkexpr(shf));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
                              : (isU ? "urshr" : "srshr");
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
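   /* Reference sketch (not part of the decoder): the rounding forms add
      half of the final ULP before shifting.  Signed 32-bit case, with
      illustrative names:

         #include <stdint.h>
         int32_t ref_srshr32 ( int32_t n, unsigned sh ) {
            // sh is 1..32; done in 64 bits so the +rounding can't overflow
            int64_t t = (int64_t)n + (1LL << (sh - 1));
            return (int32_t)(t >> sh);
         }

      At the IR level this is expressed as a rounding shift by -sh, hence
      the mkU8((UChar)(-shift)) above. */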

   if (bitU == 1 && opcode == BITS5(0,1,0,0,0)) {
      /* -------- 1,01000 SRI std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, SHR:8-xxx
                         001x:xxx -> H, SHR:16-xxxx
                         01xx:xxx -> S, SHR:32-xxxxx
                         1xxx:xxx -> D, SHR:64-xxxxxx
                         other    -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      UInt lanebits = 8 << size;
      vassert(shift >= 1 && shift <= lanebits);
      IRExpr* src = getQReg128(nn);
      IRTemp  res = newTempV128();
      if (shift == lanebits) {
         assign(res, getQReg128(dd));
      } else {
         assign(res, binop(mkVecSHRN(size), src, mkU8(shift)));
         IRExpr* nmask = binop(mkVecSHLN(size),
                               mkV128(0xFFFF), mkU8(lanebits - shift));
         IRTemp  tmp   = newTempV128();
         assign(tmp, binop(Iop_OrV128,
                           mkexpr(res),
                           binop(Iop_AndV128, getQReg128(dd), nmask)));
         res = tmp;
      }
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", "sri",
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
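   /* Reference sketch (not part of the decoder): SRI shifts n right and
      inserts it under d, keeping the top 'sh' bits of d.  64-bit lane,
      illustrative names:

         #include <stdint.h>
         uint64_t ref_sri64 ( uint64_t d, uint64_t n, unsigned sh ) {
            if (sh == 64) return d;               // nothing of n survives
            uint64_t mask = ~0ULL << (64 - sh);   // top sh bits of d kept
            return (n >> sh) | (d & mask);
         }

      which matches the nmask construction above. */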

   if (opcode == BITS5(0,1,0,1,0)) {
      /* -------- 0,01010 SHL std7_std7_#imm -------- */
      /* -------- 1,01010 SLI std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, xxx
                         001x:xxx -> H, xxxx
                         01xx:xxx -> S, xxxxx
                         1xxx:xxx -> D, xxxxxx
                         other    -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isSLI = bitU == 1;
      Bool isQ   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      /* The shift encoding has opposite sign for the leftwards case.
         Adjust shift to compensate. */
      UInt lanebits = 8 << size;
      shift = lanebits - shift;
      vassert(shift >= 0 && shift < lanebits);
      IROp    op  = mkVecSHLN(size);
      IRExpr* src = getQReg128(nn);
      IRTemp  res = newTempV128();
      if (shift == 0) {
         assign(res, src);
      } else {
         assign(res, binop(op, src, mkU8(shift)));
         if (isSLI) {
            IRExpr* nmask = binop(mkVecSHRN(size),
                                  mkV128(0xFFFF), mkU8(lanebits - shift));
            IRTemp  tmp   = newTempV128();
            assign(tmp, binop(Iop_OrV128,
                              mkexpr(res),
                              binop(Iop_AndV128, getQReg128(dd), nmask)));
            res = tmp;
         }
      }
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      const HChar* nm = isSLI ? "sli" : "shl";
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
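   /* Reference sketch (not part of the decoder): SLI is the left-shifting
      counterpart of SRI, keeping the low 'sh' bits of d.  64-bit lane,
      illustrative names:

         #include <stdint.h>
         uint64_t ref_sli64 ( uint64_t d, uint64_t n, unsigned sh ) {
            if (sh == 0) return n;                // plain copy, as above
            uint64_t mask = ~0ULL >> (64 - sh);   // low sh bits of d kept
            return (n << sh) | (d & mask);
         }
   */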

   if (opcode == BITS5(0,1,1,1,0)
       || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
      /* -------- 0,01110  SQSHL  std7_std7_#imm -------- */
      /* -------- 1,01110  UQSHL  std7_std7_#imm -------- */
      /* -------- 1,01100  SQSHLU std7_std7_#imm -------- */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      /* The shift encoding has opposite sign for the leftwards case.
         Adjust shift to compensate. */
      UInt lanebits = 8 << size;
      shift = lanebits - shift;
      vassert(shift >= 0 && shift < lanebits);
      const HChar* nm = NULL;
      /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
      else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
      else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
      else vassert(0);
      IRTemp qDiff1 = IRTemp_INVALID;
      IRTemp qDiff2 = IRTemp_INVALID;
      IRTemp res = IRTemp_INVALID;
      IRTemp src = newTempV128();
      assign(src, getQReg128(nn));
      math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2,
                                    isQ ? Iop_INVALID : Iop_ZeroHI64ofV128);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, #%u\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, shift);
      return True;
   }

   if (bitU == 0
       && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
      /* -------- 0,10000  SHRN{,2} #imm -------- */
      /* -------- 0,10001 RSHRN{,2} #imm -------- */
      /* Narrows, and size is the narrow size. */
      UInt size  = 0;
      UInt shift = 0;
      Bool is2   = bitQ == 1;
      Bool isR   = opcode == BITS5(1,0,0,0,1);
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || size == X11) return False;
      vassert(shift >= 1);
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      IRTemp t3 = newTempV128();
      assign(t1, getQReg128(nn));
      assign(t2, isR ? binop(mkVecADD(size+1),
                             mkexpr(t1),
                             mkexpr(math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1))))
                     : mkexpr(t1));
      assign(t3, binop(mkVecSHRN(size+1), mkexpr(t2), mkU8(shift)));
      IRTemp t4 = math_NARROW_LANES(t3, t3, size);
      putLO64andZUorPutHI64(is2, dd, t4);
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      DIP("%s %s.%s, %s.%s, #%u\n", isR ? "rshrn" : "shrn",
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
      return True;
   }
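   /* Worked example (illustrative): RSHRN from 32- to 16-bit lanes with
      shift 8 computes ((x + 0x80) >> 8) & 0xFFFF per lane, e.g.

         rshrn16(0x000137FF, 8) == 0x0138   // +0x80 rounds 0x37FF up
         shrn16 (0x000137FF, 8) == 0x0137   // truncating form

      math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1)) above builds the per-lane
      rounding constant. */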

   if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
       || (bitU == 1
           && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
      /* -------- 0,10010   SQSHRN{,2} #imm -------- */
      /* -------- 1,10010   UQSHRN{,2} #imm -------- */
      /* -------- 0,10011  SQRSHRN{,2} #imm -------- */
      /* -------- 1,10011  UQRSHRN{,2} #imm -------- */
      /* -------- 1,10000  SQSHRUN{,2} #imm -------- */
      /* -------- 1,10001 SQRSHRUN{,2} #imm -------- */
      UInt size  = 0;
      UInt shift = 0;
      Bool is2   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || size == X11) return False;
      vassert(shift >= 1 && shift <= (8 << size));
      const HChar* nm = "??";
      IROp op = Iop_INVALID;
      /* Decide on the name and the operation. */
      /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
         nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
         nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
      }
      else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
         nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
         nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
         nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
         nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
      }
      else vassert(0);
      /* Compute the result (Q, shifted value) pair. */
      IRTemp src128 = newTempV128();
      assign(src128, getQReg128(nn));
      IRTemp pair = newTempV128();
      assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
      /* Update the result reg */
      IRTemp res64in128 = newTempV128();
      assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
      putLO64andZUorPutHI64(is2, dd, res64in128);
      /* Update the Q flag. */
      IRTemp q64q64 = newTempV128();
      assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
      IRTemp z128 = newTempV128();
      assign(z128, mkV128(0x0000));
      updateQCFLAGwithDifference(q64q64, z128);
      /* */
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      DIP("%s %s.%s, %s.%s, #%u\n", nm,
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
      return True;
   }

   if (opcode == BITS5(1,0,1,0,0)) {
      /* -------- 0,10100 SSHLL{,2} #imm -------- */
      /* -------- 1,10100 USHLL{,2} #imm -------- */
      /* 31  28     22   18   15     9 4
         0q0 011110 immh immb 101001 n d  SSHLL Vd.Ta, Vn.Tb, #sh
         0q1 011110 immh immb 101001 n d  USHLL Vd.Ta, Vn.Tb, #sh
         where Ta,Tb,sh
           = case immh of 1xxx -> invalid
                          01xx -> 2d, 2s(q0)/4s(q1),  immh:immb - 32 (0..31)
                          001x -> 4s, 4h(q0)/8h(q1),  immh:immb - 16 (0..15)
                          0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8  (0..7)
                          0000 -> AdvSIMD modified immediate (???)
      */
      Bool    isQ   = bitQ == 1;
      Bool    isU   = bitU == 1;
      UInt    immhb = (immh << 3) | immb;
      IRTemp  src   = newTempV128();
      IRTemp  zero  = newTempV128();
      IRExpr* res   = NULL;
      UInt    sh    = 0;
      const HChar* ta = "??";
      const HChar* tb = "??";
      assign(src, getQReg128(nn));
      assign(zero, mkV128(0x0000));
      if (immh & 8) {
         /* invalid; don't assign to res */
      }
      else if (immh & 4) {
         sh = immhb - 32;
         vassert(sh < 32); /* so 32-sh is 1..32 */
         ta = "2d";
         tb = isQ ? "4s" : "2s";
         IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero)
                           : mk_InterleaveLO32x4(src, zero);
         res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh));
      }
      else if (immh & 2) {
         sh = immhb - 16;
         vassert(sh < 16); /* so 16-sh is 1..16 */
         ta = "4s";
         tb = isQ ? "8h" : "4h";
         IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero)
                           : mk_InterleaveLO16x8(src, zero);
         res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh));
      }
      else if (immh & 1) {
         sh = immhb - 8;
         vassert(sh < 8); /* so 8-sh is 1..8 */
         ta = "8h";
         tb = isQ ? "16b" : "8b";
         IRExpr* tmp = isQ ? mk_InterleaveHI8x16(src, zero)
                           : mk_InterleaveLO8x16(src, zero);
         res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh));
      } else {
         vassert(immh == 0);
         /* invalid; don't assign to res */
      }
      /* */
      if (res) {
         putQReg128(dd, res);
         DIP("%cshll%s %s.%s, %s.%s, #%u\n",
             isU ? 'u' : 's', isQ ? "2" : "",
             nameQReg128(dd), ta, nameQReg128(nn), tb, sh);
         return True;
      }
      return False;
   }
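   /* Worked example (illustrative) of the interleave-then-shift trick
      above.  To widen a 16-bit lane x to 32 bits shifted left by sh, the
      lane is placed in the top half of a 32-bit lane by interleaving with
      zero, then the whole lane is shifted arithmetically right by 16-sh:

         x = 0xFF80 (-128), sh = 4:
         after interleave with zero -> 0xFF800000
         SarN32x4 by 12             -> 0xFFFFF800 == -2048 == -128 << 4

      so sign extension (or zero extension, via USHLL's ShrN ops) and the
      left shift both fall out of a single shift. */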

   if (opcode == BITS5(1,1,1,0,0)) {
      /* -------- 0,11100 SCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
      /* -------- 1,11100 UCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
      /* If immh is of the form 00xx, the insn is invalid. */
      if (immh < BITS4(0,1,0,0)) return False;
      UInt size  = 0;
      UInt fbits = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
      /* The following holds because immh is never zero. */
      vassert(ok);
      /* The following holds because immh >= 0100. */
      vassert(size == X10 || size == X11);
      Bool isD = size == X11;
      Bool isU = bitU == 1;
      Bool isQ = bitQ == 1;
      if (isD && !isQ) return False; /* reject .1d case */
      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
      Double  scale  = two_to_the_minus(fbits);
      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
                           : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp    opMUL  = isD ? Iop_MulF64 : Iop_MulF32;
      IROp    opCVT  = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
                           : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
      vassert(nLanes == 2 || nLanes == 4);
      for (UInt i = 0; i < nLanes; i++) {
         IRTemp src = newTemp(tyI);
         IRTemp res = newTemp(tyF);
         IRTemp rm  = mk_get_IR_rounding_mode();
         assign(src, getQRegLane(nn, i, tyI));
         assign(res, triop(opMUL, mkexpr(rm),
                                  binop(opCVT, mkexpr(rm), mkexpr(src)),
                                  scaleE));
         putQRegLane(dd, i, mkexpr(res));
      }
      if (!isQ) {
         putQRegLane(dd, 1, mkU64(0));
      }
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, #%u\n", isU ? "ucvtf" : "scvtf",
          nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
      return True;
   }
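   /* Worked equation (illustrative): the fixed-point convert computes
      real = (float)intval * 2^-fbits.  E.g. scvtf with fbits == 8 maps
      the 32-bit integer 0x00000180 (384) to 384/256 == 1.5; in effect the
      input is reinterpreted as a Q24.8 fixed-point number.  FCVTZS/FCVTZU
      below invert this, multiplying by 2^+fbits before converting. */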

   if (opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,11111 FCVTZS {2d_2d,4s_4s,2s_2s}_imm -------- */
      /* -------- 1,11111 FCVTZU {2d_2d,4s_4s,2s_2s}_imm -------- */
      /* If immh is of the form 00xx, the insn is invalid. */
      if (immh < BITS4(0,1,0,0)) return False;
      UInt size  = 0;
      UInt fbits = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
      /* The following holds because immh is never zero. */
      vassert(ok);
      /* The following holds because immh >= 0100. */
      vassert(size == X10 || size == X11);
      Bool isD = size == X11;
      Bool isU = bitU == 1;
      Bool isQ = bitQ == 1;
      if (isD && !isQ) return False; /* reject .1d case */
      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
      Double  scale  = two_to_the_plus(fbits);
      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
                           : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp    opMUL  = isD ? Iop_MulF64 : Iop_MulF32;
      IROp    opCVT  = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
                           : (isD ? Iop_F64toI64S : Iop_F32toI32S);
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
      vassert(nLanes == 2 || nLanes == 4);
      for (UInt i = 0; i < nLanes; i++) {
         IRTemp src = newTemp(tyF);
         IRTemp res = newTemp(tyI);
         IRTemp rm  = newTemp(Ity_I32);
         assign(src, getQRegLane(nn, i, tyF));
         assign(rm,  mkU32(Irrm_ZERO));
         assign(res, binop(opCVT, mkexpr(rm),
                                  triop(opMUL, mkexpr(rm),
                                               mkexpr(src), scaleE)));
         putQRegLane(dd, i, mkexpr(res));
      }
      if (!isQ) {
         putQRegLane(dd, 1, mkU64(0));
      }
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, #%u\n", isU ? "fcvtzu" : "fcvtzs",
          nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
      return True;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20 15     11 9 4
      0  Q  U  01110 size 1  m  opcode 00 n d
      Decode fields: u,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(11,10) != BITS2(0,0)) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);
   Bool is2    = bitQ == 1;

   if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) {
      /* -------- 0,0000 SADDL{2} -------- */
      /* -------- 1,0000 UADDL{2} -------- */
      /* -------- 0,0010 SSUBL{2} -------- */
      /* -------- 1,0010 USUBL{2} -------- */
      /* Widens, and size refers to the narrow lanes. */
      if (size == X11) return False;
      vassert(size <= 2);
      Bool   isU   = bitU == 1;
      Bool   isADD = opcode == BITS4(0,0,0,0);
      IRTemp argL  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
      IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
      IRTemp res   = newTempV128();
      assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
                        mkexpr(argL), mkexpr(argR)));
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      const HChar* nm        = isADD ? (isU ? "uaddl" : "saddl")
                                     : (isU ? "usubl" : "ssubl");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }
11214
11215   if (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,0,1,1)) {
11216      /* -------- 0,0001 SADDW{2} -------- */
11217      /* -------- 1,0001 UADDW{2} -------- */
11218      /* -------- 0,0011 SSUBW{2} -------- */
11219      /* -------- 1,0011 USUBW{2} -------- */
11220      /* Widens, and size refers to the narrow lanes. */
11221      if (size == X11) return False;
11222      vassert(size <= 2);
11223      Bool   isU   = bitU == 1;
11224      Bool   isADD = opcode == BITS4(0,0,0,1);
11225      IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11226      IRTemp res   = newTempV128();
11227      assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11228                        getQReg128(nn), mkexpr(argR)));
11229      putQReg128(dd, mkexpr(res));
11230      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11231      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
11232      const HChar* nm        = isADD ? (isU ? "uaddw" : "saddw")
11233                                     : (isU ? "usubw" : "ssubw");
11234      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11235          nameQReg128(dd), arrWide,
11236          nameQReg128(nn), arrWide, nameQReg128(mm), arrNarrow);
11237      return True;
11238   }
11239
11240   if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
11241      /* -------- 0,0100  ADDHN{2} -------- */
11242      /* -------- 1,0100 RADDHN{2} -------- */
11243      /* -------- 0,0110  SUBHN{2} -------- */
11244      /* -------- 1,0110 RSUBHN{2} -------- */
11245      /* Narrows, and size refers to the narrowed lanes. */
11246      if (size == X11) return False;
11247      vassert(size <= 2);
11248      const UInt shift[3] = { 8, 16, 32 };
11249      Bool isADD = opcode == BITS4(0,1,0,0);
11250      Bool isR   = bitU == 1;
11251      /* Combined elements in wide lanes */
11252      IRTemp  wide  = newTempV128();
11253      IRExpr* wideE = binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11254                            getQReg128(nn), getQReg128(mm));
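      /* For the rounding variants, bias the wide result by 2^(shift-1),
         that is, half the weight of the discarded low half, before
         taking the top half below. */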
11255      if (isR) {
11256         wideE = binop(mkVecADD(size+1),
11257                       wideE,
11258                       mkexpr(math_VEC_DUP_IMM(size+1,
11259                                               1ULL << (shift[size]-1))));
11260      }
11261      assign(wide, wideE);
11262      /* Top halves of elements, still in wide lanes */
11263      IRTemp shrd = newTempV128();
11264      assign(shrd, binop(mkVecSHRN(size+1), mkexpr(wide), mkU8(shift[size])));
11265      /* Elements now compacted into lower 64 bits */
11266      IRTemp new64 = newTempV128();
11267      assign(new64, binop(mkVecCATEVENLANES(size), mkexpr(shrd), mkexpr(shrd)));
11268      putLO64andZUorPutHI64(is2, dd, new64);
11269      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11270      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
11271      const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
11272                              : (isR ? "rsubhn" : "subhn");
11273      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11274          nameQReg128(dd), arrNarrow,
11275          nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
11276      return True;
11277   }
11278
11279   if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) {
11280      /* -------- 0,0101 SABAL{2} -------- */
11281      /* -------- 1,0101 UABAL{2} -------- */
11282      /* -------- 0,0111 SABDL{2} -------- */
11283      /* -------- 1,0111 UABDL{2} -------- */
11284      /* Widens, and size refers to the narrow lanes. */
11285      if (size == X11) return False;
11286      vassert(size <= 2);
11287      Bool   isU   = bitU == 1;
11288      Bool   isACC = opcode == BITS4(0,1,0,1);
11289      IRTemp argL  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
11290      IRTemp argR  = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11291      IRTemp abd   = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
11292      IRTemp res   = newTempV128();
11293      assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(abd), getQReg128(dd))
11294                        : mkexpr(abd));
11295      putQReg128(dd, mkexpr(res));
11296      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11297      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
11298      const HChar* nm        = isACC ? (isU ? "uabal" : "sabal")
11299                                     : (isU ? "uabdl" : "sabdl");
11300      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11301          nameQReg128(dd), arrWide,
11302          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11303      return True;
11304   }
11305
11306   if (opcode == BITS4(1,1,0,0)
11307       || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
11308      /* -------- 0,1100  SMULL{2} -------- */ // 0 (ks)
11309      /* -------- 1,1100  UMULL{2} -------- */ // 0
11310      /* -------- 0,1000  SMLAL{2} -------- */ // 1
11311      /* -------- 1,1000  UMLAL{2} -------- */ // 1
11312      /* -------- 0,1010  SMLSL{2} -------- */ // 2
11313      /* -------- 1,1010  UMLSL{2} -------- */ // 2
11314      /* Widens, and size refers to the narrow lanes. */
11315      UInt ks = 3;
11316      switch (opcode) {
11317         case BITS4(1,1,0,0): ks = 0; break;
11318         case BITS4(1,0,0,0): ks = 1; break;
11319         case BITS4(1,0,1,0): ks = 2; break;
11320         default: vassert(0);
11321      }
      vassert(ks <= 2);
11323      if (size == X11) return False;
11324      vassert(size <= 2);
11325      Bool   isU  = bitU == 1;
11326      IRTemp vecN = newTempV128();
11327      IRTemp vecM = newTempV128();
11328      IRTemp vecD = newTempV128();
11329      assign(vecN, getQReg128(nn));
11330      assign(vecM, getQReg128(mm));
11331      assign(vecD, getQReg128(dd));
11332      IRTemp res = IRTemp_INVALID;
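      /* "mas"[ks] selects the operation kind passed to math_MULL_ACC:
         'm' (plain multiply), 'a' (accumulate) or 's' (subtract). */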
11333      math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
11334                    vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11335      putQReg128(dd, mkexpr(res));
11336      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11337      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
11338      const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
11339      DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
11340          nameQReg128(dd), arrWide,
11341          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11342      return True;
11343   }
11344
11345   if (bitU == 0
11346       && (opcode == BITS4(1,1,0,1)
11347           || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
11348      /* -------- 0,1101  SQDMULL{2} -------- */ // 0 (ks)
11349      /* -------- 0,1001  SQDMLAL{2} -------- */ // 1
11350      /* -------- 0,1011  SQDMLSL{2} -------- */ // 2
11351      /* Widens, and size refers to the narrow lanes. */
11352      UInt ks = 3;
11353      switch (opcode) {
11354         case BITS4(1,1,0,1): ks = 0; break;
11355         case BITS4(1,0,0,1): ks = 1; break;
11356         case BITS4(1,0,1,1): ks = 2; break;
11357         default: vassert(0);
11358      }
      vassert(ks <= 2);
11360      if (size == X00 || size == X11) return False;
11361      vassert(size <= 2);
11362      IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
11363      vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
11364      newTempsV128_3(&vecN, &vecM, &vecD);
11365      assign(vecN, getQReg128(nn));
11366      assign(vecM, getQReg128(mm));
11367      assign(vecD, getQReg128(dd));
11368      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
11369                       is2, size, "mas"[ks],
11370                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11371      putQReg128(dd, mkexpr(res));
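      /* sat1q/sat1n report saturation from the doubling multiply;
         sat2q/sat2n are valid only for the accumulating variants and
         report saturation from the subsequent add/subtract. */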
11372      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
11373      updateQCFLAGwithDifference(sat1q, sat1n);
11374      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
11375         updateQCFLAGwithDifference(sat2q, sat2n);
11376      }
11377      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11378      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
11379      const HChar* nm        = ks == 0 ? "sqdmull"
11380                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
11381      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11382          nameQReg128(dd), arrWide,
11383          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11384      return True;
11385   }
11386
11387   if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
11388      /* -------- 0,1110  PMULL{2} -------- */
11389      /* Widens, and size refers to the narrow lanes. */
11390      if (size != X00 && size != X11) return False;
11391      IRTemp  res  = IRTemp_INVALID;
11392      IRExpr* srcN = getQReg128(nn);
11393      IRExpr* srcM = getQReg128(mm);
11394      const HChar* arrNarrow = NULL;
11395      const HChar* arrWide   = NULL;
11396      if (size == X00) {
11397         res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
11398                                         srcN, srcM);
11399         arrNarrow = nameArr_Q_SZ(bitQ, size);
11400         arrWide   = nameArr_Q_SZ(1,    size+1);
11401      } else {
11402         /* The same thing as the X00 case, except we have to call
11403            a helper to do it. */
11404         vassert(size == X11);
11405         res = newTemp(Ity_V128);
11406         IROp slice
11407            = is2 ? Iop_V128HIto64 : Iop_V128to64;
11408         IRExpr** args
11409            = mkIRExprVec_3( IRExpr_VECRET(),
11410                             unop(slice, srcN), unop(slice, srcM));
11411         IRDirty* di
11412            = unsafeIRDirty_1_N( res, 0/*regparms*/,
11413                                      "arm64g_dirtyhelper_PMULLQ",
11414                                      &arm64g_dirtyhelper_PMULLQ, args);
11415         stmt(IRStmt_Dirty(di));
11416         /* We can't use nameArr_Q_SZ for this because it can't deal with
11417            Q-sized (128 bit) results.  Hence do it by hand. */
11418         arrNarrow = bitQ == 0 ? "1d" : "2d";
11419         arrWide   = "1q";
11420      }
11421      putQReg128(dd, mkexpr(res));
11422      DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "",
11423          nameQReg128(dd), arrWide,
11424          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11425      return True;
11426   }
11427
11428   return False;
11429#  undef INSN
11430}
11431
11432
11433static
11434Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
11435{
11436   /* 31 30 29 28    23   21 20 15     10 9 4
11437      0  Q  U  01110 size 1  m  opcode 1  n d
11438      Decode fields: u,size,opcode
11439   */
11440#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
11441   if (INSN(31,31) != 0
11442       || INSN(28,24) != BITS5(0,1,1,1,0)
11443       || INSN(21,21) != 1
11444       || INSN(10,10) != 1) {
11445      return False;
11446   }
11447   UInt bitQ   = INSN(30,30);
11448   UInt bitU   = INSN(29,29);
11449   UInt size   = INSN(23,22);
11450   UInt mm     = INSN(20,16);
11451   UInt opcode = INSN(15,11);
11452   UInt nn     = INSN(9,5);
11453   UInt dd     = INSN(4,0);
11454   vassert(size < 4);
11455
11456   if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,1,0,0)) {
11457      /* -------- 0,xx,00000 SHADD std6_std6_std6 -------- */
11458      /* -------- 1,xx,00000 UHADD std6_std6_std6 -------- */
11459      /* -------- 0,xx,00100 SHSUB std6_std6_std6 -------- */
11460      /* -------- 1,xx,00100 UHSUB std6_std6_std6 -------- */
11461      if (size == X11) return False;
11462      Bool isADD = opcode == BITS5(0,0,0,0,0);
11463      Bool isU   = bitU == 1;
11464      /* Widen both args out, do the math, narrow to final result. */
11465      IRTemp argL   = newTempV128();
11466      IRTemp argLhi = IRTemp_INVALID;
11467      IRTemp argLlo = IRTemp_INVALID;
11468      IRTemp argR   = newTempV128();
11469      IRTemp argRhi = IRTemp_INVALID;
11470      IRTemp argRlo = IRTemp_INVALID;
11471      IRTemp resHi  = newTempV128();
11472      IRTemp resLo  = newTempV128();
11473      IRTemp res    = IRTemp_INVALID;
11474      assign(argL, getQReg128(nn));
11475      argLlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argL));
11476      argLhi = math_WIDEN_LO_OR_HI_LANES(isU, True,  size, mkexpr(argL));
11477      assign(argR, getQReg128(mm));
11478      argRlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argR));
11479      argRhi = math_WIDEN_LO_OR_HI_LANES(isU, True,  size, mkexpr(argR));
11480      IROp opADDSUB = isADD ? mkVecADD(size+1) : mkVecSUB(size+1);
11481      IROp opSxR = isU ? mkVecSHRN(size+1) : mkVecSARN(size+1);
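      /* Halving: do the add/sub at double width, then shift right by 1
         (arithmetically, in the signed case), so the intermediate value
         can never wrap. */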
11482      assign(resHi, binop(opSxR,
11483                          binop(opADDSUB, mkexpr(argLhi), mkexpr(argRhi)),
11484                          mkU8(1)));
11485      assign(resLo, binop(opSxR,
11486                          binop(opADDSUB, mkexpr(argLlo), mkexpr(argRlo)),
11487                          mkU8(1)));
11488      res = math_NARROW_LANES ( resHi, resLo, size );
11489      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11490      const HChar* nm  = isADD ? (isU ? "uhadd" : "shadd")
11491                               : (isU ? "uhsub" : "shsub");
11492      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11493      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11494          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11495      return True;
11496   }
11497
11498   if (opcode == BITS5(0,0,0,1,0)) {
11499      /* -------- 0,xx,00010 SRHADD std7_std7_std7 -------- */
11500      /* -------- 1,xx,00010 URHADD std7_std7_std7 -------- */
11501      if (bitQ == 0 && size == X11) return False; // implied 1d case
11502      Bool   isU  = bitU == 1;
11503      IRTemp argL = newTempV128();
11504      IRTemp argR = newTempV128();
11505      assign(argL, getQReg128(nn));
11506      assign(argR, getQReg128(mm));
11507      IRTemp res = math_RHADD(size, isU, argL, argR);
11508      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11509      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11510      DIP("%s %s.%s, %s.%s, %s.%s\n", isU ? "urhadd" : "srhadd",
11511          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11512      return True;
11513   }
11514
11515   if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
11516      /* -------- 0,xx,00001 SQADD std7_std7_std7 -------- */
11517      /* -------- 1,xx,00001 UQADD std7_std7_std7 -------- */
11518      /* -------- 0,xx,00101 SQSUB std7_std7_std7 -------- */
11519      /* -------- 1,xx,00101 UQSUB std7_std7_std7 -------- */
11520      if (bitQ == 0 && size == X11) return False; // implied 1d case
11521      Bool isADD = opcode == BITS5(0,0,0,0,1);
11522      Bool isU   = bitU == 1;
11523      IROp qop   = Iop_INVALID;
11524      IROp nop   = Iop_INVALID;
11525      if (isADD) {
11526         qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
11527         nop = mkVecADD(size);
11528      } else {
11529         qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
11530         nop = mkVecSUB(size);
11531      }
11532      IRTemp argL = newTempV128();
11533      IRTemp argR = newTempV128();
11534      IRTemp qres = newTempV128();
11535      IRTemp nres = newTempV128();
11536      assign(argL, getQReg128(nn));
11537      assign(argR, getQReg128(mm));
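      /* Compute both the saturating and the plain result; QCFLAG must
         be set iff they differ. */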
11538      assign(qres, math_MAYBE_ZERO_HI64_fromE(
11539                      bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
11540      assign(nres, math_MAYBE_ZERO_HI64_fromE(
11541                      bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
11542      putQReg128(dd, mkexpr(qres));
11543      updateQCFLAGwithDifference(qres, nres);
11544      const HChar* nm  = isADD ? (isU ? "uqadd" : "sqadd")
11545                               : (isU ? "uqsub" : "sqsub");
11546      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11547      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11548          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11549      return True;
11550   }
11551
11552   if (bitU == 0 && opcode == BITS5(0,0,0,1,1)) {
11553      /* -------- 0,00,00011 AND 16b_16b_16b, 8b_8b_8b -------- */
11554      /* -------- 0,01,00011 BIC 16b_16b_16b, 8b_8b_8b -------- */
11555      /* -------- 0,10,00011 ORR 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 0,11,00011 ORN 16b_16b_16b, 8b_8b_8b -------- */
11557      Bool   isORx  = (size & 2) == 2;
11558      Bool   invert = (size & 1) == 1;
11559      IRTemp res    = newTempV128();
11560      assign(res, binop(isORx ? Iop_OrV128 : Iop_AndV128,
11561                        getQReg128(nn),
11562                        invert ? unop(Iop_NotV128, getQReg128(mm))
11563                               : getQReg128(mm)));
11564      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11565      const HChar* names[4] = { "and", "bic", "orr", "orn" };
11566      const HChar* ar = bitQ == 1 ? "16b" : "8b";
11567      DIP("%s %s.%s, %s.%s, %s.%s\n", names[INSN(23,22)],
11568          nameQReg128(dd), ar, nameQReg128(nn), ar, nameQReg128(mm), ar);
11569      return True;
11570   }
11571
11572   if (bitU == 1 && opcode == BITS5(0,0,0,1,1)) {
11573      /* -------- 1,00,00011 EOR 16b_16b_16b, 8b_8b_8b -------- */
11574      /* -------- 1,01,00011 BSL 16b_16b_16b, 8b_8b_8b -------- */
11575      /* -------- 1,10,00011 BIT 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 1,11,00011 BIF 16b_16b_16b, 8b_8b_8b -------- */
11577      IRTemp argD = newTempV128();
11578      IRTemp argN = newTempV128();
11579      IRTemp argM = newTempV128();
11580      assign(argD, getQReg128(dd));
11581      assign(argN, getQReg128(nn));
11582      assign(argM, getQReg128(mm));
11583      const IROp opXOR = Iop_XorV128;
11584      const IROp opAND = Iop_AndV128;
11585      const IROp opNOT = Iop_NotV128;
11586      IRTemp res = newTempV128();
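      /* The xor/and forms below are the standard mux identities:
           BSL: res = (argD & argN) | (~argD & argM)   -- argD selects
           BIT: res = (argM & argN) | (~argM & argD)   -- argM selects
           BIF: res = (~argM & argN) | (argM & argD)   -- ~argM selects
      */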
11587      switch (size) {
11588         case BITS2(0,0): /* EOR */
11589            assign(res, binop(opXOR, mkexpr(argM), mkexpr(argN)));
11590            break;
11591         case BITS2(0,1): /* BSL */
11592            assign(res, binop(opXOR, mkexpr(argM),
11593                              binop(opAND,
11594                                    binop(opXOR, mkexpr(argM), mkexpr(argN)),
11595                                          mkexpr(argD))));
11596            break;
11597         case BITS2(1,0): /* BIT */
11598            assign(res, binop(opXOR, mkexpr(argD),
11599                              binop(opAND,
11600                                    binop(opXOR, mkexpr(argD), mkexpr(argN)),
11601                                    mkexpr(argM))));
11602            break;
11603         case BITS2(1,1): /* BIF */
11604            assign(res, binop(opXOR, mkexpr(argD),
11605                              binop(opAND,
11606                                    binop(opXOR, mkexpr(argD), mkexpr(argN)),
11607                                    unop(opNOT, mkexpr(argM)))));
11608            break;
11609         default:
11610            vassert(0);
11611      }
11612      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11613      const HChar* nms[4] = { "eor", "bsl", "bit", "bif" };
11614      const HChar* arr = bitQ == 1 ? "16b" : "8b";
11615      DIP("%s %s.%s, %s.%s, %s.%s\n", nms[size],
11616          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11617      return True;
11618   }
11619
11620   if (opcode == BITS5(0,0,1,1,0)) {
11621      /* -------- 0,xx,00110 CMGT std7_std7_std7 -------- */ // >s
11622      /* -------- 1,xx,00110 CMHI std7_std7_std7 -------- */ // >u
11623      if (bitQ == 0 && size == X11) return False; // implied 1d case
11624      Bool   isGT  = bitU == 0;
11625      IRExpr* argL = getQReg128(nn);
11626      IRExpr* argR = getQReg128(mm);
11627      IRTemp  res  = newTempV128();
11628      assign(res,
11629             isGT ? binop(mkVecCMPGTS(size), argL, argR)
11630                  : binop(mkVecCMPGTU(size), argL, argR));
11631      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11632      const HChar* nm  = isGT ? "cmgt" : "cmhi";
11633      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11634      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11635          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11636      return True;
11637   }
11638
11639   if (opcode == BITS5(0,0,1,1,1)) {
11640      /* -------- 0,xx,00111 CMGE std7_std7_std7 -------- */ // >=s
11641      /* -------- 1,xx,00111 CMHS std7_std7_std7 -------- */ // >=u
11642      if (bitQ == 0 && size == X11) return False; // implied 1d case
11643      Bool    isGE = bitU == 0;
11644      IRExpr* argL = getQReg128(nn);
11645      IRExpr* argR = getQReg128(mm);
11646      IRTemp  res  = newTempV128();
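      /* x >= y is computed as NOT(y > x). */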
11647      assign(res,
11648             isGE ? unop(Iop_NotV128, binop(mkVecCMPGTS(size), argR, argL))
11649                  : unop(Iop_NotV128, binop(mkVecCMPGTU(size), argR, argL)));
11650      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11651      const HChar* nm  = isGE ? "cmge" : "cmhs";
11652      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11653      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11654          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11655      return True;
11656   }
11657
11658   if (opcode == BITS5(0,1,0,0,0) || opcode == BITS5(0,1,0,1,0)) {
11659      /* -------- 0,xx,01000 SSHL  std7_std7_std7 -------- */
11660      /* -------- 0,xx,01010 SRSHL std7_std7_std7 -------- */
11661      /* -------- 1,xx,01000 USHL  std7_std7_std7 -------- */
11662      /* -------- 1,xx,01010 URSHL std7_std7_std7 -------- */
11663      if (bitQ == 0 && size == X11) return False; // implied 1d case
11664      Bool isU = bitU == 1;
11665      Bool isR = opcode == BITS5(0,1,0,1,0);
11666      IROp op  = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
11667                     : (isU ? mkVecSHU(size)  : mkVecSHS(size));
11668      IRTemp res = newTempV128();
11669      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
11670      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11671      const HChar* nm  = isR ? (isU ? "urshl" : "srshl")
11672                             : (isU ? "ushl"  : "sshl");
11673      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11674      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11675          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11676      return True;
11677   }
11678
11679   if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
11680      /* -------- 0,xx,01001 SQSHL  std7_std7_std7 -------- */
11681      /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
11682      /* -------- 1,xx,01001 UQSHL  std7_std7_std7 -------- */
11683      /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */
11684      if (bitQ == 0 && size == X11) return False; // implied 1d case
11685      Bool isU = bitU == 1;
11686      Bool isR = opcode == BITS5(0,1,0,1,1);
11687      IROp op  = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
11688                     : (isU ? mkVecQANDUQSH(size)  : mkVecQANDSQSH(size));
11689      /* This is a bit tricky.  If we're only interested in the lowest 64 bits
11690         of the result (viz, bitQ == 0), then we must adjust the operands to
11691         ensure that the upper part of the result, that we don't care about,
11692         doesn't pollute the returned Q value.  To do this, zero out the upper
11693         operand halves beforehand.  This works because it means, for the
11694         lanes we don't care about, we are shifting zero by zero, which can
11695         never saturate. */
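      /* The saturating-shift primops used here return a V256: V128_0
         holds the shifted result and V128_1 is nonzero in lanes that
         saturated; comparing the latter against zero drives the QCFLAG
         update. */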
11696      IRTemp res256 = newTemp(Ity_V256);
11697      IRTemp resSH  = newTempV128();
11698      IRTemp resQ   = newTempV128();
11699      IRTemp zero   = newTempV128();
11700      assign(res256, binop(op,
11701                           math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)),
11702                           math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm))));
11703      assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
11704      assign(resQ,  unop(Iop_V256toV128_1, mkexpr(res256)));
11705      assign(zero,  mkV128(0x0000));
11706      putQReg128(dd, mkexpr(resSH));
11707      updateQCFLAGwithDifference(resQ, zero);
11708      const HChar* nm  = isR ? (isU ? "uqrshl" : "sqrshl")
11709                             : (isU ? "uqshl"  : "sqshl");
11710      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11711      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11712          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11713      return True;
11714   }
11715
11716   if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) {
11717      /* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */
11718      /* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */
11719      /* -------- 0,xx,01101 SMIN std7_std7_std7 -------- */
11720      /* -------- 1,xx,01101 UMIN std7_std7_std7 -------- */
11721      if (bitQ == 0 && size == X11) return False; // implied 1d case
11722      Bool isU   = bitU == 1;
11723      Bool isMAX = (opcode & 1) == 0;
11724      IROp op    = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
11725                         : (isU ? mkVecMINU(size) : mkVecMINS(size));
11726      IRTemp t   = newTempV128();
11727      assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
11728      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
11729      const HChar* nm = isMAX ? (isU ? "umax" : "smax")
11730                              : (isU ? "umin" : "smin");
11731      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11732      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11733          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11734      return True;
11735   }
11736
11737   if (opcode == BITS5(0,1,1,1,0) || opcode == BITS5(0,1,1,1,1)) {
11738      /* -------- 0,xx,01110 SABD std6_std6_std6 -------- */
11739      /* -------- 1,xx,01110 UABD std6_std6_std6 -------- */
11740      /* -------- 0,xx,01111 SABA std6_std6_std6 -------- */
11741      /* -------- 1,xx,01111 UABA std6_std6_std6 -------- */
11742      if (size == X11) return False; // 1d/2d cases not allowed
11743      Bool isU   = bitU == 1;
11744      Bool isACC = opcode == BITS5(0,1,1,1,1);
11745      vassert(size <= 2);
11746      IRTemp t1 = math_ABD(isU, size, getQReg128(nn), getQReg128(mm));
11747      IRTemp t2 = newTempV128();
11748      assign(t2, isACC ? binop(mkVecADD(size), mkexpr(t1), getQReg128(dd))
11749                       : mkexpr(t1));
11750      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
11751      const HChar* nm  = isACC ? (isU ? "uaba" : "saba")
11752                               : (isU ? "uabd" : "sabd");
11753      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11754      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11755          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11756      return True;
11757   }
11758
11759   if (opcode == BITS5(1,0,0,0,0)) {
11760      /* -------- 0,xx,10000 ADD std7_std7_std7 -------- */
11761      /* -------- 1,xx,10000 SUB std7_std7_std7 -------- */
11762      if (bitQ == 0 && size == X11) return False; // implied 1d case
11763      Bool   isSUB = bitU == 1;
11764      IROp   op    = isSUB ? mkVecSUB(size) : mkVecADD(size);
11765      IRTemp t     = newTempV128();
11766      assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
11767      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
11768      const HChar* nm  = isSUB ? "sub" : "add";
11769      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11770      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11771          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11772      return True;
11773   }
11774
11775   if (opcode == BITS5(1,0,0,0,1)) {
11776      /* -------- 0,xx,10001 CMTST std7_std7_std7 -------- */ // &, != 0
11777      /* -------- 1,xx,10001 CMEQ  std7_std7_std7 -------- */ // ==
11778      if (bitQ == 0 && size == X11) return False; // implied 1d case
11779      Bool    isEQ = bitU == 1;
11780      IRExpr* argL = getQReg128(nn);
11781      IRExpr* argR = getQReg128(mm);
11782      IRTemp  res  = newTempV128();
11783      assign(res,
11784             isEQ ? binop(mkVecCMPEQ(size), argL, argR)
11785                  : unop(Iop_NotV128, binop(mkVecCMPEQ(size),
11786                                            binop(Iop_AndV128, argL, argR),
11787                                            mkV128(0x0000))));
11788      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11789      const HChar* nm  = isEQ ? "cmeq" : "cmtst";
11790      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11791      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11792          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11793      return True;
11794   }
11795
11796   if (opcode == BITS5(1,0,0,1,0)) {
11797      /* -------- 0,xx,10010 MLA std7_std7_std7 -------- */
11798      /* -------- 1,xx,10010 MLS std7_std7_std7 -------- */
11799      if (bitQ == 0 && size == X11) return False; // implied 1d case
11800      Bool isMLS = bitU == 1;
11801      IROp   opMUL    = mkVecMUL(size);
11802      IROp   opADDSUB = isMLS ? mkVecSUB(size) : mkVecADD(size);
11803      IRTemp res      = newTempV128();
11804      if (opMUL != Iop_INVALID && opADDSUB != Iop_INVALID) {
11805         assign(res, binop(opADDSUB,
11806                           getQReg128(dd),
11807                           binop(opMUL, getQReg128(nn), getQReg128(mm))));
11808         putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11809         const HChar* arr = nameArr_Q_SZ(bitQ, size);
11810         DIP("%s %s.%s, %s.%s, %s.%s\n", isMLS ? "mls" : "mla",
11811             nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11812         return True;
11813      }
11814      return False;
11815   }
11816
11817   if (opcode == BITS5(1,0,0,1,1)) {
11818      /* -------- 0,xx,10011 MUL  std7_std7_std7 -------- */
11819      /* -------- 1,xx,10011 PMUL 16b_16b_16b, 8b_8b_8b -------- */
11820      if (bitQ == 0 && size == X11) return False; // implied 1d case
11821      Bool isPMUL = bitU == 1;
11822      const IROp opsPMUL[4]
11823         = { Iop_PolynomialMul8x16, Iop_INVALID, Iop_INVALID, Iop_INVALID };
11824      IROp   opMUL = isPMUL ? opsPMUL[size] : mkVecMUL(size);
11825      IRTemp res   = newTempV128();
11826      if (opMUL != Iop_INVALID) {
11827         assign(res, binop(opMUL, getQReg128(nn), getQReg128(mm)));
11828         putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11829         const HChar* arr = nameArr_Q_SZ(bitQ, size);
11830         DIP("%s %s.%s, %s.%s, %s.%s\n", isPMUL ? "pmul" : "mul",
11831             nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11832         return True;
11833      }
11834      return False;
11835   }
11836
11837   if (opcode == BITS5(1,0,1,0,0) || opcode == BITS5(1,0,1,0,1)) {
11838      /* -------- 0,xx,10100 SMAXP std6_std6_std6 -------- */
11839      /* -------- 1,xx,10100 UMAXP std6_std6_std6 -------- */
11840      /* -------- 0,xx,10101 SMINP std6_std6_std6 -------- */
11841      /* -------- 1,xx,10101 UMINP std6_std6_std6 -------- */
11842      if (size == X11) return False;
11843      Bool isU   = bitU == 1;
11844      Bool isMAX = opcode == BITS5(1,0,1,0,0);
11845      IRTemp vN  = newTempV128();
11846      IRTemp vM  = newTempV128();
11847      IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
11848                      : (isU ? mkVecMINU(size) : mkVecMINS(size));
11849      assign(vN, getQReg128(nn));
11850      assign(vM, getQReg128(mm));
11851      IRTemp res128 = newTempV128();
11852      assign(res128,
11853             binop(op,
11854                   binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
11855                   binop(mkVecCATODDLANES(size),  mkexpr(vM), mkexpr(vN))));
      /* In the half-width case, use CatEvenLanes32x4 to extract the
         half-width result from the full-width result. */
11858      IRExpr* res
11859         = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
11860                            binop(Iop_CatEvenLanes32x4, mkexpr(res128),
11861                                                        mkexpr(res128)))
11862                     : mkexpr(res128);
11863      putQReg128(dd, res);
11864      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11865      const HChar* nm  = isMAX ? (isU ? "umaxp" : "smaxp")
11866                               : (isU ? "uminp" : "sminp");
11867      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11868          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11869      return True;
11870   }
11871
11872   if (opcode == BITS5(1,0,1,1,0)) {
11873      /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
11874      /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
11875      if (size == X00 || size == X11) return False;
11876      Bool isR = bitU == 1;
11877      IRTemp res, sat1q, sat1n, vN, vM;
11878      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
11879      newTempsV128_2(&vN, &vM);
11880      assign(vN, getQReg128(nn));
11881      assign(vM, getQReg128(mm));
11882      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
11883      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11884      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
11885      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
11886      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11887      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
11888      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11889          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11890      return True;
11891   }
11892
11893   if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
11894      /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
11895      if (bitQ == 0 && size == X11) return False; // implied 1d case
11896      IRTemp vN = newTempV128();
11897      IRTemp vM = newTempV128();
11898      assign(vN, getQReg128(nn));
11899      assign(vM, getQReg128(mm));
11900      IRTemp res128 = newTempV128();
11901      assign(res128,
11902             binop(mkVecADD(size),
11903                   binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
11904                   binop(mkVecCATODDLANES(size),  mkexpr(vM), mkexpr(vN))));
      /* In the half-width case, use CatEvenLanes32x4 to extract the
         half-width result from the full-width result. */
11907      IRExpr* res
11908         = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
11909                            binop(Iop_CatEvenLanes32x4, mkexpr(res128),
11910                                                        mkexpr(res128)))
11911                     : mkexpr(res128);
11912      putQReg128(dd, res);
11913      const HChar* arr = nameArr_Q_SZ(bitQ, size);
11914      DIP("addp %s.%s, %s.%s, %s.%s\n",
11915          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11916      return True;
11917   }
11918
11919   if (bitU == 0
11920       && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
11921      /* -------- 0,0x,11000 FMAXNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11922      /* -------- 0,1x,11000 FMINNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11923      /* -------- 0,0x,11110 FMAX   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11924      /* -------- 0,1x,11110 FMIN   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11925      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
11926      Bool   isD   = (size & 1) == 1;
11927      if (bitQ == 0 && isD) return False; // implied 1d case
11928      Bool   isMIN = (size & 2) == 2;
11929      Bool   isNM  = opcode == BITS5(1,1,0,0,0);
11930      IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? X11 : X10);
11931      IRTemp res   = newTempV128();
11932      assign(res, binop(opMXX, getQReg128(nn), getQReg128(mm)));
11933      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11934      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
11935      DIP("%s%s %s.%s, %s.%s, %s.%s\n",
11936          isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
11937          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11938      return True;
11939   }
11940
11941   if (bitU == 0 && opcode == BITS5(1,1,0,0,1)) {
11942      /* -------- 0,0x,11001 FMLA 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11943      /* -------- 0,1x,11001 FMLS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11944      Bool isD   = (size & 1) == 1;
11945      Bool isSUB = (size & 2) == 2;
11946      if (bitQ == 0 && isD) return False; // implied 1d case
11947      IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
11948      IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
11949      IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
11950      IRTemp rm = mk_get_IR_rounding_mode();
11951      IRTemp t1 = newTempV128();
11952      IRTemp t2 = newTempV128();
11953      // FIXME: double rounding; use FMA primops instead
11954      assign(t1, triop(opMUL,
11955                       mkexpr(rm), getQReg128(nn), getQReg128(mm)));
11956      assign(t2, triop(isSUB ? opSUB : opADD,
11957                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
11958      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
11959      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
11960      DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
11961          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11962      return True;
11963   }
11964
11965   if (bitU == 0 && opcode == BITS5(1,1,0,1,0)) {
11966      /* -------- 0,0x,11010 FADD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11967      /* -------- 0,1x,11010 FSUB 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11968      Bool isD   = (size & 1) == 1;
11969      Bool isSUB = (size & 2) == 2;
11970      if (bitQ == 0 && isD) return False; // implied 1d case
11971      const IROp ops[4]
11972         = { Iop_Add32Fx4, Iop_Add64Fx2, Iop_Sub32Fx4, Iop_Sub64Fx2 };
11973      IROp   op = ops[size];
11974      IRTemp rm = mk_get_IR_rounding_mode();
11975      IRTemp t1 = newTempV128();
11976      IRTemp t2 = newTempV128();
11977      assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
11978      assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
11979      putQReg128(dd, mkexpr(t2));
11980      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
11981      DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fsub" : "fadd",
11982          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11983      return True;
11984   }
11985
11986   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
11987      /* -------- 1,1x,11010 FABD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11988      Bool isD = (size & 1) == 1;
11989      if (bitQ == 0 && isD) return False; // implied 1d case
11990      IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
11991      IROp   opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
11992      IRTemp rm    = mk_get_IR_rounding_mode();
11993      IRTemp t1    = newTempV128();
11994      IRTemp t2    = newTempV128();
11995      // FIXME: use Abd primop instead?
11996      assign(t1, triop(opSUB, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
11997      assign(t2, unop(opABS, mkexpr(t1)));
11998      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
11999      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12000      DIP("fabd %s.%s, %s.%s, %s.%s\n",
12001          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12002      return True;
12003   }
12004
12005   if (size <= X01 && opcode == BITS5(1,1,0,1,1)) {
12006      /* -------- 0,0x,11011 FMULX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12007      /* -------- 1,0x,11011 FMUL  2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12008      // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
12009      Bool isD    = (size & 1) == 1;
12010      Bool isMULX = bitU == 0;
12011      if (bitQ == 0 && isD) return False; // implied 1d case
12012      IRTemp rm = mk_get_IR_rounding_mode();
12013      IRTemp t1 = newTempV128();
12014      assign(t1, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
12015                       mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12016      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12017      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12018      DIP("%s %s.%s, %s.%s, %s.%s\n", isMULX ? "fmulx" : "fmul",
12019          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12020      return True;
12021   }
12022
12023   if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
12024      /* -------- 0,0x,11100 FCMEQ 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12025      /* -------- 1,0x,11100 FCMGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12026      Bool isD = (size & 1) == 1;
12027      if (bitQ == 0 && isD) return False; // implied 1d case
12028      Bool   isGE  = bitU == 1;
12029      IROp   opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
12030                          : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
12031      IRTemp t1    = newTempV128();
12032      assign(t1, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
12033                      : binop(opCMP, getQReg128(nn), getQReg128(mm)));
12034      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12035      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12036      DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
12037          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12038      return True;
12039   }
12040
12041   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
12042      /* -------- 1,1x,11100 FCMGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12043      Bool isD = (size & 1) == 1;
12044      if (bitQ == 0 && isD) return False; // implied 1d case
12045      IROp   opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
12046      IRTemp t1    = newTempV128();
12047      assign(t1, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
12048      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12049      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12050      DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
12051          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12052      return True;
12053   }
12054
12055   if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
12056      /* -------- 1,0x,11101 FACGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12057      /* -------- 1,1x,11101 FACGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12058      Bool isD  = (size & 1) == 1;
12059      Bool isGT = (size & 2) == 2;
12060      if (bitQ == 0 && isD) return False; // implied 1d case
12061      IROp   opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
12062                          : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
12063      IROp   opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
12064      IRTemp t1    = newTempV128();
12065      assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
12066                              unop(opABS, getQReg128(nn)))); // swapd
12067      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12068      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12069      DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
12070          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12071      return True;
12072   }
12073
12074   if (bitU == 1
12075       && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
12076      /* -------- 1,0x,11000 FMAXNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12077      /* -------- 1,1x,11000 FMINNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12078      /* -------- 1,0x,11110 FMAXP   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12079      /* -------- 1,1x,11110 FMINP   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12080      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
12081      Bool isD = (size & 1) == 1;
12082      if (bitQ == 0 && isD) return False; // implied 1d case
12083      Bool   isMIN = (size & 2) == 2;
12084      Bool   isNM  = opcode == BITS5(1,1,0,0,0);
12085      IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
12086      IRTemp srcN  = newTempV128();
12087      IRTemp srcM  = newTempV128();
12088      IRTemp preL  = IRTemp_INVALID;
12089      IRTemp preR  = IRTemp_INVALID;
12090      assign(srcN, getQReg128(nn));
12091      assign(srcM, getQReg128(mm));
12092      math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
12093                                           srcM, srcN, isD, bitQ);
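      /* preL/preR now hold the source elements rearranged so that the
         single vector min/max below yields the pairwise results. */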
12094      putQReg128(
12095         dd, math_MAYBE_ZERO_HI64_fromE(
12096                bitQ,
12097                binop(opMXX, mkexpr(preL), mkexpr(preR))));
12098      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12099      DIP("%s%sp %s.%s, %s.%s, %s.%s\n",
12100          isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
12101          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12102      return True;
12103   }
12104
12105   if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,0,1,0)) {
12106      /* -------- 1,0x,11010 FADDP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12107      Bool isD = size == X01;
12108      if (bitQ == 0 && isD) return False; // implied 1d case
12109      IRTemp srcN = newTempV128();
12110      IRTemp srcM = newTempV128();
12111      IRTemp preL = IRTemp_INVALID;
12112      IRTemp preR = IRTemp_INVALID;
12113      assign(srcN, getQReg128(nn));
12114      assign(srcM, getQReg128(mm));
12115      math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
12116                                           srcM, srcN, isD, bitQ);
12117      putQReg128(
12118         dd, math_MAYBE_ZERO_HI64_fromE(
12119                bitQ,
12120                triop(mkVecADDF(isD ? 3 : 2),
12121                      mkexpr(mk_get_IR_rounding_mode()),
12122                      mkexpr(preL), mkexpr(preR))));
12123      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12124      DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
12125          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12126      return True;
12127   }
12128
12129   if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,1,1,1)) {
12130      /* -------- 1,0x,11111 FDIV 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12131      Bool isD = (size & 1) == 1;
12132      if (bitQ == 0 && isD) return False; // implied 1d case
12133      vassert(size <= 1);
12134      const IROp ops[2] = { Iop_Div32Fx4, Iop_Div64Fx2 };
12135      IROp   op = ops[size];
12136      IRTemp rm = mk_get_IR_rounding_mode();
12137      IRTemp t1 = newTempV128();
12138      IRTemp t2 = newTempV128();
12139      assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12140      assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
12141      putQReg128(dd, mkexpr(t2));
12142      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12143      DIP("%s %s.%s, %s.%s, %s.%s\n", "fdiv",
12144          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12145      return True;
12146   }
12147
12148   if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
12149      /* -------- 0,0x,11111: FRECPS  2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12150      /* -------- 0,1x,11111: FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12151      Bool isSQRT = (size & 2) == 2;
12152      Bool isD    = (size & 1) == 1;
12153      if (bitQ == 0 && isD) return False; // implied 1d case
12154      IROp op     = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
12155                           : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
12156      IRTemp res = newTempV128();
12157      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
12158      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12159      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12160      DIP("%s %s.%s, %s.%s, %s.%s\n", isSQRT ? "frsqrts" : "frecps",
12161          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12162      return True;
12163   }
12164
12165   return False;
12166#  undef INSN
12167}
12168
12169
12170static
12171Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
12172{
12173   /* 31 30 29 28    23   21    16     11 9 4
12174      0  Q  U  01110 size 10000 opcode 10 n d
12175      Decode fields: U,size,opcode
12176   */
12177#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
12178   if (INSN(31,31) != 0
12179       || INSN(28,24) != BITS5(0,1,1,1,0)
12180       || INSN(21,17) != BITS5(1,0,0,0,0)
12181       || INSN(11,10) != BITS2(1,0)) {
12182      return False;
12183   }
12184   UInt bitQ   = INSN(30,30);
12185   UInt bitU   = INSN(29,29);
12186   UInt size   = INSN(23,22);
12187   UInt opcode = INSN(16,12);
12188   UInt nn     = INSN(9,5);
12189   UInt dd     = INSN(4,0);
12190   vassert(size < 4);
12191
12192   if (bitU == 0 && size <= X10 && opcode == BITS5(0,0,0,0,0)) {
12193      /* -------- 0,00,00000: REV64 16b_16b, 8b_8b -------- */
12194      /* -------- 0,01,00000: REV64 8h_8h, 4h_4h -------- */
12195      /* -------- 0,10,00000: REV64 4s_4s, 2s_2s -------- */
12196      const IROp iops[3] = { Iop_Reverse8sIn64_x2,
12197                             Iop_Reverse16sIn64_x2, Iop_Reverse32sIn64_x2 };
12198      vassert(size <= 2);
12199      IRTemp res = newTempV128();
12200      assign(res, unop(iops[size], getQReg128(nn)));
12201      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12202      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12203      DIP("%s %s.%s, %s.%s\n", "rev64",
12204          nameQReg128(dd), arr, nameQReg128(nn), arr);
12205      return True;
12206   }
12207
12208   if (bitU == 1 && size <= X01 && opcode == BITS5(0,0,0,0,0)) {
12209      /* -------- 1,00,00000: REV32 16b_16b, 8b_8b -------- */
12210      /* -------- 1,01,00000: REV32 8h_8h, 4h_4h -------- */
12211      Bool   isH = size == X01;
12212      IRTemp res = newTempV128();
12213      IROp   iop = isH ? Iop_Reverse16sIn32_x4 : Iop_Reverse8sIn32_x4;
12214      assign(res, unop(iop, getQReg128(nn)));
12215      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12216      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12217      DIP("%s %s.%s, %s.%s\n", "rev32",
12218          nameQReg128(dd), arr, nameQReg128(nn), arr);
12219      return True;
12220   }
12221
12222   if (bitU == 0 && size == X00 && opcode == BITS5(0,0,0,0,1)) {
12223      /* -------- 0,00,00001: REV16 16b_16b, 8b_8b -------- */
12224      IRTemp res = newTempV128();
12225      assign(res, unop(Iop_Reverse8sIn16_x8, getQReg128(nn)));
12226      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12227      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12228      DIP("%s %s.%s, %s.%s\n", "rev16",
12229          nameQReg128(dd), arr, nameQReg128(nn), arr);
12230      return True;
12231   }
12232
12233   if (opcode == BITS5(0,0,0,1,0) || opcode == BITS5(0,0,1,1,0)) {
12234      /* -------- 0,xx,00010: SADDLP std6_std6 -------- */
12235      /* -------- 1,xx,00010: UADDLP std6_std6 -------- */
12236      /* -------- 0,xx,00110: SADALP std6_std6 -------- */
12237      /* -------- 1,xx,00110: UADALP std6_std6 -------- */
12238      /* Widens, and size refers to the narrow size. */
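      /* Pairwise: each wide result lane is the sum of two adjacent
         narrow source lanes, one widened from the odd position and one
         from the even position. */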
12239      if (size == X11) return False; // no 1d or 2d cases
12240      Bool   isU   = bitU == 1;
12241      Bool   isACC = opcode == BITS5(0,0,1,1,0);
12242      IRTemp src   = newTempV128();
12243      IRTemp sum   = newTempV128();
12244      IRTemp res   = newTempV128();
12245      assign(src, getQReg128(nn));
12246      assign(sum,
12247             binop(mkVecADD(size+1),
12248                   mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12249                             isU, True/*fromOdd*/, size, mkexpr(src))),
12250                   mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12251                             isU, False/*!fromOdd*/, size, mkexpr(src)))));
12252      assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd))
12253                        : mkexpr(sum));
12254      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12255      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12256      const HChar* arrWide   = nameArr_Q_SZ(bitQ, size+1);
12257      DIP("%s %s.%s, %s.%s\n", isACC ? (isU ? "uadalp" : "sadalp")
12258                                     : (isU ? "uaddlp" : "saddlp"),
12259          nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
12260      return True;
12261   }
12262
12263   if (opcode == BITS5(0,0,0,1,1)) {
12264      /* -------- 0,xx,00011: SUQADD std7_std7 -------- */
12265      /* -------- 1,xx,00011: USQADD std7_std7 -------- */
12266      if (bitQ == 0 && size == X11) return False; // implied 1d case
12267      Bool isUSQADD = bitU == 1;
12268      /* This is switched (in the US vs SU sense) deliberately.
12269         SUQADD corresponds to the ExtUSsatSS variants and
12270         USQADD corresponds to the ExtSUsatUU variants.
12271         See libvex_ir for more details. */
12272      IROp   qop  = isUSQADD ? mkVecQADDEXTSUSATUU(size)
12273                             : mkVecQADDEXTUSSATSS(size);
12274      IROp   nop  = mkVecADD(size);
12275      IRTemp argL = newTempV128();
12276      IRTemp argR = newTempV128();
12277      IRTemp qres = newTempV128();
12278      IRTemp nres = newTempV128();
12279      /* Because the two arguments to the addition are implicitly
12280         extended differently (one signedly, the other unsignedly) it is
12281         important to present them to the primop in the correct order. */
12282      assign(argL, getQReg128(nn));
12283      assign(argR, getQReg128(dd));
12284      assign(qres, math_MAYBE_ZERO_HI64_fromE(
12285                      bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
12286      assign(nres, math_MAYBE_ZERO_HI64_fromE(
12287                      bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
12288      putQReg128(dd, mkexpr(qres));
12289      updateQCFLAGwithDifference(qres, nres);
12290      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12291      DIP("%s %s.%s, %s.%s\n", isUSQADD ? "usqadd" : "suqadd",
12292          nameQReg128(dd), arr, nameQReg128(nn), arr);
12293      return True;
12294   }
12295
12296   if (opcode == BITS5(0,0,1,0,0)) {
12297      /* -------- 0,xx,00100: CLS std6_std6 -------- */
12298      /* -------- 1,xx,00100: CLZ std6_std6 -------- */
12299      if (size == X11) return False; // no 1d or 2d cases
12300      const IROp opsCLS[3] = { Iop_Cls8x16, Iop_Cls16x8, Iop_Cls32x4 };
12301      const IROp opsCLZ[3] = { Iop_Clz8x16, Iop_Clz16x8, Iop_Clz32x4 };
12302      Bool   isCLZ = bitU == 1;
12303      IRTemp res   = newTempV128();
12304      vassert(size <= 2);
12305      assign(res, unop(isCLZ ? opsCLZ[size] : opsCLS[size], getQReg128(nn)));
12306      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12307      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12308      DIP("%s %s.%s, %s.%s\n", isCLZ ? "clz" : "cls",
12309          nameQReg128(dd), arr, nameQReg128(nn), arr);
12310      return True;
12311   }
12312
12313   if (size == X00 && opcode == BITS5(0,0,1,0,1)) {
12314      /* -------- 0,00,00101: CNT 16b_16b, 8b_8b -------- */
12315      /* -------- 1,00,00101: NOT 16b_16b, 8b_8b -------- */
12316      IRTemp res = newTempV128();
12317      assign(res, unop(bitU == 0 ? Iop_Cnt8x16 : Iop_NotV128, getQReg128(nn)));
12318      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12319      const HChar* arr = nameArr_Q_SZ(bitQ, 0);
12320      DIP("%s %s.%s, %s.%s\n", bitU == 0 ? "cnt" : "not",
12321          nameQReg128(dd), arr, nameQReg128(nn), arr);
12322      return True;
12323   }
12324
12325   if (bitU == 1 && size == X01 && opcode == BITS5(0,0,1,0,1)) {
12326      /* -------- 1,01,00101  RBIT 16b_16b, 8b_8b -------- */
12327      IRTemp res = newTempV128();
12328      assign(res, unop(Iop_Reverse1sIn8_x16, getQReg128(nn)));
12329      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12330      const HChar* arr = nameArr_Q_SZ(bitQ, 0);
12331      DIP("%s %s.%s, %s.%s\n", "rbit",
12332          nameQReg128(dd), arr, nameQReg128(nn), arr);
12333      return True;
12334   }
12335
12336   if (opcode == BITS5(0,0,1,1,1)) {
12337      /* -------- 0,xx,00111 SQABS std7_std7 -------- */
12338      /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
12339      if (bitQ == 0 && size == X11) return False; // implied 1d case
12340      Bool   isNEG  = bitU == 1;
12341      IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
12342      (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
12343                                         getQReg128(nn), size );
12344      IRTemp qres = newTempV128(), nres = newTempV128();
12345      assign(qres, math_MAYBE_ZERO_HI64(bitQ, qresFW));
12346      assign(nres, math_MAYBE_ZERO_HI64(bitQ, nresFW));
12347      putQReg128(dd, mkexpr(qres));
12348      updateQCFLAGwithDifference(qres, nres);
12349      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12350      DIP("%s %s.%s, %s.%s\n", isNEG ? "sqneg" : "sqabs",
12351          nameQReg128(dd), arr, nameQReg128(nn), arr);
12352      return True;
12353   }
12354
12355   if (opcode == BITS5(0,1,0,0,0)) {
12356      /* -------- 0,xx,01000: CMGT std7_std7_#0 -------- */ // >s 0
12357      /* -------- 1,xx,01000: CMGE std7_std7_#0 -------- */ // >=s 0
12358      if (bitQ == 0 && size == X11) return False; // implied 1d case
12359      Bool    isGT  = bitU == 0;
12360      IRExpr* argL  = getQReg128(nn);
12361      IRExpr* argR  = mkV128(0x0000);
12362      IRTemp  res   = newTempV128();
12363      IROp    opGTS = mkVecCMPGTS(size);
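      /* Only a signed-greater-than primop is available, so compute
         CMGE (x >=s 0) as NOT(0 >s x). */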
12364      assign(res, isGT ? binop(opGTS, argL, argR)
12365                       : unop(Iop_NotV128, binop(opGTS, argR, argL)));
12366      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12367      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12368      DIP("cm%s %s.%s, %s.%s, #0\n", isGT ? "gt" : "ge",
12369          nameQReg128(dd), arr, nameQReg128(nn), arr);
12370      return True;
12371   }
12372
12373   if (opcode == BITS5(0,1,0,0,1)) {
12374      /* -------- 0,xx,01001: CMEQ std7_std7_#0 -------- */ // == 0
12375      /* -------- 1,xx,01001: CMLE std7_std7_#0 -------- */ // <=s 0
12376      if (bitQ == 0 && size == X11) return False; // implied 1d case
12377      Bool    isEQ = bitU == 0;
12378      IRExpr* argL = getQReg128(nn);
12379      IRExpr* argR = mkV128(0x0000);
12380      IRTemp  res  = newTempV128();
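      /* Similarly, CMLE (x <=s 0) is computed as NOT(x >s 0). */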
12381      assign(res, isEQ ? binop(mkVecCMPEQ(size), argL, argR)
12382                       : unop(Iop_NotV128,
12383                              binop(mkVecCMPGTS(size), argL, argR)));
12384      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12385      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12386      DIP("cm%s %s.%s, %s.%s, #0\n", isEQ ? "eq" : "le",
12387          nameQReg128(dd), arr, nameQReg128(nn), arr);
12388      return True;
12389   }
12390
12391   if (bitU == 0 && opcode == BITS5(0,1,0,1,0)) {
12392      /* -------- 0,xx,01010: CMLT std7_std7_#0 -------- */ // <s 0
12393      if (bitQ == 0 && size == X11) return False; // implied 1d case
12394      IRExpr* argL = getQReg128(nn);
12395      IRExpr* argR = mkV128(0x0000);
12396      IRTemp  res  = newTempV128();
12397      assign(res, binop(mkVecCMPGTS(size), argR, argL));
12398      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12399      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12400      DIP("cm%s %s.%s, %s.%s, #0\n", "lt",
12401          nameQReg128(dd), arr, nameQReg128(nn), arr);
12402      return True;
12403   }
12404
12405   if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) {
12406      /* -------- 0,xx,01011: ABS std7_std7 -------- */
12407      if (bitQ == 0 && size == X11) return False; // implied 1d case
12408      IRTemp res = newTempV128();
12409      assign(res, unop(mkVecABS(size), getQReg128(nn)));
12410      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12411      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12412      DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
12413      return True;
12414   }
12415
12416   if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) {
12417      /* -------- 1,xx,01011: NEG std7_std7 -------- */
12418      if (bitQ == 0 && size == X11) return False; // implied 1d case
12419      IRTemp res = newTempV128();
12420      assign(res, binop(mkVecSUB(size), mkV128(0x0000), getQReg128(nn)));
12421      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12422      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12423      DIP("neg %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
12424      return True;
12425   }
12426
12427   UInt ix = 0; /*INVALID*/
12428   if (size >= X10) {
12429      switch (opcode) {
12430         case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
12431         case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
12432         case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
12433         default: break;
12434      }
12435   }
12436   if (ix > 0) {
12437      /* -------- 0,1x,01100 FCMGT 2d_2d,4s_4s,2s_2s _#0.0 (ix 1) -------- */
12438      /* -------- 0,1x,01101 FCMEQ 2d_2d,4s_4s,2s_2s _#0.0 (ix 2) -------- */
12439      /* -------- 0,1x,01110 FCMLT 2d_2d,4s_4s,2s_2s _#0.0 (ix 3) -------- */
12440      /* -------- 1,1x,01100 FCMGE 2d_2d,4s_4s,2s_2s _#0.0 (ix 4) -------- */
12441      /* -------- 1,1x,01101 FCMLE 2d_2d,4s_4s,2s_2s _#0.0 (ix 5) -------- */
12442      if (bitQ == 0 && size == X11) return False; // implied 1d case
12443      Bool   isD     = size == X11;
12444      IROp   opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
12445      IROp   opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
12446      IROp   opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
12447      IROp   opCmp   = Iop_INVALID;
12448      Bool   swap    = False;
12449      const HChar* nm = "??";
12450      switch (ix) {
12451         case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
12452         case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
12453         case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
12454         case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
12455         case 5: nm = "fcmle"; opCmp = opCmpLE; break;
12456         default: vassert(0);
12457      }
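      /* Only EQ, LE and LT primops are available, so GT and GE are
         done as LT and LE with the operands swapped:
         x > 0.0  <=>  0.0 < x,  and  x >= 0.0  <=>  0.0 <= x. */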
12458      IRExpr* zero = mkV128(0x0000);
12459      IRTemp res = newTempV128();
12460      assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
12461                       : binop(opCmp, getQReg128(nn), zero));
12462      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12463      const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12464      DIP("%s %s.%s, %s.%s, #0.0\n", nm,
12465          nameQReg128(dd), arr, nameQReg128(nn), arr);
12466      return True;
12467   }
12468
12469   if (size >= X10 && opcode == BITS5(0,1,1,1,1)) {
12470      /* -------- 0,1x,01111: FABS 2d_2d, 4s_4s, 2s_2s -------- */
12471      /* -------- 1,1x,01111: FNEG 2d_2d, 4s_4s, 2s_2s -------- */
12472      if (bitQ == 0 && size == X11) return False; // implied 1d case
12473      Bool   isFNEG = bitU == 1;
12474      IROp   op     = isFNEG ? (size == X10 ? Iop_Neg32Fx4 : Iop_Neg64Fx2)
12475                             : (size == X10 ? Iop_Abs32Fx4 : Iop_Abs64Fx2);
12476      IRTemp res = newTempV128();
12477      assign(res, unop(op, getQReg128(nn)));
12478      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12479      const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12480      DIP("%s %s.%s, %s.%s\n", isFNEG ? "fneg" : "fabs",
12481          nameQReg128(dd), arr, nameQReg128(nn), arr);
12482      return True;
12483   }
12484
12485   if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
12486      /* -------- 0,xx,10010: XTN{,2} -------- */
12487      if (size == X11) return False;
12488      vassert(size < 3);
12489      Bool   is2  = bitQ == 1;
12490      IROp   opN  = mkVecNARROWUN(size);
12491      IRTemp resN = newTempV128();
12492      assign(resN, unop(Iop_64UtoV128, unop(opN, getQReg128(nn))));
12493      putLO64andZUorPutHI64(is2, dd, resN);
12494      const HChar* nm        = "xtn";
12495      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12496      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
12497      DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
12498          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12499      return True;
12500   }
12501
12502   if (opcode == BITS5(1,0,1,0,0)
12503       || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
12504      /* -------- 0,xx,10100: SQXTN{,2} -------- */
12505      /* -------- 1,xx,10100: UQXTN{,2} -------- */
12506      /* -------- 1,xx,10010: SQXTUN{,2} -------- */
12507      if (size == X11) return False;
12508      vassert(size < 3);
12509      Bool  is2    = bitQ == 1;
12510      IROp  opN    = Iop_INVALID;
12511      Bool  zWiden = True;
12512      const HChar* nm = "??";
12513      /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
12514         opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
12515      }
12516      else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
12517         opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
12518      }
12519      else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
12520         opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
12521      }
12522      else vassert(0);
12523      IRTemp src  = newTempV128();
12524      assign(src, getQReg128(nn));
12525      IRTemp resN = newTempV128();
12526      assign(resN, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
12527      putLO64andZUorPutHI64(is2, dd, resN);
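      /* Saturation detection: re-widen the narrowed result (zero- or
         sign-extending according to zWiden) and compare it with the
         original; any difference means some lane saturated, which
         must set FPSR.QC. */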
12528      IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
12529                                              size, mkexpr(resN));
12530      updateQCFLAGwithDifference(src, resW);
12531      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12532      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
12533      DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
12534          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12535      return True;
12536   }
12537
12538   if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
12539      /* -------- 1,xx,10011 SHLL{2} #lane-width -------- */
12540      /* Widens, and size is the narrow size. */
12541      if (size == X11) return False;
12542      Bool is2   = bitQ == 1;
12543      IROp opINT = is2 ? mkVecINTERLEAVEHI(size) : mkVecINTERLEAVELO(size);
12544      IROp opSHL = mkVecSHLN(size+1);
12545      IRTemp src = newTempV128();
12546      IRTemp res = newTempV128();
12547      assign(src, getQReg128(nn));
12548      assign(res, binop(opSHL, binop(opINT, mkexpr(src), mkexpr(src)),
12549                               mkU8(8 << size)));
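      /* Why this works: let w = 8 << size be the narrow lane width.
         Interleaving src with itself leaves each wide lane holding
         (a << w) | a; shifting that left by w throws away the upper
         copy and leaves exactly a << w, the required widened and
         shifted lane.  Signedness doesn't matter, since the extension
         bits are all shifted out. */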
12550      putQReg128(dd, mkexpr(res));
12551      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12552      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
12553      DIP("shll%s %s.%s, %s.%s, #%d\n", is2 ? "2" : "",
12554          nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow, 8 << size);
12555      return True;
12556   }
12557
12558   if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,0)) {
12559      /* -------- 0,0x,10110: FCVTN 4h/8h_4s, 2s/4s_2d -------- */
12560      UInt   nLanes = size == X00 ? 4 : 2;
12561      IRType srcTy  = size == X00 ? Ity_F32 : Ity_F64;
12562      IROp   opCvt  = size == X00 ? Iop_F32toF16 : Iop_F64toF32;
12563      IRTemp rm     = mk_get_IR_rounding_mode();
12564      IRTemp src[nLanes];
12565      for (UInt i = 0; i < nLanes; i++) {
12566         src[i] = newTemp(srcTy);
12567         assign(src[i], getQRegLane(nn, i, srcTy));
12568      }
12569      for (UInt i = 0; i < nLanes; i++) {
12570         putQRegLane(dd, nLanes * bitQ + i,
12571                         binop(opCvt, mkexpr(rm), mkexpr(src[i])));
12572      }
12573      if (bitQ == 0) {
12574         putQRegLane(dd, 1, mkU64(0));
12575      }
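      /* Lane placement: FCVTN2 (bitQ == 1) writes the narrowed
         results to the upper half of Vd and leaves the lower half
         unchanged; FCVTN (bitQ == 0) writes the lower half and zeroes
         the upper 64 bits. */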
12576      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
12577      const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
12578      DIP("fcvtn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
12579          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12580      return True;
12581   }
12582
12583   if (bitU == 1 && size == X01 && opcode == BITS5(1,0,1,1,0)) {
12584      /* -------- 1,01,10110: FCVTXN 2s/4s_2d -------- */
      /* Using Irrm_NEAREST here isn't right.  The insn requires
         "round to odd" (von Neumann rounding): truncate, then set the
         result's LSB if any discarded bit was nonzero.  That makes a
         subsequent narrowing immune to double rounding, but it isn't
         expressible as an IR rounding mode, so this is a kludge. */
12587      IRType srcTy = Ity_F64;
12588      IROp   opCvt = Iop_F64toF32;
12589      IRTemp src[2];
12590      for (UInt i = 0; i < 2; i++) {
12591         src[i] = newTemp(srcTy);
12592         assign(src[i], getQRegLane(nn, i, srcTy));
12593      }
12594      for (UInt i = 0; i < 2; i++) {
12595         putQRegLane(dd, 2 * bitQ + i,
12596                         binop(opCvt, mkU32(Irrm_NEAREST), mkexpr(src[i])));
12597      }
12598      if (bitQ == 0) {
12599         putQRegLane(dd, 1, mkU64(0));
12600      }
12601      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
12602      const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
12603      DIP("fcvtxn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
12604          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12605      return True;
12606   }
12607
12608   if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,1)) {
12609      /* -------- 0,0x,10111: FCVTL 4s_4h/8h, 2d_2s/4s -------- */
12610      UInt   nLanes = size == X00 ? 4 : 2;
12611      IRType srcTy  = size == X00 ? Ity_F16 : Ity_F32;
12612      IROp   opCvt  = size == X00 ? Iop_F16toF32 : Iop_F32toF64;
12613      IRTemp src[nLanes];
12614      for (UInt i = 0; i < nLanes; i++) {
12615         src[i] = newTemp(srcTy);
12616         assign(src[i], getQRegLane(nn, nLanes * bitQ + i, srcTy));
12617      }
12618      for (UInt i = 0; i < nLanes; i++) {
12619         putQRegLane(dd, i, unop(opCvt, mkexpr(src[i])));
12620      }
12621      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
12622      const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
12623      DIP("fcvtl%s %s.%s, %s.%s\n", bitQ ? "2" : "",
12624          nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
12625      return True;
12626   }
12627
12628   ix = 0;
12629   if (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,0,0,1)) {
12630      ix = 1 + ((((bitU & 1) << 2) | ((size & 2) << 0)) | ((opcode & 1) << 0));
12631      // = 1 + bitU[0]:size[1]:opcode[0]
12632      vassert(ix >= 1 && ix <= 8);
12633      if (ix == 7) ix = 0;
12634   }
12635   if (ix > 0) {
12636      /* -------- 0,0x,11000 FRINTN 2d_2d, 4s_4s, 2s_2s (1) -------- */
12637      /* -------- 0,0x,11001 FRINTM 2d_2d, 4s_4s, 2s_2s (2) -------- */
12638      /* -------- 0,1x,11000 FRINTP 2d_2d, 4s_4s, 2s_2s (3) -------- */
12639      /* -------- 0,1x,11001 FRINTZ 2d_2d, 4s_4s, 2s_2s (4) -------- */
12640      /* -------- 1,0x,11000 FRINTA 2d_2d, 4s_4s, 2s_2s (5) -------- */
12641      /* -------- 1,0x,11001 FRINTX 2d_2d, 4s_4s, 2s_2s (6) -------- */
12642      /* -------- 1,1x,11000 (apparently unassigned)    (7) -------- */
12643      /* -------- 1,1x,11001 FRINTI 2d_2d, 4s_4s, 2s_2s (8) -------- */
12644      /* rm plan:
12645         FRINTN: tieeven -- !! FIXME KLUDGED !!
12646         FRINTM: -inf
12647         FRINTP: +inf
12648         FRINTZ: zero
12649         FRINTA: tieaway -- !! FIXME KLUDGED !!
12650         FRINTX: per FPCR + "exact = TRUE"
12651         FRINTI: per FPCR
12652      */
12653      Bool isD = (size & 1) == 1;
12654      if (bitQ == 0 && isD) return False; // implied 1d case
12655
12656      IRTemp irrmRM = mk_get_IR_rounding_mode();
12657
12658      UChar ch = '?';
12659      IRTemp irrm = newTemp(Ity_I32);
12660      switch (ix) {
12661         case 1: ch = 'n'; assign(irrm, mkU32(Irrm_NEAREST)); break;
12662         case 2: ch = 'm'; assign(irrm, mkU32(Irrm_NegINF)); break;
12663         case 3: ch = 'p'; assign(irrm, mkU32(Irrm_PosINF)); break;
12664         case 4: ch = 'z'; assign(irrm, mkU32(Irrm_ZERO)); break;
12665         // The following is a kludge.  Should be: Irrm_NEAREST_TIE_AWAY_0
12666         case 5: ch = 'a'; assign(irrm, mkU32(Irrm_NEAREST)); break;
         // FRINTX ("round to integral exact") rounds per FPCR, just as
         // FRINTI does, but additionally signals the Inexact exception
         // when the result differs from the input.  We don't model that
         // exception, so the two get identical treatment here.
12669         case 6: ch = 'x'; assign(irrm, mkexpr(irrmRM)); break;
12670         case 8: ch = 'i'; assign(irrm, mkexpr(irrmRM)); break;
12671         default: vassert(0);
12672      }
12673
12674      IROp opRND = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
12675      if (isD) {
12676         for (UInt i = 0; i < 2; i++) {
12677            putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
12678                                            getQRegLane(nn, i, Ity_F64)));
12679         }
12680      } else {
12681         UInt n = bitQ==1 ? 4 : 2;
12682         for (UInt i = 0; i < n; i++) {
12683            putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
12684                                            getQRegLane(nn, i, Ity_F32)));
12685         }
12686         if (bitQ == 0)
12687            putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
12688      }
12689      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12690      DIP("frint%c %s.%s, %s.%s\n", ch,
12691          nameQReg128(dd), arr, nameQReg128(nn), arr);
12692      return True;
12693   }
12694
12695   ix = 0; /*INVALID*/
12696   switch (opcode) {
12697      case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
12698      case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
12699      case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
12700      default: break;
12701   }
12702   if (ix > 0) {
12703      /* -------- 0,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
12704      /* -------- 0,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
12705      /* -------- 0,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
12706      /* -------- 0,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
12707      /* -------- 0,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
      /* -------- 1,0x,11010 FCVTNU 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
      /* -------- 1,0x,11011 FCVTMU 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
      /* -------- 1,0x,11100 FCVTAU 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
      /* -------- 1,1x,11010 FCVTPU 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
      /* -------- 1,1x,11011 FCVTZU 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
12713      Bool isD = (size & 1) == 1;
12714      if (bitQ == 0 && isD) return False; // implied 1d case
12715
12716      IRRoundingMode irrm = 8; /*impossible*/
12717      HChar          ch   = '?';
12718      switch (ix) {
12719         case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
12720         case 2: ch = 'm'; irrm = Irrm_NegINF;  break;
12721         case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
12722         case 4: ch = 'p'; irrm = Irrm_PosINF;  break;
12723         case 5: ch = 'z'; irrm = Irrm_ZERO;    break;
12724         default: vassert(0);
12725      }
12726      IROp cvt = Iop_INVALID;
12727      if (bitU == 1) {
12728         cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
12729      } else {
12730         cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
12731      }
12732      if (isD) {
12733         for (UInt i = 0; i < 2; i++) {
12734            putQRegLane(dd, i, binop(cvt, mkU32(irrm),
12735                                            getQRegLane(nn, i, Ity_F64)));
12736         }
12737      } else {
12738         UInt n = bitQ==1 ? 4 : 2;
12739         for (UInt i = 0; i < n; i++) {
12740            putQRegLane(dd, i, binop(cvt, mkU32(irrm),
12741                                            getQRegLane(nn, i, Ity_F32)));
12742         }
12743         if (bitQ == 0)
12744            putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
12745      }
12746      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12747      DIP("fcvt%c%c %s.%s, %s.%s\n", ch, bitU == 1 ? 'u' : 's',
12748          nameQReg128(dd), arr, nameQReg128(nn), arr);
12749      return True;
12750   }
12751
12752   if (size == X10 && opcode == BITS5(1,1,1,0,0)) {
12753      /* -------- 0,10,11100: URECPE  4s_4s, 2s_2s -------- */
12754      /* -------- 1,10,11100: URSQRTE 4s_4s, 2s_2s -------- */
12755      Bool isREC = bitU == 0;
12756      IROp op    = isREC ? Iop_RecipEst32Ux4 : Iop_RSqrtEst32Ux4;
12757      IRTemp res = newTempV128();
12758      assign(res, unop(op, getQReg128(nn)));
12759      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12760      const HChar* nm  = isREC ? "urecpe" : "ursqrte";
12761      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12762      DIP("%s %s.%s, %s.%s\n", nm,
12763          nameQReg128(dd), arr, nameQReg128(nn), arr);
12764      return True;
12765   }
12766
12767   if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
12768      /* -------- 0,0x,11101: SCVTF -------- */
12769      /* -------- 1,0x,11101: UCVTF -------- */
12770      /* 31  28      22 21       15     9 4
12771         0q0 01110 0 sz 1  00001 110110 n d  SCVTF Vd, Vn
12772         0q1 01110 0 sz 1  00001 110110 n d  UCVTF Vd, Vn
12773         with laneage:
12774         case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D
12775      */
12776      Bool isQ   = bitQ == 1;
12777      Bool isU   = bitU == 1;
12778      Bool isF64 = (size & 1) == 1;
12779      if (isQ || !isF64) {
12780         IRType tyF = Ity_INVALID, tyI = Ity_INVALID;
12781         UInt   nLanes = 0;
12782         Bool   zeroHI = False;
12783         const HChar* arrSpec = NULL;
12784         Bool   ok  = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec,
12785                                       isQ, isF64 );
12786         IROp   iop = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32)
12787                          : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32);
12788         IRTemp rm  = mk_get_IR_rounding_mode();
12789         UInt   i;
12790         vassert(ok); /* the 'if' above should ensure this */
12791         for (i = 0; i < nLanes; i++) {
12792            putQRegLane(dd, i,
12793                        binop(iop, mkexpr(rm), getQRegLane(nn, i, tyI)));
12794         }
12795         if (zeroHI) {
12796            putQRegLane(dd, 1, mkU64(0));
12797         }
12798         DIP("%ccvtf %s.%s, %s.%s\n", isU ? 'u' : 's',
12799             nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec);
12800         return True;
12801      }
12802      /* else fall through */
12803   }
12804
12805   if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
12806      /* -------- 0,1x,11101: FRECPE  2d_2d, 4s_4s, 2s_2s -------- */
12807      /* -------- 1,1x,11101: FRSQRTE 2d_2d, 4s_4s, 2s_2s -------- */
12808      Bool isSQRT = bitU == 1;
12809      Bool isD    = (size & 1) == 1;
12810      IROp op     = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
12811                           : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
12812      if (bitQ == 0 && isD) return False; // implied 1d case
12813      IRTemp resV = newTempV128();
12814      assign(resV, unop(op, getQReg128(nn)));
12815      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
12816      const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12817      DIP("%s %s.%s, %s.%s\n", isSQRT ? "frsqrte" : "frecpe",
12818          nameQReg128(dd), arr, nameQReg128(nn), arr);
12819      return True;
12820   }
12821
12822   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
12823      /* -------- 1,1x,11111: FSQRT 2d_2d, 4s_4s, 2s_2s -------- */
12824      Bool isD = (size & 1) == 1;
12825      IROp op  = isD ? Iop_Sqrt64Fx2 : Iop_Sqrt32Fx4;
12826      if (bitQ == 0 && isD) return False; // implied 1d case
12827      IRTemp resV = newTempV128();
12828      assign(resV, binop(op, mkexpr(mk_get_IR_rounding_mode()),
12829                             getQReg128(nn)));
12830      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
12831      const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12832      DIP("%s %s.%s, %s.%s\n", "fsqrt",
12833          nameQReg128(dd), arr, nameQReg128(nn), arr);
12834      return True;
12835   }
12836
12837   return False;
12838#  undef INSN
12839}
12840
12841
12842static
12843Bool dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult* dres, UInt insn)
12844{
12845   /* 31    28    23   21 20 19 15     11   9 4
12846      0 Q U 01111 size L  M  m  opcode H  0 n d
12847      Decode fields are: u,size,opcode
12848      M is really part of the mm register number.  Individual
12849      cases need to inspect L and H though.
12850   */
12851#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
12852   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,1) || INSN(10,10) != 0) {
12854      return False;
12855   }
12856   UInt bitQ   = INSN(30,30);
12857   UInt bitU   = INSN(29,29);
12858   UInt size   = INSN(23,22);
12859   UInt bitL   = INSN(21,21);
12860   UInt bitM   = INSN(20,20);
12861   UInt mmLO4  = INSN(19,16);
12862   UInt opcode = INSN(15,12);
12863   UInt bitH   = INSN(11,11);
12864   UInt nn     = INSN(9,5);
12865   UInt dd     = INSN(4,0);
12866   vassert(size < 4);
12867   vassert(bitH < 2 && bitM < 2 && bitL < 2);
12868
12869   if (bitU == 0 && size >= X10
12870       && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
12871      /* -------- 0,1x,0001 FMLA 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
12872      /* -------- 0,1x,0101 FMLS 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
12873      if (bitQ == 0 && size == X11) return False; // implied 1d case
12874      Bool isD   = (size & 1) == 1;
12875      Bool isSUB = opcode == BITS4(0,1,0,1);
12876      UInt index;
12877      if      (!isD)             index = (bitH << 1) | bitL;
12878      else if (isD && bitL == 0) index = bitH;
12879      else return False; // sz:L == x11 => unallocated encoding
12880      vassert(index < (isD ? 2 : 4));
12881      IRType ity   = isD ? Ity_F64 : Ity_F32;
12882      IRTemp elem  = newTemp(ity);
12883      UInt   mm    = (bitM << 4) | mmLO4;
12884      assign(elem, getQRegLane(mm, index, ity));
12885      IRTemp dupd  = math_DUP_TO_V128(elem, ity);
12886      IROp   opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
12887      IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
12888      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
12889      IRTemp rm    = mk_get_IR_rounding_mode();
12890      IRTemp t1    = newTempV128();
12891      IRTemp t2    = newTempV128();
12892      // FIXME: double rounding; use FMA primops instead
12893      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
12894      assign(t2, triop(isSUB ? opSUB : opADD,
12895                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
12896      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
12897      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12898      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
12899          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),
12900          isD ? 'd' : 's', index);
12901      return True;
12902   }
12903
12904   if (size >= X10 && opcode == BITS4(1,0,0,1)) {
12905      /* -------- 0,1x,1001 FMUL  2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
12906      /* -------- 1,1x,1001 FMULX 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
12907      if (bitQ == 0 && size == X11) return False; // implied 1d case
12908      Bool isD    = (size & 1) == 1;
12909      Bool isMULX = bitU == 1;
12910      UInt index;
12911      if      (!isD)             index = (bitH << 1) | bitL;
12912      else if (isD && bitL == 0) index = bitH;
12913      else return False; // sz:L == x11 => unallocated encoding
12914      vassert(index < (isD ? 2 : 4));
12915      IRType ity  = isD ? Ity_F64 : Ity_F32;
12916      IRTemp elem = newTemp(ity);
12917      UInt   mm   = (bitM << 4) | mmLO4;
12918      assign(elem, getQRegLane(mm, index, ity));
12919      IRTemp dupd = math_DUP_TO_V128(elem, ity);
12920      // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
12921      IRTemp res  = newTempV128();
12922      assign(res, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
12923                        mkexpr(mk_get_IR_rounding_mode()),
12924                        getQReg128(nn), mkexpr(dupd)));
12925      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12926      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12927      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n",
12928          isMULX ? "fmulx" : "fmul", nameQReg128(dd), arr,
12929          nameQReg128(nn), arr, nameQReg128(mm), isD ? 'd' : 's', index);
12930      return True;
12931   }
12932
12933   if ((bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,1,0,0)))
12934       || (bitU == 0 && opcode == BITS4(1,0,0,0))) {
12935      /* -------- 1,xx,0000 MLA s/h variants only -------- */
12936      /* -------- 1,xx,0100 MLS s/h variants only -------- */
12937      /* -------- 0,xx,1000 MUL s/h variants only -------- */
12938      Bool isMLA = opcode == BITS4(0,0,0,0);
12939      Bool isMLS = opcode == BITS4(0,1,0,0);
12940      UInt mm    = 32; // invalid
12941      UInt ix    = 16; // invalid
12942      switch (size) {
12943         case X00:
12944            return False; // b case is not allowed
12945         case X01:
12946            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
12947         case X10:
12948            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
12949         case X11:
12950            return False; // d case is not allowed
12951         default:
12952            vassert(0);
12953      }
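      /* For the 'h' variants the element index is H:L:M, so M is not
         available as a register bit and only v0..v15 can be named;
         for the 's' variants the index is H:L and M supplies bit 4 of
         the register number.  The widening cases below use the same
         scheme. */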
12954      vassert(mm < 32 && ix < 16);
12955      IROp   opMUL = mkVecMUL(size);
12956      IROp   opADD = mkVecADD(size);
12957      IROp   opSUB = mkVecSUB(size);
12958      HChar  ch    = size == X01 ? 'h' : 's';
12959      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
12960      IRTemp vecD  = newTempV128();
12961      IRTemp vecN  = newTempV128();
12962      IRTemp res   = newTempV128();
12963      assign(vecD, getQReg128(dd));
12964      assign(vecN, getQReg128(nn));
12965      IRExpr* prod = binop(opMUL, mkexpr(vecN), mkexpr(vecM));
12966      if (isMLA || isMLS) {
12967         assign(res, binop(isMLA ? opADD : opSUB, mkexpr(vecD), prod));
12968      } else {
12969         assign(res, prod);
12970      }
12971      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12972      const HChar* arr = nameArr_Q_SZ(bitQ, size);
12973      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isMLA ? "mla"
12974                                                : (isMLS ? "mls" : "mul"),
12975          nameQReg128(dd), arr,
          nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
12977      return True;
12978   }
12979
12980   if (opcode == BITS4(1,0,1,0)
12981       || opcode == BITS4(0,0,1,0) || opcode == BITS4(0,1,1,0)) {
12982      /* -------- 0,xx,1010 SMULL s/h variants only -------- */ // 0 (ks)
12983      /* -------- 1,xx,1010 UMULL s/h variants only -------- */ // 0
12984      /* -------- 0,xx,0010 SMLAL s/h variants only -------- */ // 1
12985      /* -------- 1,xx,0010 UMLAL s/h variants only -------- */ // 1
12986      /* -------- 0,xx,0110 SMLSL s/h variants only -------- */ // 2
      /* -------- 1,xx,0110 UMLSL s/h variants only -------- */ // 2
12988      /* Widens, and size refers to the narrowed lanes. */
12989      UInt ks = 3;
12990      switch (opcode) {
12991         case BITS4(1,0,1,0): ks = 0; break;
12992         case BITS4(0,0,1,0): ks = 1; break;
12993         case BITS4(0,1,1,0): ks = 2; break;
12994         default: vassert(0);
12995      }
      vassert(ks <= 2); /* ks is unsigned; no lower bound to check */
12997      Bool isU = bitU == 1;
12998      Bool is2 = bitQ == 1;
12999      UInt mm  = 32; // invalid
13000      UInt ix  = 16; // invalid
13001      switch (size) {
13002         case X00:
13003            return False; // h_b_b[] case is not allowed
13004         case X01:
13005            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13006         case X10:
13007            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13008         case X11:
13009            return False; // q_d_d[] case is not allowed
13010         default:
13011            vassert(0);
13012      }
13013      vassert(mm < 32 && ix < 16);
13014      IRTemp vecN  = newTempV128();
13015      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13016      IRTemp vecD  = newTempV128();
13017      assign(vecN, getQReg128(nn));
13018      assign(vecD, getQReg128(dd));
13019      IRTemp res = IRTemp_INVALID;
13020      math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
13021                    vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
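      /* "mas"[ks] picks the accumulation behaviour for math_MULL_ACC:
         'm' (plain multiply), 'a' (accumulate) or 's' (subtract). */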
13022      putQReg128(dd, mkexpr(res));
13023      const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
13024      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13025      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
13026      HChar ch               = size == X01 ? 'h' : 's';
13027      DIP("%c%s%s %s.%s, %s.%s, %s.%c[%u]\n",
13028          isU ? 'u' : 's', nm, is2 ? "2" : "",
13029          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
13031      return True;
13032   }
13033
13034   if (bitU == 0
13035       && (opcode == BITS4(1,0,1,1)
13036           || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
13037      /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
13038      /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
13039      /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
13040      /* Widens, and size refers to the narrowed lanes. */
13041      UInt ks = 3;
13042      switch (opcode) {
13043         case BITS4(1,0,1,1): ks = 0; break;
13044         case BITS4(0,0,1,1): ks = 1; break;
13045         case BITS4(0,1,1,1): ks = 2; break;
13046         default: vassert(0);
13047      }
      vassert(ks <= 2); /* ks is unsigned; no lower bound to check */
13049      Bool is2 = bitQ == 1;
13050      UInt mm  = 32; // invalid
13051      UInt ix  = 16; // invalid
13052      switch (size) {
13053         case X00:
13054            return False; // h_b_b[] case is not allowed
13055         case X01:
13056            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13057         case X10:
13058            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13059         case X11:
13060            return False; // q_d_d[] case is not allowed
13061         default:
13062            vassert(0);
13063      }
13064      vassert(mm < 32 && ix < 16);
13065      IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
13066      vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
13067      newTempsV128_2(&vecN, &vecD);
13068      assign(vecN, getQReg128(nn));
13069      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13070      assign(vecD, getQReg128(dd));
13071      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
13072                       is2, size, "mas"[ks],
13073                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
13074      putQReg128(dd, mkexpr(res));
13075      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
13076      updateQCFLAGwithDifference(sat1q, sat1n);
13077      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
13078         updateQCFLAGwithDifference(sat2q, sat2n);
13079      }
13080      const HChar* nm        = ks == 0 ? "sqdmull"
13081                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
13082      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13083      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
13084      HChar ch               = size == X01 ? 'h' : 's';
13085      DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
13086          nm, is2 ? "2" : "",
13087          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
13089      return True;
13090   }
13091
13092   if (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1)) {
13093      /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
13094      /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
13095      UInt mm  = 32; // invalid
13096      UInt ix  = 16; // invalid
13097      switch (size) {
13098         case X00:
13099            return False; // b case is not allowed
13100         case X01:
13101            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13102         case X10:
13103            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13104         case X11:
13105            return False; // q case is not allowed
13106         default:
13107            vassert(0);
13108      }
13109      vassert(mm < 32 && ix < 16);
13110      Bool isR = opcode == BITS4(1,1,0,1);
13111      IRTemp res, sat1q, sat1n, vN, vM;
13112      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
13113      vN = newTempV128();
13114      assign(vN, getQReg128(nn));
13115      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13116      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
13117      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13118      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
13119      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
13120      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
13121      const HChar* arr = nameArr_Q_SZ(bitQ, size);
13122      HChar ch         = size == X01 ? 'h' : 's';
13123      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
13125      return True;
13126   }
13127
13128   return False;
13129#  undef INSN
13130}
13131
13132
13133static
13134Bool dis_AdvSIMD_crypto_aes(/*MB_OUT*/DisResult* dres, UInt insn)
13135{
13136   /* 31        23   21    16     11 9 4
13137      0100 1110 size 10100 opcode 10 n d
13138      Decode fields are: size,opcode
13139      Size is always 00 in ARMv8, it appears.
13140   */
13141#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13142   if (INSN(31,24) != BITS8(0,1,0,0,1,1,1,0)
       || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
13144      return False;
13145   }
13146   UInt size   = INSN(23,22);
13147   UInt opcode = INSN(16,12);
13148   UInt nn     = INSN(9,5);
13149   UInt dd     = INSN(4,0);
13150
13151   if (size == BITS2(0,0)
13152       && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,0,1))) {
13153      /* -------- 00,00100: AESE Vd.16b, Vn.16b -------- */
13154      /* -------- 00,00101: AESD Vd.16b, Vn.16b -------- */
13155      Bool   isD  = opcode == BITS5(0,0,1,0,1);
13156      IRTemp op1  = newTemp(Ity_V128);
13157      IRTemp op2  = newTemp(Ity_V128);
13158      IRTemp xord = newTemp(Ity_V128);
13159      IRTemp res  = newTemp(Ity_V128);
13160      void*        helper = isD ? &arm64g_dirtyhelper_AESD
13161                                : &arm64g_dirtyhelper_AESE;
13162      const HChar* hname  = isD ? "arm64g_dirtyhelper_AESD"
13163                                : "arm64g_dirtyhelper_AESE";
13164      assign(op1, getQReg128(dd));
13165      assign(op2, getQReg128(nn));
13166      assign(xord, binop(Iop_XorV128, mkexpr(op1), mkexpr(op2)));
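      /* AESE/AESD begin with AddRoundKey, which is just the XOR above;
         the remaining SubBytes/ShiftRows steps (or their inverses) are
         done out of line by the dirty helper, which receives the
         128-bit value as two 64-bit halves and returns its result
         through the VECRET slot. */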
13167      IRDirty* di
13168         = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
13169                              mkIRExprVec_3(
13170                                 IRExpr_VECRET(),
13171                                 unop(Iop_V128HIto64, mkexpr(xord)),
13172                                 unop(Iop_V128to64, mkexpr(xord)) ) );
13173      stmt(IRStmt_Dirty(di));
13174      putQReg128(dd, mkexpr(res));
13175      DIP("aes%c %s.16b, %s.16b\n", isD ? 'd' : 'e',
13176                                    nameQReg128(dd), nameQReg128(nn));
13177      return True;
13178   }
13179
13180   if (size == BITS2(0,0)
13181       && (opcode == BITS5(0,0,1,1,0) || opcode == BITS5(0,0,1,1,1))) {
13182      /* -------- 00,00110: AESMC  Vd.16b, Vn.16b -------- */
13183      /* -------- 00,00111: AESIMC Vd.16b, Vn.16b -------- */
13184      Bool   isI  = opcode == BITS5(0,0,1,1,1);
13185      IRTemp src  = newTemp(Ity_V128);
13186      IRTemp res  = newTemp(Ity_V128);
13187      void*        helper = isI ? &arm64g_dirtyhelper_AESIMC
13188                                : &arm64g_dirtyhelper_AESMC;
13189      const HChar* hname  = isI ? "arm64g_dirtyhelper_AESIMC"
13190                                : "arm64g_dirtyhelper_AESMC";
13191      assign(src, getQReg128(nn));
13192      IRDirty* di
13193         = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
13194                              mkIRExprVec_3(
13195                                 IRExpr_VECRET(),
13196                                 unop(Iop_V128HIto64, mkexpr(src)),
13197                                 unop(Iop_V128to64, mkexpr(src)) ) );
13198      stmt(IRStmt_Dirty(di));
13199      putQReg128(dd, mkexpr(res));
13200      DIP("aes%s %s.16b, %s.16b\n", isI ? "imc" : "mc",
13201                                    nameQReg128(dd), nameQReg128(nn));
13202      return True;
13203   }
13204
13205   return False;
13206#  undef INSN
13207}
13208
13209
13210static
13211Bool dis_AdvSIMD_crypto_three_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
13212{
13213   /* 31   28   23 21 20 15 14  11 9 4
13214      0101 1110 sz 0  m  0  opc 00 n d
13215      Decode fields are: sz,opc
13216   */
13217#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13218   if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0) || INSN(21,21) != 0
13219       || INSN(15,15) != 0 || INSN(11,10) != BITS2(0,0)) {
13220      return False;
13221   }
13222   UInt sz  = INSN(23,22);
13223   UInt mm  = INSN(20,16);
13224   UInt opc = INSN(14,12);
13225   UInt nn  = INSN(9,5);
13226   UInt dd  = INSN(4,0);
13227   if (sz == BITS2(0,0) && opc <= BITS3(1,1,0)) {
13228      /* -------- 00,000 SHA1C     Qd,    Sn,    Vm.4S -------- */
13229      /* -------- 00,001 SHA1P     Qd,    Sn,    Vm.4S -------- */
13230      /* -------- 00,010 SHA1M     Qd,    Sn,    Vm.4S -------- */
13231      /* -------- 00,011 SHA1SU0   Vd.4S, Vn.4S, Vm.4S -------- */
13232      /* -------- 00,100 SHA256H   Qd,    Qn,    Vm.4S -------- */
13233      /* -------- 00,101 SHA256H2  Qd,    Qn,    Vm.4S -------- */
13234      /* -------- 00,110 SHA256SU1 Vd.4S, Vn.4S, Vm.4S -------- */
13235      vassert(opc < 7);
13236      const HChar* inames[7]
13237         = { "sha1c", "sha1p", "sha1m", "sha1su0",
13238             "sha256h", "sha256h2", "sha256su1" };
13239      void(*helpers[7])(V128*,ULong,ULong,ULong,ULong,ULong,ULong)
13240         = { &arm64g_dirtyhelper_SHA1C,    &arm64g_dirtyhelper_SHA1P,
13241             &arm64g_dirtyhelper_SHA1M,    &arm64g_dirtyhelper_SHA1SU0,
13242             &arm64g_dirtyhelper_SHA256H,  &arm64g_dirtyhelper_SHA256H2,
13243             &arm64g_dirtyhelper_SHA256SU1 };
13244      const HChar* hnames[7]
13245         = { "arm64g_dirtyhelper_SHA1C",    "arm64g_dirtyhelper_SHA1P",
13246             "arm64g_dirtyhelper_SHA1M",    "arm64g_dirtyhelper_SHA1SU0",
13247             "arm64g_dirtyhelper_SHA256H",  "arm64g_dirtyhelper_SHA256H2",
13248             "arm64g_dirtyhelper_SHA256SU1" };
13249      IRTemp vD      = newTemp(Ity_V128);
13250      IRTemp vN      = newTemp(Ity_V128);
13251      IRTemp vM      = newTemp(Ity_V128);
13252      IRTemp vDhi    = newTemp(Ity_I64);
13253      IRTemp vDlo    = newTemp(Ity_I64);
13254      IRTemp vNhiPre = newTemp(Ity_I64);
13255      IRTemp vNloPre = newTemp(Ity_I64);
13256      IRTemp vNhi    = newTemp(Ity_I64);
13257      IRTemp vNlo    = newTemp(Ity_I64);
13258      IRTemp vMhi    = newTemp(Ity_I64);
13259      IRTemp vMlo    = newTemp(Ity_I64);
13260      assign(vD,      getQReg128(dd));
13261      assign(vN,      getQReg128(nn));
13262      assign(vM,      getQReg128(mm));
13263      assign(vDhi,    unop(Iop_V128HIto64, mkexpr(vD)));
13264      assign(vDlo,    unop(Iop_V128to64,   mkexpr(vD)));
13265      assign(vNhiPre, unop(Iop_V128HIto64, mkexpr(vN)));
13266      assign(vNloPre, unop(Iop_V128to64,   mkexpr(vN)));
13267      assign(vMhi,    unop(Iop_V128HIto64, mkexpr(vM)));
13268      assign(vMlo,    unop(Iop_V128to64,   mkexpr(vM)));
13269      /* Mask off any bits of the N register operand that aren't actually
13270         needed, so that Memcheck doesn't complain unnecessarily. */
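      /* SHA1C/SHA1P/SHA1M take only the 32-bit Sn as their middle
         operand, so everything above bit 31 of N is forced to zero;
         the other four insns consume all of Vn.4s. */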
13271      switch (opc) {
13272         case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
13273            assign(vNhi, mkU64(0));
13274            assign(vNlo, unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(vNloPre))));
13275            break;
13276         case BITS3(0,1,1): case BITS3(1,0,0):
13277         case BITS3(1,0,1): case BITS3(1,1,0):
13278            assign(vNhi, mkexpr(vNhiPre));
13279            assign(vNlo, mkexpr(vNloPre));
13280            break;
13281         default:
13282            vassert(0);
13283      }
13284      IRTemp res = newTemp(Ity_V128);
13285      IRDirty* di
13286         = unsafeIRDirty_1_N( res, 0/*regparms*/, hnames[opc], helpers[opc],
13287                              mkIRExprVec_7(
13288                                 IRExpr_VECRET(),
13289                                 mkexpr(vDhi), mkexpr(vDlo), mkexpr(vNhi),
13290                                 mkexpr(vNlo), mkexpr(vMhi), mkexpr(vMlo)));
13291      stmt(IRStmt_Dirty(di));
13292      putQReg128(dd, mkexpr(res));
13293      switch (opc) {
13294         case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
13295            DIP("%s q%u, s%u, v%u.4s\n", inames[opc], dd, nn, mm);
13296            break;
13297         case BITS3(0,1,1): case BITS3(1,1,0):
13298            DIP("%s v%u.4s, v%u.4s, v%u.4s\n", inames[opc], dd, nn, mm);
13299            break;
13300         case BITS3(1,0,0): case BITS3(1,0,1):
13301            DIP("%s q%u, q%u, v%u.4s\n", inames[opc], dd, nn, mm);
13302            break;
13303         default:
13304            vassert(0);
13305      }
13306      return True;
13307   }
13308
13309   return False;
13310#  undef INSN
13311}
13312
13313
13314static
13315Bool dis_AdvSIMD_crypto_two_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
13316{
13317   /* 31   28   23 21    16  11 9 4
13318      0101 1110 sz 10100 opc 10 n d
13319      Decode fields are: sz,opc
13320   */
13321#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13322   if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0)
13323       || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
13324      return False;
13325   }
13326   UInt sz  = INSN(23,22);
13327   UInt opc = INSN(16,12);
13328   UInt nn  = INSN(9,5);
13329   UInt dd  = INSN(4,0);
13330   if (sz == BITS2(0,0) && opc <= BITS5(0,0,0,1,0)) {
13331      /* -------- 00,00000 SHA1H     Sd,    Sn    -------- */
13332      /* -------- 00,00001 SHA1SU1   Vd.4S, Vn.4S -------- */
13333      /* -------- 00,00010 SHA256SU0 Vd.4S, Vn.4S -------- */
13334      vassert(opc < 3);
13335      const HChar* inames[3] = { "sha1h", "sha1su1", "sha256su0" };
13336      IRTemp vD   = newTemp(Ity_V128);
13337      IRTemp vN   = newTemp(Ity_V128);
13338      IRTemp vDhi = newTemp(Ity_I64);
13339      IRTemp vDlo = newTemp(Ity_I64);
13340      IRTemp vNhi = newTemp(Ity_I64);
13341      IRTemp vNlo = newTemp(Ity_I64);
13342      assign(vD,   getQReg128(dd));
13343      assign(vN,   getQReg128(nn));
13344      assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
13345      assign(vDlo, unop(Iop_V128to64,   mkexpr(vD)));
13346      assign(vNhi, unop(Iop_V128HIto64, mkexpr(vN)));
13347      assign(vNlo, unop(Iop_V128to64,   mkexpr(vN)));
13348      /* Mask off any bits of the N register operand that aren't actually
13349         needed, so that Memcheck doesn't complain unnecessarily.  Also
13350         construct the calls, given that the helper functions don't take
13351         the same number of arguments. */
13352      IRDirty* di  = NULL;
13353      IRTemp   res = newTemp(Ity_V128);
13354      switch (opc) {
13355         case BITS5(0,0,0,0,0): {
13356            IRExpr* vNloMasked = unop(Iop_32Uto64,
13357                                      unop(Iop_64to32, mkexpr(vNlo)));
13358            di = unsafeIRDirty_1_N( res, 0/*regparms*/,
13359                                    "arm64g_dirtyhelper_SHA1H",
13360                                    &arm64g_dirtyhelper_SHA1H,
13361                                    mkIRExprVec_3(
13362                                       IRExpr_VECRET(),
13363                                       mkU64(0), vNloMasked) );
13364            break;
13365         }
13366         case BITS5(0,0,0,0,1):
13367            di = unsafeIRDirty_1_N( res, 0/*regparms*/,
13368                                    "arm64g_dirtyhelper_SHA1SU1",
13369                                    &arm64g_dirtyhelper_SHA1SU1,
13370                                    mkIRExprVec_5(
13371                                       IRExpr_VECRET(),
13372                                       mkexpr(vDhi), mkexpr(vDlo),
13373                                       mkexpr(vNhi), mkexpr(vNlo)) );
13374            break;
13375         case BITS5(0,0,0,1,0):
13376            di = unsafeIRDirty_1_N( res, 0/*regparms*/,
13377                                    "arm64g_dirtyhelper_SHA256SU0",
13378                                    &arm64g_dirtyhelper_SHA256SU0,
13379                                    mkIRExprVec_5(
13380                                       IRExpr_VECRET(),
13381                                       mkexpr(vDhi), mkexpr(vDlo),
13382                                       mkexpr(vNhi), mkexpr(vNlo)) );
13383            break;
13384         default:
13385            vassert(0);
13386      }
13387      stmt(IRStmt_Dirty(di));
13388      putQReg128(dd, mkexpr(res));
13389      switch (opc) {
13390         case BITS5(0,0,0,0,0):
13391            DIP("%s s%u, s%u\n", inames[opc], dd, nn);
13392            break;
13393         case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,0):
13394            DIP("%s v%u.4s, v%u.4s\n", inames[opc], dd, nn);
13395            break;
13396         default:
13397            vassert(0);
13398      }
13399      return True;
13400   }
13401
13402   return False;
13403#  undef INSN
13404}
13405
13406
13407static
13408Bool dis_AdvSIMD_fp_compare(/*MB_OUT*/DisResult* dres, UInt insn)
13409{
13410   /* 31  28    23 21 20 15 13   9 4
13411      000 11110 ty 1  m  op 1000 n opcode2
13412      The first 3 bits are really "M 0 S", but M and S are always zero.
13413      Decode fields are: ty,op,opcode2
13414   */
13415#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13416   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13417       || INSN(21,21) != 1 || INSN(13,10) != BITS4(1,0,0,0)) {
13418      return False;
13419   }
13420   UInt ty      = INSN(23,22);
13421   UInt mm      = INSN(20,16);
13422   UInt op      = INSN(15,14);
13423   UInt nn      = INSN(9,5);
13424   UInt opcode2 = INSN(4,0);
13425   vassert(ty < 4);
13426
13427   if (ty <= X01 && op == X00
13428       && (opcode2 & BITS5(0,0,1,1,1)) == BITS5(0,0,0,0,0)) {
13429      /* -------- 0x,00,00000 FCMP  d_d,   s_s -------- */
13430      /* -------- 0x,00,01000 FCMP  d_#0, s_#0 -------- */
13431      /* -------- 0x,00,10000 FCMPE d_d,   s_s -------- */
13432      /* -------- 0x,00,11000 FCMPE d_#0, s_#0 -------- */
13433      /* 31        23   20    15      9 4
13434         000 11110 01 1     m 00 1000 n 10 000  FCMPE Dn, Dm
13435         000 11110 01 1 00000 00 1000 n 11 000  FCMPE Dn, #0.0
13436         000 11110 01 1     m 00 1000 n 00 000  FCMP  Dn, Dm
13437         000 11110 01 1 00000 00 1000 n 01 000  FCMP  Dn, #0.0
13438
13439         000 11110 00 1     m 00 1000 n 10 000  FCMPE Sn, Sm
13440         000 11110 00 1 00000 00 1000 n 11 000  FCMPE Sn, #0.0
13441         000 11110 00 1     m 00 1000 n 00 000  FCMP  Sn, Sm
13442         000 11110 00 1 00000 00 1000 n 01 000  FCMP  Sn, #0.0
13443
13444         FCMPE generates Invalid Operation exn if either arg is any kind
13445         of NaN.  FCMP generates Invalid Operation exn if either arg is a
13446         signalling NaN.  We ignore this detail here and produce the same
13447         IR for both.
13448      */
13449      Bool   isD     = (ty & 1) == 1;
13450      Bool   isCMPE  = (opcode2 & 16) == 16;
13451      Bool   cmpZero = (opcode2 & 8) == 8;
13452      IRType ity     = isD ? Ity_F64 : Ity_F32;
13453      Bool   valid   = True;
13454      if (cmpZero && mm != 0) valid = False;
13455      if (valid) {
13456         IRTemp argL  = newTemp(ity);
13457         IRTemp argR  = newTemp(ity);
13458         IRTemp irRes = newTemp(Ity_I32);
13459         assign(argL, getQRegLO(nn, ity));
13460         assign(argR,
13461                cmpZero
13462                   ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0)))
13463                   : getQRegLO(mm, ity));
13464         assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
13465                             mkexpr(argL), mkexpr(argR)));
13466         IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes);
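         /* setFlags_COPY expects the new N,Z,C,V value in bits 31..28
            of its argument, hence the shift left by 28. */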
13467         IRTemp nzcv_28x0 = newTemp(Ity_I64);
13468         assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28)));
13469         setFlags_COPY(nzcv_28x0);
13470         DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ity),
13471             cmpZero ? "#0.0" : nameQRegLO(mm, ity));
13472         return True;
13473      }
13474      return False;
13475   }
13476
13477   return False;
13478#  undef INSN
13479}
13480
13481
13482static
13483Bool dis_AdvSIMD_fp_conditional_compare(/*MB_OUT*/DisResult* dres, UInt insn)
13484{
13485   /* 31  28    23 21 20 15   11 9 4  3
13486      000 11110 ty 1  m  cond 01 n op nzcv
13487      The first 3 bits are really "M 0 S", but M and S are always zero.
13488      Decode fields are: ty,op
13489   */
13490#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13491   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13492       || INSN(21,21) != 1 || INSN(11,10) != BITS2(0,1)) {
13493      return False;
13494   }
13495   UInt ty   = INSN(23,22);
13496   UInt mm   = INSN(20,16);
13497   UInt cond = INSN(15,12);
13498   UInt nn   = INSN(9,5);
13499   UInt op   = INSN(4,4);
13500   UInt nzcv = INSN(3,0);
13501   vassert(ty < 4 && op <= 1);
13502
13503   if (ty <= BITS2(0,1)) {
13504      /* -------- 00,0 FCCMP  s_s -------- */
13505      /* -------- 00,1 FCCMPE s_s -------- */
13506      /* -------- 01,0 FCCMP  d_d -------- */
13507      /* -------- 01,1 FCCMPE d_d -------- */
13508
13509      /* FCCMPE generates Invalid Operation exn if either arg is any kind
13510         of NaN.  FCCMP generates Invalid Operation exn if either arg is a
13511         signalling NaN.  We ignore this detail here and produce the same
13512         IR for both.
13513      */
13514      Bool   isD    = (ty & 1) == 1;
13515      Bool   isCMPE = op == 1;
13516      IRType ity    = isD ? Ity_F64 : Ity_F32;
13517      IRTemp argL   = newTemp(ity);
13518      IRTemp argR   = newTemp(ity);
13519      IRTemp irRes  = newTemp(Ity_I32);
13520      assign(argL,  getQRegLO(nn, ity));
13521      assign(argR,  getQRegLO(mm, ity));
13522      assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
13523                          mkexpr(argL), mkexpr(argR)));
13524      IRTemp condT = newTemp(Ity_I1);
13525      assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
13526      IRTemp nzcvT = mk_convert_IRCmpF64Result_to_NZCV(irRes);
13527
13528      IRTemp nzcvT_28x0 = newTemp(Ity_I64);
13529      assign(nzcvT_28x0, binop(Iop_Shl64, mkexpr(nzcvT), mkU8(28)));
13530
13531      IRExpr* nzcvF_28x0 = mkU64(((ULong)nzcv) << 28);
13532
13533      IRTemp nzcv_28x0 = newTemp(Ity_I64);
13534      assign(nzcv_28x0, IRExpr_ITE(mkexpr(condT),
13535                                   mkexpr(nzcvT_28x0), nzcvF_28x0));
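      /* A sketch of the selection: with, say, nzcv == 0x4 (just Z set)
         and a failing condition, the flags become 0x4 << 28; a passing
         condition instead installs the NZCV computed from the compare. */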
13536      setFlags_COPY(nzcv_28x0);
13537      DIP("fccmp%s %s, %s, #%u, %s\n", isCMPE ? "e" : "",
13538          nameQRegLO(nn, ity), nameQRegLO(mm, ity), nzcv, nameCC(cond));
13539      return True;
13540   }
13541
13542   return False;
13543#  undef INSN
13544}
13545
13546
13547static
13548Bool dis_AdvSIMD_fp_conditional_select(/*MB_OUT*/DisResult* dres, UInt insn)
13549{
13550   /* 31        23 21 20 15   11 9 5
13551      000 11110 ty 1  m  cond 11 n d
13552      The first 3 bits are really "M 0 S", but M and S are always zero.
13553      Decode fields: ty
13554   */
13555#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13556   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0) || INSN(21,21) != 1
13557       || INSN(11,10) != BITS2(1,1)) {
13558      return False;
13559   }
13560   UInt ty   = INSN(23,22);
13561   UInt mm   = INSN(20,16);
13562   UInt cond = INSN(15,12);
13563   UInt nn   = INSN(9,5);
13564   UInt dd   = INSN(4,0);
13565   if (ty <= X01) {
13566      /* -------- 00: FCSEL s_s -------- */
      /* -------- 01: FCSEL d_d -------- */
13568      IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
13569      IRTemp srcT = newTemp(ity);
13570      IRTemp srcF = newTemp(ity);
13571      IRTemp res  = newTemp(ity);
13572      assign(srcT, getQRegLO(nn, ity));
13573      assign(srcF, getQRegLO(mm, ity));
13574      assign(res, IRExpr_ITE(
13575                     unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
13576                     mkexpr(srcT), mkexpr(srcF)));
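      /* For example: "fcsel d0, d1, d2, eq" yields d1 if Z == 1 at
         this point, else d2; the upper V-register lanes are zeroed
         just below. */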
13577      putQReg128(dd, mkV128(0x0000));
13578      putQRegLO(dd, mkexpr(res));
13579      DIP("fcsel %s, %s, %s, %s\n",
13580          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity),
13581          nameCC(cond));
13582      return True;
13583   }
13584   return False;
13585#  undef INSN
13586}
13587
13588
13589static
13590Bool dis_AdvSIMD_fp_data_proc_1_source(/*MB_OUT*/DisResult* dres, UInt insn)
13591{
13592   /* 31  28    23 21 20     14    9 4
13593      000 11110 ty 1  opcode 10000 n d
13594      The first 3 bits are really "M 0 S", but M and S are always zero.
13595      Decode fields: ty,opcode
13596   */
13597#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13598   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13599       || INSN(21,21) != 1 || INSN(14,10) != BITS5(1,0,0,0,0)) {
13600      return False;
13601   }
13602   UInt ty     = INSN(23,22);
13603   UInt opcode = INSN(20,15);
13604   UInt nn     = INSN(9,5);
13605   UInt dd     = INSN(4,0);
13606
13607   if (ty <= X01 && opcode <= BITS6(0,0,0,0,1,1)) {
13608      /* -------- 0x,000000: FMOV  d_d, s_s -------- */
13609      /* -------- 0x,000001: FABS  d_d, s_s -------- */
13610      /* -------- 0x,000010: FNEG  d_d, s_s -------- */
13611      /* -------- 0x,000011: FSQRT d_d, s_s -------- */
13612      IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
13613      IRTemp src = newTemp(ity);
13614      IRTemp res = newTemp(ity);
13615      const HChar* nm = "??";
13616      assign(src, getQRegLO(nn, ity));
13617      switch (opcode) {
13618         case BITS6(0,0,0,0,0,0):
13619            nm = "fmov"; assign(res, mkexpr(src)); break;
13620         case BITS6(0,0,0,0,0,1):
13621            nm = "fabs"; assign(res, unop(mkABSF(ity), mkexpr(src))); break;
13622         case BITS6(0,0,0,0,1,0):
            nm = "fneg"; assign(res, unop(mkNEGF(ity), mkexpr(src))); break;
13624         case BITS6(0,0,0,0,1,1):
13625            nm = "fsqrt";
13626            assign(res, binop(mkSQRTF(ity),
13627                              mkexpr(mk_get_IR_rounding_mode()),
13628                              mkexpr(src))); break;
13629         default:
13630            vassert(0);
13631      }
13632      putQReg128(dd, mkV128(0x0000));
13633      putQRegLO(dd, mkexpr(res));
13634      DIP("%s %s, %s\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
13635      return True;
13636   }
13637
13638   if (   (ty == X11 && (opcode == BITS6(0,0,0,1,0,0)
13639                         || opcode == BITS6(0,0,0,1,0,1)))
13640       || (ty == X00 && (opcode == BITS6(0,0,0,1,1,1)
13641                         || opcode == BITS6(0,0,0,1,0,1)))
13642       || (ty == X01 && (opcode == BITS6(0,0,0,1,1,1)
13643                         || opcode == BITS6(0,0,0,1,0,0)))) {
13644      /* -------- 11,000100: FCVT s_h -------- */
13645      /* -------- 11,000101: FCVT d_h -------- */
13646      /* -------- 00,000111: FCVT h_s -------- */
13647      /* -------- 00,000101: FCVT d_s -------- */
13648      /* -------- 01,000111: FCVT h_d -------- */
13649      /* -------- 01,000100: FCVT s_d -------- */
13650      /* 31        23 21    16 14    9 4
13651         000 11110 11 10001 00 10000 n d   FCVT Sd, Hn
13652         --------- 11 ----- 01 ---------   FCVT Dd, Hn
13653         --------- 00 ----- 11 ---------   FCVT Hd, Sn
13654         --------- 00 ----- 01 ---------   FCVT Dd, Sn
13655         --------- 01 ----- 11 ---------   FCVT Hd, Dn
13656         --------- 01 ----- 00 ---------   FCVT Sd, Dn
13657         Rounding, when dst is smaller than src, is per the FPCR.
13658      */
13659      UInt b2322 = ty;
13660      UInt b1615 = opcode & BITS2(1,1);
13661      switch ((b2322 << 2) | b1615) {
13662         case BITS4(0,0,0,1):   // S -> D
13663         case BITS4(1,1,0,1): { // H -> D
13664            Bool   srcIsH = b2322 == BITS2(1,1);
13665            IRType srcTy  = srcIsH ? Ity_F16 : Ity_F32;
13666            IRTemp res    = newTemp(Ity_F64);
13667            assign(res, unop(srcIsH ? Iop_F16toF64 : Iop_F32toF64,
13668                             getQRegLO(nn, srcTy)));
13669            putQReg128(dd, mkV128(0x0000));
13670            putQRegLO(dd, mkexpr(res));
13671            DIP("fcvt %s, %s\n",
13672                nameQRegLO(dd, Ity_F64), nameQRegLO(nn, srcTy));
13673            return True;
13674         }
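         /* Note: the widening conversions (S->D, H->D, H->S) are
            exact, so they are unops with no rounding mode; the
            narrowing ones take a rounding mode from the FPCR. */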
13675         case BITS4(0,1,0,0):   // D -> S
13676         case BITS4(0,1,1,1): { // D -> H
13677            Bool   dstIsH = b1615 == BITS2(1,1);
13678            IRType dstTy  = dstIsH ? Ity_F16 : Ity_F32;
13679            IRTemp res    = newTemp(dstTy);
13680            assign(res, binop(dstIsH ? Iop_F64toF16 : Iop_F64toF32,
13681                              mkexpr(mk_get_IR_rounding_mode()),
13682                              getQRegLO(nn, Ity_F64)));
13683            putQReg128(dd, mkV128(0x0000));
13684            putQRegLO(dd, mkexpr(res));
13685            DIP("fcvt %s, %s\n",
13686                nameQRegLO(dd, dstTy), nameQRegLO(nn, Ity_F64));
13687            return True;
13688         }
13689         case BITS4(0,0,1,1):   // S -> H
13690         case BITS4(1,1,0,0): { // H -> S
13691            Bool   toH   = b1615 == BITS2(1,1);
13692            IRType srcTy = toH ? Ity_F32 : Ity_F16;
13693            IRType dstTy = toH ? Ity_F16 : Ity_F32;
13694            IRTemp res = newTemp(dstTy);
13695            if (toH) {
13696               assign(res, binop(Iop_F32toF16,
13697                                 mkexpr(mk_get_IR_rounding_mode()),
13698                                 getQRegLO(nn, srcTy)));
13699
13700            } else {
13701               assign(res, unop(Iop_F16toF32,
13702                                getQRegLO(nn, srcTy)));
13703            }
13704            putQReg128(dd, mkV128(0x0000));
13705            putQRegLO(dd, mkexpr(res));
13706            DIP("fcvt %s, %s\n",
13707                nameQRegLO(dd, dstTy), nameQRegLO(nn, srcTy));
13708            return True;
13709         }
13710         default:
13711            break;
13712      }
13713      /* else unhandled */
13714      return False;
13715   }
13716
13717   if (ty <= X01
13718       && opcode >= BITS6(0,0,1,0,0,0) && opcode <= BITS6(0,0,1,1,1,1)
13719       && opcode != BITS6(0,0,1,1,0,1)) {
13720      /* -------- 0x,001000 FRINTN d_d, s_s -------- */
13721      /* -------- 0x,001001 FRINTP d_d, s_s -------- */
13722      /* -------- 0x,001010 FRINTM d_d, s_s -------- */
13723      /* -------- 0x,001011 FRINTZ d_d, s_s -------- */
13724      /* -------- 0x,001100 FRINTA d_d, s_s -------- */
13725      /* -------- 0x,001110 FRINTX d_d, s_s -------- */
13726      /* -------- 0x,001111 FRINTI d_d, s_s -------- */
13727      /* 31        23 21   17  14    9 4
         000 11110 0x 1001 111 10000 n d  FRINTI Fd, Fn (round per FPCR)
13729                           rm
13730         x==0 => S-registers, x==1 => D-registers
13731         rm (17:15) encodings:
13732            111 per FPCR  (FRINTI)
13733            001 +inf      (FRINTP)
13734            010 -inf      (FRINTM)
13735            011 zero      (FRINTZ)
13736            000 tieeven   (FRINTN) -- !! FIXME KLUDGED !!
13737            100 tieaway   (FRINTA) -- !! FIXME KLUDGED !!
13738            110 per FPCR + "exact = TRUE" (FRINTX)
13739            101 unallocated
13740      */
13741      Bool    isD   = (ty & 1) == 1;
13742      UInt    rm    = opcode & BITS6(0,0,0,1,1,1);
13743      IRType  ity   = isD ? Ity_F64 : Ity_F32;
13744      IRExpr* irrmE = NULL;
13745      UChar   ch    = '?';
13746      switch (rm) {
13747         case BITS3(0,1,1): ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
13748         case BITS3(0,1,0): ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
13749         case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
13750         // The following is a kludge.  Should be: Irrm_NEAREST_TIE_AWAY_0
13751         case BITS3(1,0,0): ch = 'a'; irrmE = mkU32(Irrm_NEAREST); break;
13752         // I am unsure about the following, due to the "integral exact"
13753         // description in the manual.  What does it mean? (frintx, that is)
13754         case BITS3(1,1,0):
13755            ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
13756         case BITS3(1,1,1):
13757            ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
13758         // The following is a kludge.  There's no Irrm_ value to represent
13759         // this ("to nearest, with ties to even")
13760         case BITS3(0,0,0): ch = 'n'; irrmE = mkU32(Irrm_NEAREST); break;
13761         default: break;
13762      }
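      /* Worked examples (sketches): FRINTZ rounds 2.7 to 2.0, and
         FRINTM rounds -2.3 to -3.0.  FRINTN/FRINTA on a tie such as
         2.5 are only approximated by Irrm_NEAREST, per the kludge
         noted above. */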
13763      if (irrmE) {
13764         IRTemp src = newTemp(ity);
13765         IRTemp dst = newTemp(ity);
13766         assign(src, getQRegLO(nn, ity));
13767         assign(dst, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
13768                           irrmE, mkexpr(src)));
13769         putQReg128(dd, mkV128(0x0000));
13770         putQRegLO(dd, mkexpr(dst));
13771         DIP("frint%c %s, %s\n",
13772             ch, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
13773         return True;
13774      }
13775      return False;
13776   }
13777
13778   return False;
13779#  undef INSN
13780}
13781
13782
13783static
13784Bool dis_AdvSIMD_fp_data_proc_2_source(/*MB_OUT*/DisResult* dres, UInt insn)
13785{
13786   /* 31  28    23 21 20 15     11 9 4
13787      000 11110 ty 1  m  opcode 10 n d
13788      The first 3 bits are really "M 0 S", but M and S are always zero.
13789      Decode fields: ty, opcode
13790   */
13791#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13792   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13793       || INSN(21,21) != 1 || INSN(11,10) != BITS2(1,0)) {
13794      return False;
13795   }
13796   UInt ty     = INSN(23,22);
13797   UInt mm     = INSN(20,16);
13798   UInt opcode = INSN(15,12);
13799   UInt nn     = INSN(9,5);
13800   UInt dd     = INSN(4,0);
13801
13802   if (ty <= X01 && opcode <= BITS4(0,1,1,1)) {
13803      /* ------- 0x,0000: FMUL d_d, s_s ------- */
13804      /* ------- 0x,0001: FDIV d_d, s_s ------- */
13805      /* ------- 0x,0010: FADD d_d, s_s ------- */
13806      /* ------- 0x,0011: FSUB d_d, s_s ------- */
13807      /* ------- 0x,0100: FMAX d_d, s_s ------- */
13808      /* ------- 0x,0101: FMIN d_d, s_s ------- */
13809      /* ------- 0x,0110: FMAXNM d_d, s_s ------- (FIXME KLUDGED) */
13810      /* ------- 0x,0111: FMINNM d_d, s_s ------- (FIXME KLUDGED) */
13811      IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
13812      IROp   iop = Iop_INVALID;
13813      const HChar* nm = "???";
13814      switch (opcode) {
13815         case BITS4(0,0,0,0): nm = "fmul"; iop = mkMULF(ity); break;
13816         case BITS4(0,0,0,1): nm = "fdiv"; iop = mkDIVF(ity); break;
13817         case BITS4(0,0,1,0): nm = "fadd"; iop = mkADDF(ity); break;
13818         case BITS4(0,0,1,1): nm = "fsub"; iop = mkSUBF(ity); break;
13819         case BITS4(0,1,0,0): nm = "fmax"; iop = mkVecMAXF(ty+2); break;
13820         case BITS4(0,1,0,1): nm = "fmin"; iop = mkVecMINF(ty+2); break;
13821         case BITS4(0,1,1,0): nm = "fmaxnm"; iop = mkVecMAXF(ty+2); break; //!!
13822         case BITS4(0,1,1,1): nm = "fminnm"; iop = mkVecMINF(ty+2); break; //!!
13823         default: vassert(0);
13824      }
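      /* Note: |ty|+2 maps the 00 (S) and 01 (D) cases to the 32- and
         64-bit lane-size codes expected by the mkVec* helpers. */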
13825      if (opcode <= BITS4(0,0,1,1)) {
13826         // This is really not good code.  TODO: avoid width-changing
13827         IRTemp res = newTemp(ity);
13828         assign(res, triop(iop, mkexpr(mk_get_IR_rounding_mode()),
13829                                getQRegLO(nn, ity), getQRegLO(mm, ity)));
13830         putQReg128(dd, mkV128(0));
13831         putQRegLO(dd, mkexpr(res));
13832      } else {
13833         putQReg128(dd, unop(mkVecZEROHIxxOFV128(ty+2),
13834                             binop(iop, getQReg128(nn), getQReg128(mm))));
13835      }
13836      DIP("%s %s, %s, %s\n",
13837          nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
13838      return True;
13839   }
13840
13841   if (ty <= X01 && opcode == BITS4(1,0,0,0)) {
13842      /* ------- 0x,1000: FNMUL d_d, s_s ------- */
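      /* FNMUL computes -(n * m): a single rounding from the multiply
         followed by an exact IEEE sign flip, which is believed to
         match the architected FPNeg(FPMul(n, m)) behaviour. */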
13843      IRType ity  = ty == X00 ? Ity_F32 : Ity_F64;
13844      IROp   iop  = mkMULF(ity);
13845      IROp   iopn = mkNEGF(ity);
13846      const HChar* nm = "fnmul";
13847      IRExpr* resE = unop(iopn,
13848                          triop(iop, mkexpr(mk_get_IR_rounding_mode()),
13849                                getQRegLO(nn, ity), getQRegLO(mm, ity)));
13850      IRTemp  res  = newTemp(ity);
13851      assign(res, resE);
13852      putQReg128(dd, mkV128(0));
13853      putQRegLO(dd, mkexpr(res));
13854      DIP("%s %s, %s, %s\n",
13855          nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
13856      return True;
13857   }
13858
13859   return False;
13860#  undef INSN
13861}
13862
13863
13864static
13865Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
13866{
13867   /* 31  28    23 21 20 15 14 9 4
13868      000 11111 ty o1 m  o0 a  n d
13869      The first 3 bits are really "M 0 S", but M and S are always zero.
13870      Decode fields: ty,o1,o0
13871   */
13872#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13873   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,1)) {
13874      return False;
13875   }
13876   UInt ty    = INSN(23,22);
13877   UInt bitO1 = INSN(21,21);
13878   UInt mm    = INSN(20,16);
13879   UInt bitO0 = INSN(15,15);
13880   UInt aa    = INSN(14,10);
13881   UInt nn    = INSN(9,5);
13882   UInt dd    = INSN(4,0);
13883   vassert(ty < 4);
13884
13885   if (ty <= X01) {
13886      /* -------- 0x,0,0 FMADD  d_d_d_d, s_s_s_s -------- */
13887      /* -------- 0x,0,1 FMSUB  d_d_d_d, s_s_s_s -------- */
13888      /* -------- 0x,1,0 FNMADD d_d_d_d, s_s_s_s -------- */
13889      /* -------- 0x,1,1 FNMSUB d_d_d_d, s_s_s_s -------- */
13890      /* -------------------- F{N}M{ADD,SUB} -------------------- */
13891      /* 31          22   20 15 14 9 4   ix
13892         000 11111 0 sz 0 m  0  a  n d   0   FMADD  Fd,Fn,Fm,Fa
13893         000 11111 0 sz 0 m  1  a  n d   1   FMSUB  Fd,Fn,Fm,Fa
13894         000 11111 0 sz 1 m  0  a  n d   2   FNMADD Fd,Fn,Fm,Fa
13895         000 11111 0 sz 1 m  1  a  n d   3   FNMSUB Fd,Fn,Fm,Fa
13896         where Fx=Dx when sz=1, Fx=Sx when sz=0
13897
13898                  -----SPEC------    ----IMPL----
13899         fmadd       a +    n * m    a + n * m
13900         fmsub       a + (-n) * m    a - n * m
13901         fnmadd   (-a) + (-n) * m    -(a + n * m)
13902         fnmsub   (-a) +    n * m    -(a - n * m)
13903      */
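      /* Worked example (per the IMPL column above): FNMADD has
         bitO1 == 1 and bitO0 == 0, so ix == 2 and the result computed
         is -(a + n * m). */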
13904      Bool    isD   = (ty & 1) == 1;
13905      UInt    ix    = (bitO1 << 1) | bitO0;
13906      IRType  ity   = isD ? Ity_F64 : Ity_F32;
13907      IROp    opADD = mkADDF(ity);
13908      IROp    opSUB = mkSUBF(ity);
13909      IROp    opMUL = mkMULF(ity);
13910      IROp    opNEG = mkNEGF(ity);
13911      IRTemp  res   = newTemp(ity);
13912      IRExpr* eA    = getQRegLO(aa, ity);
13913      IRExpr* eN    = getQRegLO(nn, ity);
13914      IRExpr* eM    = getQRegLO(mm, ity);
13915      IRExpr* rm    = mkexpr(mk_get_IR_rounding_mode());
13916      IRExpr* eNxM  = triop(opMUL, rm, eN, eM);
13917      switch (ix) {
13918         case 0:  assign(res, triop(opADD, rm, eA, eNxM)); break;
13919         case 1:  assign(res, triop(opSUB, rm, eA, eNxM)); break;
13920         case 2:  assign(res, unop(opNEG, triop(opADD, rm, eA, eNxM))); break;
13921         case 3:  assign(res, unop(opNEG, triop(opSUB, rm, eA, eNxM))); break;
13922         default: vassert(0);
13923      }
13924      putQReg128(dd, mkV128(0x0000));
13925      putQRegLO(dd, mkexpr(res));
13926      const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" };
13927      DIP("%s %s, %s, %s, %s\n",
13928          names[ix], nameQRegLO(dd, ity), nameQRegLO(nn, ity),
13929                     nameQRegLO(mm, ity), nameQRegLO(aa, ity));
13930      return True;
13931   }
13932
13933   return False;
13934#  undef INSN
13935}
13936
13937
13938static
13939Bool dis_AdvSIMD_fp_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
13940{
13941   /* 31  28    23 21 20   12  9    4
13942      000 11110 ty 1  imm8 100 imm5 d
13943      The first 3 bits are really "M 0 S", but M and S are always zero.
13944   */
13945#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13946   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13947       || INSN(21,21) != 1 || INSN(12,10) != BITS3(1,0,0)) {
13948      return False;
13949   }
13950   UInt ty     = INSN(23,22);
13951   UInt imm8   = INSN(20,13);
13952   UInt imm5   = INSN(9,5);
13953   UInt dd     = INSN(4,0);
13954
13955   /* ------- 00,00000: FMOV s_imm ------- */
13956   /* ------- 01,00000: FMOV d_imm ------- */
13957   if (ty <= X01 && imm5 == BITS5(0,0,0,0,0)) {
13958      Bool  isD  = (ty & 1) == 1;
13959      ULong imm  = VFPExpandImm(imm8, isD ? 64 : 32);
13960      if (!isD) {
13961         vassert(0 == (imm & 0xFFFFFFFF00000000ULL));
13962      }
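      /* Example (a sketch): imm8 == 0x70 expands, in the single
         precision case, to 0x3F800000, i.e. 1.0, so the insn decodes
         as "fmov Sd, #1.0". */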
13963      putQReg128(dd, mkV128(0));
13964      putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL));
13965      DIP("fmov %s, #0x%llx\n",
13966          nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm);
13967      return True;
13968   }
13969
13970   return False;
13971#  undef INSN
13972}
13973
13974
13975static
13976Bool dis_AdvSIMD_fp_to_from_fixedp_conv(/*MB_OUT*/DisResult* dres, UInt insn)
13977{
13979   /* 31 30 29 28    23   21 20    18     15    9 4
13980      sf  0  0 11110 type 0  rmode opcode scale n d
13981      The first 3 bits are really "sf 0 S", but S is always zero.
13982      Decode fields: sf,type,rmode,opcode
13983   */
13984#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
13985   if (INSN(30,29) != BITS2(0,0)
13986       || INSN(28,24) != BITS5(1,1,1,1,0)
13987       || INSN(21,21) != 0) {
13988      return False;
13989   }
13990   UInt bitSF = INSN(31,31);
13991   UInt ty    = INSN(23,22); // type
13992   UInt rm    = INSN(20,19); // rmode
13993   UInt op    = INSN(18,16); // opcode
13994   UInt sc    = INSN(15,10); // scale
13995   UInt nn    = INSN(9,5);
13996   UInt dd    = INSN(4,0);
13997
13998   if (ty <= X01 && rm == X11
13999       && (op == BITS3(0,0,0) || op == BITS3(0,0,1))) {
14000      /* -------- (ix) sf ty rm opc -------- */
14001      /* -------- 0    0  00 11 000: FCVTZS w_s_#fbits -------- */
14002      /* -------- 1    0  01 11 000: FCVTZS w_d_#fbits -------- */
14003      /* -------- 2    1  00 11 000: FCVTZS x_s_#fbits -------- */
14004      /* -------- 3    1  01 11 000: FCVTZS x_d_#fbits -------- */
14005
14006      /* -------- 4    0  00 11 001: FCVTZU w_s_#fbits -------- */
14007      /* -------- 5    0  01 11 001: FCVTZU w_d_#fbits -------- */
14008      /* -------- 6    1  00 11 001: FCVTZU x_s_#fbits -------- */
14009      /* -------- 7    1  01 11 001: FCVTZU x_d_#fbits -------- */
14010      Bool isI64 = bitSF == 1;
14011      Bool isF64 = (ty & 1) == 1;
14012      Bool isU   = (op & 1) == 1;
14013      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14014
14015      Int fbits = 64 - sc;
14016      vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
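      /* A sketch of the scaling: FCVTZS Wd, Sn, #4 has sc == 60 and
         fbits == 4; the value is multiplied by 2.0^4 before the
         truncating conversion, so Sn == 2.5 gives Wd == 40 (0x28). */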
14017
14018      Double  scale  = two_to_the_plus(fbits);
14019      IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
14020                             : IRExpr_Const(IRConst_F32( (Float)scale ));
14021      IROp    opMUL  = isF64 ? Iop_MulF64 : Iop_MulF32;
14022
14023      const IROp ops[8]
14024        = { Iop_F32toI32S, Iop_F64toI32S, Iop_F32toI64S, Iop_F64toI64S,
14025            Iop_F32toI32U, Iop_F64toI32U, Iop_F32toI64U, Iop_F64toI64U };
14026      IRTemp irrm = newTemp(Ity_I32);
14027      assign(irrm, mkU32(Irrm_ZERO));
14028
14029      IRExpr* src = getQRegLO(nn, isF64 ? Ity_F64 : Ity_F32);
14030      IRExpr* res = binop(ops[ix], mkexpr(irrm),
14031                                   triop(opMUL, mkexpr(irrm), src, scaleE));
14032      putIRegOrZR(isI64, dd, res);
14033
14034      DIP("fcvtz%c %s, %s, #%d\n",
14035          isU ? 'u' : 's', nameIRegOrZR(isI64, dd),
14036          nameQRegLO(nn, isF64 ? Ity_F64 : Ity_F32), fbits);
14037      return True;
14038   }
14039
14040   /* ------ sf,ty,rm,opc ------ */
14041   /* ------ x,0x,00,010  SCVTF s/d, w/x, #fbits  ------ */
14042   /* ------ x,0x,00,011  UCVTF s/d, w/x, #fbits  ------ */
14043   /* (ix) sf  S 28    ty   rm opc 15    9 4
14044      0    0 0 0 11110 00 0 00 010 scale n d  SCVTF Sd, Wn, #fbits
14045      1    0 0 0 11110 01 0 00 010 scale n d  SCVTF Dd, Wn, #fbits
14046      2    1 0 0 11110 00 0 00 010 scale n d  SCVTF Sd, Xn, #fbits
14047      3    1 0 0 11110 01 0 00 010 scale n d  SCVTF Dd, Xn, #fbits
14048
14049      4    0 0 0 11110 00 0 00 011 scale n d  UCVTF Sd, Wn, #fbits
14050      5    0 0 0 11110 01 0 00 011 scale n d  UCVTF Dd, Wn, #fbits
14051      6    1 0 0 11110 00 0 00 011 scale n d  UCVTF Sd, Xn, #fbits
14052      7    1 0 0 11110 01 0 00 011 scale n d  UCVTF Dd, Xn, #fbits
14053
14054      These are signed/unsigned conversion from integer registers to
14055      FP registers, all 4 32/64-bit combinations, rounded per FPCR,
14056      scaled per |scale|.
14057   */
14058   if (ty <= X01 && rm == X00
14059       && (op == BITS3(0,1,0) || op == BITS3(0,1,1))
14060       && (bitSF == 1 || ((sc >> 5) & 1) == 1)) {
14061      Bool isI64 = bitSF == 1;
14062      Bool isF64 = (ty & 1) == 1;
14063      Bool isU   = (op & 1) == 1;
14064      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14065
14066      Int fbits = 64 - sc;
14067      vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
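      /* Conversely (a sketch): SCVTF Sd, Wn, #8 converts and then
         multiplies by 2.0^-8, so Wn == 256 gives Sd == 1.0. */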
14068
14069      Double  scale  = two_to_the_minus(fbits);
14070      IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
14071                             : IRExpr_Const(IRConst_F32( (Float)scale ));
14072      IROp    opMUL  = isF64 ? Iop_MulF64 : Iop_MulF32;
14073
14074      const IROp ops[8]
14075        = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
14076            Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
14077      IRExpr* src = getIRegOrZR(isI64, nn);
14078      IRExpr* res = (isF64 && !isI64)
14079                       ? unop(ops[ix], src)
14080                       : binop(ops[ix],
14081                               mkexpr(mk_get_IR_rounding_mode()), src);
14082      putQReg128(dd, mkV128(0));
14083      putQRegLO(dd, triop(opMUL, mkU32(Irrm_NEAREST), res, scaleE));
14084
14085      DIP("%ccvtf %s, %s, #%d\n",
14086          isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
14087          nameIRegOrZR(isI64, nn), fbits);
14088      return True;
14089   }
14090
14091   return False;
14092#  undef INSN
14093}
14094
14095
14096static
14097Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn)
14098{
14099   /* 31 30 29 28    23   21 20    18     15     9 4
14100      sf  0  0 11110 type 1  rmode opcode 000000 n d
14101      The first 3 bits are really "sf 0 S", but S is always zero.
14102      Decode fields: sf,type,rmode,opcode
14103   */
14104#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
14105   if (INSN(30,29) != BITS2(0,0)
14106       || INSN(28,24) != BITS5(1,1,1,1,0)
14107       || INSN(21,21) != 1
14108       || INSN(15,10) != BITS6(0,0,0,0,0,0)) {
14109      return False;
14110   }
14111   UInt bitSF = INSN(31,31);
14112   UInt ty    = INSN(23,22); // type
14113   UInt rm    = INSN(20,19); // rmode
14114   UInt op    = INSN(18,16); // opcode
14115   UInt nn    = INSN(9,5);
14116   UInt dd    = INSN(4,0);
14117
14118   // op = 000, 001
14119   /* -------- FCVT{N,P,M,Z,A}{S,U} (scalar, integer) -------- */
14120   /*    30       23   20 18  15     9 4
14121      sf 00 11110 0x 1 00 000 000000 n d  FCVTNS Rd, Fn (round to
14122      sf 00 11110 0x 1 00 001 000000 n d  FCVTNU Rd, Fn  nearest)
14123      ---------------- 01 --------------  FCVTP-------- (round to +inf)
14124      ---------------- 10 --------------  FCVTM-------- (round to -inf)
14125      ---------------- 11 --------------  FCVTZ-------- (round to zero)
14126      ---------------- 00 100 ----------  FCVTAS------- (nearest, ties away)
14127      ---------------- 00 101 ----------  FCVTAU------- (nearest, ties away)
14128
14129      Rd is Xd when sf==1, Wd when sf==0
14130      Fn is Dn when x==1, Sn when x==0
14131      20:19 carry the rounding mode, using the same encoding as FPCR
14132   */
   if (ty <= X01
       && (   (op == BITS3(0,0,0) || op == BITS3(0,0,1))
           || ((op == BITS3(1,0,0) || op == BITS3(1,0,1)) && rm == BITS2(0,0))
          )
      ) {
14138      Bool isI64 = bitSF == 1;
14139      Bool isF64 = (ty & 1) == 1;
14140      Bool isU   = (op & 1) == 1;
14141      /* Decide on the IR rounding mode to use. */
14142      IRRoundingMode irrm = 8; /*impossible*/
14143      HChar ch = '?';
14144      if (op == BITS3(0,0,0) || op == BITS3(0,0,1)) {
14145         switch (rm) {
14146            case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
14147            case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
14148            case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
14149            case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
14150            default: vassert(0);
14151         }
14152      } else {
14153         vassert(op == BITS3(1,0,0) || op == BITS3(1,0,1));
14154         switch (rm) {
14155            case BITS2(0,0): ch = 'a'; irrm = Irrm_NEAREST; break;
14156            default: vassert(0);
14157         }
14158      }
14159      vassert(irrm != 8);
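      /* Example (a sketch): FCVTMS Wd, Dn has rm == 10, hence
         irrm == Irrm_NegINF, so Dn == -1.5 converts to Wd == -2. */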
14160      /* Decide on the conversion primop, based on the source size,
14161         dest size and signedness (8 possibilities).  Case coding:
14162            F32 ->s I32   0
14163            F32 ->u I32   1
14164            F32 ->s I64   2
14165            F32 ->u I64   3
14166            F64 ->s I32   4
14167            F64 ->u I32   5
14168            F64 ->s I64   6
14169            F64 ->u I64   7
14170      */
14171      UInt ix = (isF64 ? 4 : 0) | (isI64 ? 2 : 0) | (isU ? 1 : 0);
14172      vassert(ix < 8);
14173      const IROp iops[8]
14174         = { Iop_F32toI32S, Iop_F32toI32U, Iop_F32toI64S, Iop_F32toI64U,
14175             Iop_F64toI32S, Iop_F64toI32U, Iop_F64toI64S, Iop_F64toI64U };
14176      IROp iop = iops[ix];
14177      // A bit of ATCery: bounce all cases we haven't seen an example of.
14178      if (/* F32toI32S */
14179             (iop == Iop_F32toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Sn */
14180          || (iop == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */
14181          || (iop == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */
14182          || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,S */
14183          /* F32toI32U */
14184          || (iop == Iop_F32toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Sn */
14185          || (iop == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */
14186          || (iop == Iop_F32toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Sn */
14187          || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,S */
14188          /* F32toI64S */
14189          || (iop == Iop_F32toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Sn */
14190          || (iop == Iop_F32toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Sn */
14191          || (iop == Iop_F32toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Sn */
14192          || (iop == Iop_F32toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,S */
14193          /* F32toI64U */
14194          || (iop == Iop_F32toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Sn */
14195          || (iop == Iop_F32toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Sn */
14196          || (iop == Iop_F32toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Sn */
14197          || (iop == Iop_F32toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,S */
14198          /* F64toI32S */
14199          || (iop == Iop_F64toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Dn */
14200          || (iop == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */
14201          || (iop == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */
14202          || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,D */
14203          /* F64toI32U */
14204          || (iop == Iop_F64toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Dn */
14205          || (iop == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */
14206          || (iop == Iop_F64toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Dn */
14207          || (iop == Iop_F64toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,D */
14208          /* F64toI64S */
14209          || (iop == Iop_F64toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Dn */
14210          || (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
14211          || (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
14212          || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,D */
14213          /* F64toI64U */
14214          || (iop == Iop_F64toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Dn */
14215          || (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
14216          || (iop == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */
14217          || (iop == Iop_F64toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,D */
14218         ) {
14219        /* validated */
14220      } else {
14221        return False;
14222      }
14223      IRType srcTy  = isF64 ? Ity_F64 : Ity_F32;
14224      IRType dstTy  = isI64 ? Ity_I64 : Ity_I32;
14225      IRTemp src    = newTemp(srcTy);
14226      IRTemp dst    = newTemp(dstTy);
14227      assign(src, getQRegLO(nn, srcTy));
14228      assign(dst, binop(iop, mkU32(irrm), mkexpr(src)));
14229      putIRegOrZR(isI64, dd, mkexpr(dst));
14230      DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's',
14231          nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy));
14232      return True;
14233   }
14234
14235   // op = 010, 011
14236   /* -------------- {S,U}CVTF (scalar, integer) -------------- */
14237   /* (ix) sf  S 28    ty   rm op  15     9 4
14238      0    0 0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Wn
14239      1    0 0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Wn
14240      2    1 0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Xn
14241      3    1 0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Xn
14242
14243      4    0 0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Wn
14244      5    0 0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Wn
14245      6    1 0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Xn
14246      7    1 0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Xn
14247
14248      These are signed/unsigned conversion from integer registers to
14249      FP registers, all 4 32/64-bit combinations, rounded per FPCR.
14250   */
14251   if (ty <= X01 && rm == X00 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))) {
14252      Bool isI64 = bitSF == 1;
14253      Bool isF64 = (ty & 1) == 1;
14254      Bool isU   = (op & 1) == 1;
14255      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14256      const IROp ops[8]
14257        = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
14258            Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
14259      IRExpr* src = getIRegOrZR(isI64, nn);
14260      IRExpr* res = (isF64 && !isI64)
14261                       ? unop(ops[ix], src)
14262                       : binop(ops[ix],
14263                               mkexpr(mk_get_IR_rounding_mode()), src);
14264      putQReg128(dd, mkV128(0));
14265      putQRegLO(dd, res);
14266      DIP("%ccvtf %s, %s\n",
14267          isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
14268          nameIRegOrZR(isI64, nn));
14269      return True;
14270   }
14271
14272   // op = 110, 111
14273   /* -------- FMOV (general) -------- */
14274   /* case sf  S       ty   rm op  15     9 4
14275       (1) 0 0 0 11110 00 1 00 111 000000 n d     FMOV Sd,      Wn
14276       (2) 1 0 0 11110 01 1 00 111 000000 n d     FMOV Dd,      Xn
14277       (3) 1 0 0 11110 10 1 01 111 000000 n d     FMOV Vd.D[1], Xn
14278
14279       (4) 0 0 0 11110 00 1 00 110 000000 n d     FMOV Wd, Sn
14280       (5) 1 0 0 11110 01 1 00 110 000000 n d     FMOV Xd, Dn
14281       (6) 1 0 0 11110 10 1 01 110 000000 n d     FMOV Xd, Vn.D[1]
14282   */
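   /* Note: cases 3 and 6 move the upper 64 bits (bits 127:64) of the
      vector register, via putQRegHI64/getQRegHI64, leaving the lower
      half unchanged. */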
14283   if (1) {
14284      UInt ix = 0; // case
14285      if (bitSF == 0) {
14286         if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,1))
14287            ix = 1;
14288         else
14289         if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,0))
14290            ix = 4;
14291      } else {
14292         vassert(bitSF == 1);
14293         if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,1))
14294            ix = 2;
14295         else
14296         if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,0))
14297            ix = 5;
14298         else
14299         if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,1))
14300            ix = 3;
14301         else
14302         if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,0))
14303            ix = 6;
14304      }
14305      if (ix > 0) {
14306         switch (ix) {
14307            case 1:
14308               putQReg128(dd, mkV128(0));
14309               putQRegLO(dd, getIReg32orZR(nn));
14310               DIP("fmov s%u, w%u\n", dd, nn);
14311               break;
14312            case 2:
14313               putQReg128(dd, mkV128(0));
14314               putQRegLO(dd, getIReg64orZR(nn));
14315               DIP("fmov d%u, x%u\n", dd, nn);
14316               break;
14317            case 3:
14318               putQRegHI64(dd, getIReg64orZR(nn));
14319               DIP("fmov v%u.d[1], x%u\n", dd, nn);
14320               break;
14321            case 4:
14322               putIReg32orZR(dd, getQRegLO(nn, Ity_I32));
14323               DIP("fmov w%u, s%u\n", dd, nn);
14324               break;
14325            case 5:
14326               putIReg64orZR(dd, getQRegLO(nn, Ity_I64));
14327               DIP("fmov x%u, d%u\n", dd, nn);
14328               break;
14329            case 6:
14330               putIReg64orZR(dd, getQRegHI64(nn));
14331               DIP("fmov x%u, v%u.d[1]\n", dd, nn);
14332               break;
14333            default:
14334               vassert(0);
14335         }
14336         return True;
14337      }
14338      /* undecodable; fall through */
14339   }
14340
14341   return False;
14342#  undef INSN
14343}
14344
14345
14346static
14347Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
14348{
14349   Bool ok;
14350   ok = dis_AdvSIMD_EXT(dres, insn);
14351   if (UNLIKELY(ok)) return True;
14352   ok = dis_AdvSIMD_TBL_TBX(dres, insn);
14353   if (UNLIKELY(ok)) return True;
14354   ok = dis_AdvSIMD_ZIP_UZP_TRN(dres, insn);
14355   if (UNLIKELY(ok)) return True;
14356   ok = dis_AdvSIMD_across_lanes(dres, insn);
14357   if (UNLIKELY(ok)) return True;
14358   ok = dis_AdvSIMD_copy(dres, insn);
14359   if (UNLIKELY(ok)) return True;
14360   ok = dis_AdvSIMD_modified_immediate(dres, insn);
14361   if (UNLIKELY(ok)) return True;
14362   ok = dis_AdvSIMD_scalar_copy(dres, insn);
14363   if (UNLIKELY(ok)) return True;
14364   ok = dis_AdvSIMD_scalar_pairwise(dres, insn);
14365   if (UNLIKELY(ok)) return True;
14366   ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn);
14367   if (UNLIKELY(ok)) return True;
14368   ok = dis_AdvSIMD_scalar_three_different(dres, insn);
14369   if (UNLIKELY(ok)) return True;
14370   ok = dis_AdvSIMD_scalar_three_same(dres, insn);
14371   if (UNLIKELY(ok)) return True;
14372   ok = dis_AdvSIMD_scalar_two_reg_misc(dres, insn);
14373   if (UNLIKELY(ok)) return True;
14374   ok = dis_AdvSIMD_scalar_x_indexed_element(dres, insn);
14375   if (UNLIKELY(ok)) return True;
14376   ok = dis_AdvSIMD_shift_by_immediate(dres, insn);
14377   if (UNLIKELY(ok)) return True;
14378   ok = dis_AdvSIMD_three_different(dres, insn);
14379   if (UNLIKELY(ok)) return True;
14380   ok = dis_AdvSIMD_three_same(dres, insn);
14381   if (UNLIKELY(ok)) return True;
14382   ok = dis_AdvSIMD_two_reg_misc(dres, insn);
14383   if (UNLIKELY(ok)) return True;
14384   ok = dis_AdvSIMD_vector_x_indexed_elem(dres, insn);
14385   if (UNLIKELY(ok)) return True;
14386   ok = dis_AdvSIMD_crypto_aes(dres, insn);
14387   if (UNLIKELY(ok)) return True;
14388   ok = dis_AdvSIMD_crypto_three_reg_sha(dres, insn);
14389   if (UNLIKELY(ok)) return True;
14390   ok = dis_AdvSIMD_crypto_two_reg_sha(dres, insn);
14391   if (UNLIKELY(ok)) return True;
14392   ok = dis_AdvSIMD_fp_compare(dres, insn);
14393   if (UNLIKELY(ok)) return True;
14394   ok = dis_AdvSIMD_fp_conditional_compare(dres, insn);
14395   if (UNLIKELY(ok)) return True;
14396   ok = dis_AdvSIMD_fp_conditional_select(dres, insn);
14397   if (UNLIKELY(ok)) return True;
14398   ok = dis_AdvSIMD_fp_data_proc_1_source(dres, insn);
14399   if (UNLIKELY(ok)) return True;
14400   ok = dis_AdvSIMD_fp_data_proc_2_source(dres, insn);
14401   if (UNLIKELY(ok)) return True;
14402   ok = dis_AdvSIMD_fp_data_proc_3_source(dres, insn);
14403   if (UNLIKELY(ok)) return True;
14404   ok = dis_AdvSIMD_fp_immediate(dres, insn);
14405   if (UNLIKELY(ok)) return True;
14406   ok = dis_AdvSIMD_fp_to_from_fixedp_conv(dres, insn);
14407   if (UNLIKELY(ok)) return True;
14408   ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
14409   if (UNLIKELY(ok)) return True;
14410   return False;
14411}
14412
14413
14414/*------------------------------------------------------------*/
14415/*--- Disassemble a single ARM64 instruction               ---*/
14416/*------------------------------------------------------------*/
14417
14418/* Disassemble a single ARM64 instruction into IR.  The instruction
   is located at |guest_instr| and has guest IP of
14420   |guest_PC_curr_instr|, which will have been set before the call
14421   here.  Returns True iff the instruction was decoded, in which case
14422   *dres will be set accordingly, or False, in which case *dres should
14423   be ignored by the caller. */
14424
14425static
14426Bool disInstr_ARM64_WRK (
14427        /*MB_OUT*/DisResult* dres,
14428        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
14429        Bool         resteerCisOk,
14430        void*        callback_opaque,
14431        const UChar* guest_instr,
14432        const VexArchInfo* archinfo,
14433        const VexAbiInfo*  abiinfo
14434     )
14435{
14436   // A macro to fish bits out of 'insn'.
14437#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
14438
14439//ZZ    DisResult dres;
14440//ZZ    UInt      insn;
14441//ZZ    //Bool      allow_VFP = False;
14442//ZZ    //UInt      hwcaps = archinfo->hwcaps;
14443//ZZ    IRTemp    condT; /* :: Ity_I32 */
14444//ZZ    UInt      summary;
14445//ZZ    HChar     dis_buf[128];  // big enough to hold LDMIA etc text
14446//ZZ
14447//ZZ    /* What insn variants are we supporting today? */
14448//ZZ    //allow_VFP  = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
14449//ZZ    // etc etc
14450
14451   /* Set result defaults. */
14452   dres->whatNext    = Dis_Continue;
14453   dres->len         = 4;
14454   dres->continueAt  = 0;
14455   dres->jk_StopHere = Ijk_INVALID;
14456   dres->hint        = Dis_HintNone;
14457
14458   /* At least this is simple on ARM64: insns are all 4 bytes long, and
14459      4-aligned.  So just fish the whole thing out of memory right now
14460      and have done. */
14461   UInt insn = getUIntLittleEndianly( guest_instr );
14462
14463   if (0) vex_printf("insn: 0x%x\n", insn);
14464
14465   DIP("\t(arm64) 0x%llx:  ", (ULong)guest_PC_curr_instr);
14466
14467   vassert(0 == (guest_PC_curr_instr & 3ULL));
14468
14469   /* ----------------------------------------------------------- */
14470
14471   /* Spot "Special" instructions (see comment at top of file). */
14472   {
14473      const UChar* code = guest_instr;
14474      /* Spot the 16-byte preamble:
14475            93CC0D8C   ror x12, x12, #3
14476            93CC358C   ror x12, x12, #13
14477            93CCCD8C   ror x12, x12, #51
14478            93CCF58C   ror x12, x12, #61
14479      */
14480      UInt word1 = 0x93CC0D8C;
14481      UInt word2 = 0x93CC358C;
14482      UInt word3 = 0x93CCCD8C;
14483      UInt word4 = 0x93CCF58C;
14484      if (getUIntLittleEndianly(code+ 0) == word1 &&
14485          getUIntLittleEndianly(code+ 4) == word2 &&
14486          getUIntLittleEndianly(code+ 8) == word3 &&
14487          getUIntLittleEndianly(code+12) == word4) {
14488         /* Got a "Special" instruction preamble.  Which one is it? */
14489         if (getUIntLittleEndianly(code+16) == 0xAA0A014A
14490                                               /* orr x10,x10,x10 */) {
14491            /* X3 = client_request ( X4 ) */
14492            DIP("x3 = client_request ( x4 )\n");
14493            putPC(mkU64( guest_PC_curr_instr + 20 ));
14494            dres->jk_StopHere = Ijk_ClientReq;
14495            dres->whatNext    = Dis_StopHere;
14496            return True;
14497         }
14498         else
14499         if (getUIntLittleEndianly(code+16) == 0xAA0B016B
14500                                               /* orr x11,x11,x11 */) {
14501            /* X3 = guest_NRADDR */
14502            DIP("x3 = guest_NRADDR\n");
14503            dres->len = 20;
14504            putIReg64orZR(3, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
14505            return True;
14506         }
14507         else
14508         if (getUIntLittleEndianly(code+16) == 0xAA0C018C
14509                                               /* orr x12,x12,x12 */) {
14510            /*  branch-and-link-to-noredir X8 */
14511            DIP("branch-and-link-to-noredir x8\n");
14512            putIReg64orZR(30, mkU64(guest_PC_curr_instr + 20));
14513            putPC(getIReg64orZR(8));
14514            dres->jk_StopHere = Ijk_NoRedir;
14515            dres->whatNext    = Dis_StopHere;
14516            return True;
14517         }
14518         else
14519         if (getUIntLittleEndianly(code+16) == 0xAA090129
14520                                               /* orr x9,x9,x9 */) {
14521            /* IR injection */
14522            DIP("IR injection\n");
14523            vex_inject_ir(irsb, Iend_LE);
            // Invalidate the current insn.  The reason is that the IRop
            // we're injecting here can change; in that case the
            // translation has to be redone.  For ease of handling, we
            // simply invalidate all the time.
14528            stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_PC_curr_instr)));
14529            stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(20)));
14530            putPC(mkU64( guest_PC_curr_instr + 20 ));
14531            dres->whatNext    = Dis_StopHere;
14532            dres->jk_StopHere = Ijk_InvalICache;
14533            return True;
14534         }
14535         /* We don't know what it is. */
14536         return False;
14537         /*NOTREACHED*/
14538      }
14539   }
14540
14541   /* ----------------------------------------------------------- */
14542
14543   /* Main ARM64 instruction decoder starts here. */
14544
14545   Bool ok = False;
14546
14547   /* insn[28:25] determines the top-level grouping, so let's start
14548      off with that.
14549
14550      For all of these dis_ARM64_ functions, we pass *dres with the
14551      normal default results "insn OK, 4 bytes long, keep decoding" so
14552      they don't need to change it.  However, decodes of control-flow
14553      insns may cause *dres to change.
14554   */
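   /* For example (a sketch): insn == 0x91000000, "add x0, x0, #0",
      has bits 28:25 == 1000 and so is routed to the data processing -
      immediate decoder. */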
14555   switch (INSN(28,25)) {
14556      case BITS4(1,0,0,0): case BITS4(1,0,0,1):
14557         // Data processing - immediate
14558         ok = dis_ARM64_data_processing_immediate(dres, insn);
14559         break;
14560      case BITS4(1,0,1,0): case BITS4(1,0,1,1):
14561         // Branch, exception generation and system instructions
14562         ok = dis_ARM64_branch_etc(dres, insn, archinfo, abiinfo);
14563         break;
14564      case BITS4(0,1,0,0): case BITS4(0,1,1,0):
14565      case BITS4(1,1,0,0): case BITS4(1,1,1,0):
14566         // Loads and stores
14567         ok = dis_ARM64_load_store(dres, insn, abiinfo);
14568         break;
14569      case BITS4(0,1,0,1): case BITS4(1,1,0,1):
14570         // Data processing - register
14571         ok = dis_ARM64_data_processing_register(dres, insn);
14572         break;
14573      case BITS4(0,1,1,1): case BITS4(1,1,1,1):
14574         // Data processing - SIMD and floating point
14575         ok = dis_ARM64_simd_and_fp(dres, insn);
14576         break;
14577      case BITS4(0,0,0,0): case BITS4(0,0,0,1):
14578      case BITS4(0,0,1,0): case BITS4(0,0,1,1):
14579         // UNALLOCATED
14580         break;
14581      default:
14582         vassert(0); /* Can't happen */
14583   }
14584
14585   /* If the next-level down decoders failed, make sure |dres| didn't
14586      get changed. */
14587   if (!ok) {
14588      vassert(dres->whatNext    == Dis_Continue);
14589      vassert(dres->len         == 4);
14590      vassert(dres->continueAt  == 0);
14591      vassert(dres->jk_StopHere == Ijk_INVALID);
14592   }
14593
14594   return ok;
14595
14596#  undef INSN
14597}
14598
14599
14600/*------------------------------------------------------------*/
14601/*--- Top-level fn                                         ---*/
14602/*------------------------------------------------------------*/
14603
14604/* Disassemble a single instruction into IR.  The instruction
14605   is located in host memory at &guest_code[delta]. */
14606
14607DisResult disInstr_ARM64 ( IRSB*        irsb_IN,
14608                           Bool         (*resteerOkFn) ( void*, Addr ),
14609                           Bool         resteerCisOk,
14610                           void*        callback_opaque,
14611                           const UChar* guest_code_IN,
14612                           Long         delta_IN,
14613                           Addr         guest_IP,
14614                           VexArch      guest_arch,
14615                           const VexArchInfo* archinfo,
14616                           const VexAbiInfo*  abiinfo,
14617                           VexEndness   host_endness_IN,
14618                           Bool         sigill_diag_IN )
14619{
14620   DisResult dres;
14621   vex_bzero(&dres, sizeof(dres));
14622
14623   /* Set globals (see top of this file) */
14624   vassert(guest_arch == VexArchARM64);
14625
14626   irsb                = irsb_IN;
14627   host_endness        = host_endness_IN;
14628   guest_PC_curr_instr = (Addr64)guest_IP;
14629
14630   /* Sanity checks */
14631   /* (x::UInt - 2) <= 15   ===   x >= 2 && x <= 17 (I hope) */
14632   vassert((archinfo->arm64_dMinLine_lg2_szB - 2) <= 15);
14633   vassert((archinfo->arm64_iMinLine_lg2_szB - 2) <= 15);
14634
14635   /* Try to decode */
14636   Bool ok = disInstr_ARM64_WRK( &dres,
14637                                 resteerOkFn, resteerCisOk, callback_opaque,
14638                                 &guest_code_IN[delta_IN],
14639                                 archinfo, abiinfo );
14640   if (ok) {
14641      /* All decode successes end up here. */
14642      vassert(dres.len == 4 || dres.len == 20);
14643      switch (dres.whatNext) {
14644         case Dis_Continue:
14645            putPC( mkU64(dres.len + guest_PC_curr_instr) );
14646            break;
14647         case Dis_ResteerU:
14648         case Dis_ResteerC:
14649            putPC(mkU64(dres.continueAt));
14650            break;
14651         case Dis_StopHere:
14652            break;
14653         default:
14654            vassert(0);
14655      }
14656      DIP("\n");
14657   } else {
14658      /* All decode failures end up here. */
14659      if (sigill_diag_IN) {
14660         Int   i, j;
14661         UChar buf[64];
14662         UInt  insn
14663                  = getUIntLittleEndianly( &guest_code_IN[delta_IN] );
14664         vex_bzero(buf, sizeof(buf));
14665         for (i = j = 0; i < 32; i++) {
14666            if (i > 0) {
14667              if ((i & 7) == 0) buf[j++] = ' ';
14668              else if ((i & 3) == 0) buf[j++] = '\'';
14669            }
14670            buf[j++] = (insn & (1<<(31-i))) ? '1' : '0';
14671         }
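         /* e.g. insn 0x91000000 renders as
            "1001'0001 0000'0000 0000'0000 0000'0000". */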
14672         vex_printf("disInstr(arm64): unhandled instruction 0x%08x\n", insn);
14673         vex_printf("disInstr(arm64): %s\n", buf);
14674      }
14675
14676      /* Tell the dispatcher that this insn cannot be decoded, and so
14677         has not been executed, and (is currently) the next to be
14678         executed.  PC should be up-to-date since it is made so at the
14679         start of each insn, but nevertheless be paranoid and update
14680         it again right now. */
14681      putPC( mkU64(guest_PC_curr_instr) );
14682      dres.len         = 0;
14683      dres.whatNext    = Dis_StopHere;
14684      dres.jk_StopHere = Ijk_NoDecode;
14685      dres.continueAt  = 0;
14686   }
14687   return dres;
14688}
14689
14690
14691/*--------------------------------------------------------------------*/
14692/*--- end                                       guest_arm64_toIR.c ---*/
14693/*--------------------------------------------------------------------*/
14694