1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the AArch64TargetLowering class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AArch64ISelLowering.h"
15#include "AArch64PerfectShuffle.h"
16#include "AArch64Subtarget.h"
17#include "AArch64MachineFunctionInfo.h"
18#include "AArch64TargetMachine.h"
19#include "AArch64TargetObjectFile.h"
20#include "MCTargetDesc/AArch64AddressingModes.h"
21#include "llvm/ADT/Statistic.h"
22#include "llvm/CodeGen/CallingConvLower.h"
23#include "llvm/CodeGen/MachineFrameInfo.h"
24#include "llvm/CodeGen/MachineInstrBuilder.h"
25#include "llvm/CodeGen/MachineRegisterInfo.h"
26#include "llvm/IR/Function.h"
27#include "llvm/IR/Intrinsics.h"
28#include "llvm/IR/Type.h"
29#include "llvm/Support/CommandLine.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/ErrorHandling.h"
32#include "llvm/Support/raw_ostream.h"
33#include "llvm/Target/TargetOptions.h"
34using namespace llvm;
35
36#define DEBUG_TYPE "aarch64-lower"
37
38STATISTIC(NumTailCalls, "Number of tail calls");
39STATISTIC(NumShiftInserts, "Number of vector shift inserts");
40
41enum AlignMode {
42  StrictAlign,
43  NoStrictAlign
44};
45
46static cl::opt<AlignMode>
47Align(cl::desc("Load/store alignment support"),
48      cl::Hidden, cl::init(NoStrictAlign),
49      cl::values(
50          clEnumValN(StrictAlign,   "aarch64-strict-align",
51                     "Disallow all unaligned memory accesses"),
52          clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
53                     "Allow unaligned memory accesses"),
54          clEnumValEnd));
55
56// Place holder until extr generation is tested fully.
57static cl::opt<bool>
58EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
59                          cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
60                          cl::init(true));
61
62static cl::opt<bool>
63EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
64                         cl::desc("Allow AArch64 SLI/SRI formation"),
65                         cl::init(false));
66
67//===----------------------------------------------------------------------===//
68// AArch64 Lowering public interface.
69//===----------------------------------------------------------------------===//
70static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
71  if (TT.isOSBinFormatMachO())
72    return new AArch64_MachoTargetObjectFile();
73
74  return new AArch64_ELFTargetObjectFile();
75}
76
77AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
78    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
79  Subtarget = &TM.getSubtarget<AArch64Subtarget>();
80
81  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
82  // we have to make something up. Arbitrarily, choose ZeroOrOne.
83  setBooleanContents(ZeroOrOneBooleanContent);
84  // When comparing vectors the result sets the different elements in the
85  // vector to all-one or all-zero.
86  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
87
88  // Set up the register classes.
89  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
90  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
91
92  if (Subtarget->hasFPARMv8()) {
93    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
94    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
95    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
96    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
97  }
98
99  if (Subtarget->hasNEON()) {
100    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
101    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
102    // Someone set us up the NEON.
103    addDRTypeForNEON(MVT::v2f32);
104    addDRTypeForNEON(MVT::v8i8);
105    addDRTypeForNEON(MVT::v4i16);
106    addDRTypeForNEON(MVT::v2i32);
107    addDRTypeForNEON(MVT::v1i64);
108    addDRTypeForNEON(MVT::v1f64);
109
110    addQRTypeForNEON(MVT::v4f32);
111    addQRTypeForNEON(MVT::v2f64);
112    addQRTypeForNEON(MVT::v16i8);
113    addQRTypeForNEON(MVT::v8i16);
114    addQRTypeForNEON(MVT::v4i32);
115    addQRTypeForNEON(MVT::v2i64);
116  }
117
118  // Compute derived properties from the register classes
119  computeRegisterProperties();
120
121  // Provide all sorts of operation actions
122  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
123  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
124  setOperationAction(ISD::SETCC, MVT::i32, Custom);
125  setOperationAction(ISD::SETCC, MVT::i64, Custom);
126  setOperationAction(ISD::SETCC, MVT::f32, Custom);
127  setOperationAction(ISD::SETCC, MVT::f64, Custom);
128  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
129  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
130  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
131  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
132  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
133  setOperationAction(ISD::SELECT, MVT::i32, Custom);
134  setOperationAction(ISD::SELECT, MVT::i64, Custom);
135  setOperationAction(ISD::SELECT, MVT::f32, Custom);
136  setOperationAction(ISD::SELECT, MVT::f64, Custom);
137  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
138  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
139  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
140  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
141  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
142  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
143
144  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
145  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
146  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
147
148  setOperationAction(ISD::FREM, MVT::f32, Expand);
149  setOperationAction(ISD::FREM, MVT::f64, Expand);
150  setOperationAction(ISD::FREM, MVT::f80, Expand);
151
152  // Custom lowering hooks are needed for XOR
153  // to fold it into CSINC/CSINV.
154  setOperationAction(ISD::XOR, MVT::i32, Custom);
155  setOperationAction(ISD::XOR, MVT::i64, Custom);
156
157  // Virtually no operation on f128 is legal, but LLVM can't expand them when
158  // there's a valid register class, so we need custom operations in most cases.
159  setOperationAction(ISD::FABS, MVT::f128, Expand);
160  setOperationAction(ISD::FADD, MVT::f128, Custom);
161  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
162  setOperationAction(ISD::FCOS, MVT::f128, Expand);
163  setOperationAction(ISD::FDIV, MVT::f128, Custom);
164  setOperationAction(ISD::FMA, MVT::f128, Expand);
165  setOperationAction(ISD::FMUL, MVT::f128, Custom);
166  setOperationAction(ISD::FNEG, MVT::f128, Expand);
167  setOperationAction(ISD::FPOW, MVT::f128, Expand);
168  setOperationAction(ISD::FREM, MVT::f128, Expand);
169  setOperationAction(ISD::FRINT, MVT::f128, Expand);
170  setOperationAction(ISD::FSIN, MVT::f128, Expand);
171  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
172  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
173  setOperationAction(ISD::FSUB, MVT::f128, Custom);
174  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
175  setOperationAction(ISD::SETCC, MVT::f128, Custom);
176  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
177  setOperationAction(ISD::SELECT, MVT::f128, Custom);
178  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
179  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
180
181  // Lowering for many of the conversions is actually specified by the non-f128
182  // type. The LowerXXX function will be trivial when f128 isn't involved.
183  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
184  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
185  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
186  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
187  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
188  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
189  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
190  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
191  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
192  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
193  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
194  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
195  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
196  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
197
198  // Variable arguments.
199  setOperationAction(ISD::VASTART, MVT::Other, Custom);
200  setOperationAction(ISD::VAARG, MVT::Other, Custom);
201  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
202  setOperationAction(ISD::VAEND, MVT::Other, Expand);
203
204  // Variable-sized objects.
205  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
206  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
207  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
208
209  // Exception handling.
210  // FIXME: These are guesses. Has this been defined yet?
211  setExceptionPointerRegister(AArch64::X0);
212  setExceptionSelectorRegister(AArch64::X1);
213
214  // Constant pool entries
215  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
216
217  // BlockAddress
218  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
219
220  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
221  setOperationAction(ISD::ADDC, MVT::i32, Custom);
222  setOperationAction(ISD::ADDE, MVT::i32, Custom);
223  setOperationAction(ISD::SUBC, MVT::i32, Custom);
224  setOperationAction(ISD::SUBE, MVT::i32, Custom);
225  setOperationAction(ISD::ADDC, MVT::i64, Custom);
226  setOperationAction(ISD::ADDE, MVT::i64, Custom);
227  setOperationAction(ISD::SUBC, MVT::i64, Custom);
228  setOperationAction(ISD::SUBE, MVT::i64, Custom);
229
230  // AArch64 lacks both left-rotate and popcount instructions.
231  setOperationAction(ISD::ROTL, MVT::i32, Expand);
232  setOperationAction(ISD::ROTL, MVT::i64, Expand);
233
234  // AArch64 doesn't have {U|S}MUL_LOHI.
235  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
236  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
237
238
239  // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
240  // counterparts, which AArch64 supports directly.
241  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
242  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
243  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
244  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
245
246  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
247  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
248
249  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
250  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
251  setOperationAction(ISD::SREM, MVT::i32, Expand);
252  setOperationAction(ISD::SREM, MVT::i64, Expand);
253  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
254  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
255  setOperationAction(ISD::UREM, MVT::i32, Expand);
256  setOperationAction(ISD::UREM, MVT::i64, Expand);
257
258  // Custom lower Add/Sub/Mul with overflow.
259  setOperationAction(ISD::SADDO, MVT::i32, Custom);
260  setOperationAction(ISD::SADDO, MVT::i64, Custom);
261  setOperationAction(ISD::UADDO, MVT::i32, Custom);
262  setOperationAction(ISD::UADDO, MVT::i64, Custom);
263  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
264  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
265  setOperationAction(ISD::USUBO, MVT::i32, Custom);
266  setOperationAction(ISD::USUBO, MVT::i64, Custom);
267  setOperationAction(ISD::SMULO, MVT::i32, Custom);
268  setOperationAction(ISD::SMULO, MVT::i64, Custom);
269  setOperationAction(ISD::UMULO, MVT::i32, Custom);
270  setOperationAction(ISD::UMULO, MVT::i64, Custom);
271
272  setOperationAction(ISD::FSIN, MVT::f32, Expand);
273  setOperationAction(ISD::FSIN, MVT::f64, Expand);
274  setOperationAction(ISD::FCOS, MVT::f32, Expand);
275  setOperationAction(ISD::FCOS, MVT::f64, Expand);
276  setOperationAction(ISD::FPOW, MVT::f32, Expand);
277  setOperationAction(ISD::FPOW, MVT::f64, Expand);
278  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
279  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
280
281  // AArch64 has implementations of a lot of rounding-like FP operations.
282  static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
283  for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
284    MVT Ty = RoundingTypes[I];
285    setOperationAction(ISD::FFLOOR, Ty, Legal);
286    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
287    setOperationAction(ISD::FCEIL, Ty, Legal);
288    setOperationAction(ISD::FRINT, Ty, Legal);
289    setOperationAction(ISD::FTRUNC, Ty, Legal);
290    setOperationAction(ISD::FROUND, Ty, Legal);
291  }
292
293  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
294
295  if (Subtarget->isTargetMachO()) {
296    // For iOS, we don't want to the normal expansion of a libcall to
297    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
298    // traffic.
299    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
300    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
301  } else {
302    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
303    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
304  }
305
306  // AArch64 does not have floating-point extending loads, i1 sign-extending
307  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
308  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
309  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
310  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
311  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
312  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
313  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
314  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
315  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
316  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
317  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
318  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
319  // Indexed loads and stores are supported.
320  for (unsigned im = (unsigned)ISD::PRE_INC;
321       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
322    setIndexedLoadAction(im, MVT::i8, Legal);
323    setIndexedLoadAction(im, MVT::i16, Legal);
324    setIndexedLoadAction(im, MVT::i32, Legal);
325    setIndexedLoadAction(im, MVT::i64, Legal);
326    setIndexedLoadAction(im, MVT::f64, Legal);
327    setIndexedLoadAction(im, MVT::f32, Legal);
328    setIndexedStoreAction(im, MVT::i8, Legal);
329    setIndexedStoreAction(im, MVT::i16, Legal);
330    setIndexedStoreAction(im, MVT::i32, Legal);
331    setIndexedStoreAction(im, MVT::i64, Legal);
332    setIndexedStoreAction(im, MVT::f64, Legal);
333    setIndexedStoreAction(im, MVT::f32, Legal);
334  }
335
336  // Trap.
337  setOperationAction(ISD::TRAP, MVT::Other, Legal);
338
339  // We combine OR nodes for bitfield operations.
340  setTargetDAGCombine(ISD::OR);
341
342  // Vector add and sub nodes may conceal a high-half opportunity.
343  // Also, try to fold ADD into CSINC/CSINV..
344  setTargetDAGCombine(ISD::ADD);
345  setTargetDAGCombine(ISD::SUB);
346
347  setTargetDAGCombine(ISD::XOR);
348  setTargetDAGCombine(ISD::SINT_TO_FP);
349  setTargetDAGCombine(ISD::UINT_TO_FP);
350
351  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
352
353  setTargetDAGCombine(ISD::ANY_EXTEND);
354  setTargetDAGCombine(ISD::ZERO_EXTEND);
355  setTargetDAGCombine(ISD::SIGN_EXTEND);
356  setTargetDAGCombine(ISD::BITCAST);
357  setTargetDAGCombine(ISD::CONCAT_VECTORS);
358  setTargetDAGCombine(ISD::STORE);
359
360  setTargetDAGCombine(ISD::MUL);
361
362  setTargetDAGCombine(ISD::SELECT);
363  setTargetDAGCombine(ISD::VSELECT);
364
365  setTargetDAGCombine(ISD::INTRINSIC_VOID);
366  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
367  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
368
369  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
370  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
371  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
372
373  setStackPointerRegisterToSaveRestore(AArch64::SP);
374
375  setSchedulingPreference(Sched::Hybrid);
376
377  // Enable TBZ/TBNZ
378  MaskAndBranchFoldingIsLegal = true;
379
380  setMinFunctionAlignment(2);
381
382  RequireStrictAlign = (Align == StrictAlign);
383
384  setHasExtractBitsInsn(true);
385
386  if (Subtarget->hasNEON()) {
387    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
388    // silliness like this:
389    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
390    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
391    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
392    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
393    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
394    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
395    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
396    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
397    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
398    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
399    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
400    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
401    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
402    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
403    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
404    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
405    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
406    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
407    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
408    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
409    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
410    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
411    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
412    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
413    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
414
415    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
416    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
417    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
418    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
419    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
420
421    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
422
423    // AArch64 doesn't have a direct vector ->f32 conversion instructions for
424    // elements smaller than i32, so promote the input to i32 first.
425    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
426    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
427    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
428    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
429    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
430    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
431    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
432    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
433    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
434
435    // AArch64 doesn't have MUL.2d:
436    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
437    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
438    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
439    // Likewise, narrowing and extending vector loads/stores aren't handled
440    // directly.
441    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
442         VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
443
444      setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
445                         Expand);
446
447      setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
448      setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
449      setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
450      setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
451
452      setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
453
454      for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
455           InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
456        setTruncStoreAction((MVT::SimpleValueType)VT,
457                            (MVT::SimpleValueType)InnerVT, Expand);
458      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
459      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
460      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
461    }
462
463    // AArch64 has implementations of a lot of rounding-like FP operations.
464    static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 };
465    for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
466      MVT Ty = RoundingVecTypes[I];
467      setOperationAction(ISD::FFLOOR, Ty, Legal);
468      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
469      setOperationAction(ISD::FCEIL, Ty, Legal);
470      setOperationAction(ISD::FRINT, Ty, Legal);
471      setOperationAction(ISD::FTRUNC, Ty, Legal);
472      setOperationAction(ISD::FROUND, Ty, Legal);
473    }
474  }
475}
476
477void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
478  if (VT == MVT::v2f32) {
479    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
480    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
481
482    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
483    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
484  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
485    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
486    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
487
488    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
489    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
490  }
491
492  // Mark vector float intrinsics as expand.
493  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
494    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
495    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
496    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
497    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
498    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
499    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
500    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
501    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
502    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
503  }
504
505  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
506  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
507  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
508  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
509  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
510  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
511  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
512  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
513  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
514  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
515  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
516  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
517
518  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
519  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
520  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
521  setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
522
523  // CNT supports only B element sizes.
524  if (VT != MVT::v8i8 && VT != MVT::v16i8)
525    setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
526
527  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
528  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
529  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
530  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
531  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
532
533  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
534  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
535
536  if (Subtarget->isLittleEndian()) {
537    for (unsigned im = (unsigned)ISD::PRE_INC;
538         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
539      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
540      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
541    }
542  }
543}
544
545void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
546  addRegisterClass(VT, &AArch64::FPR64RegClass);
547  addTypeForNEON(VT, MVT::v2i32);
548}
549
550void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
551  addRegisterClass(VT, &AArch64::FPR128RegClass);
552  addTypeForNEON(VT, MVT::v4i32);
553}
554
555EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
556  if (!VT.isVector())
557    return MVT::i32;
558  return VT.changeVectorElementTypeToInteger();
559}
560
561/// computeKnownBitsForTargetNode - Determine which of the bits specified in
562/// Mask are known to be either zero or one and return them in the
563/// KnownZero/KnownOne bitsets.
564void AArch64TargetLowering::computeKnownBitsForTargetNode(
565    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
566    const SelectionDAG &DAG, unsigned Depth) const {
567  switch (Op.getOpcode()) {
568  default:
569    break;
570  case AArch64ISD::CSEL: {
571    APInt KnownZero2, KnownOne2;
572    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
573    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
574    KnownZero &= KnownZero2;
575    KnownOne &= KnownOne2;
576    break;
577  }
578  case ISD::INTRINSIC_W_CHAIN: {
579   ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
580    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
581    switch (IntID) {
582    default: return;
583    case Intrinsic::aarch64_ldaxr:
584    case Intrinsic::aarch64_ldxr: {
585      unsigned BitWidth = KnownOne.getBitWidth();
586      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
587      unsigned MemBits = VT.getScalarType().getSizeInBits();
588      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
589      return;
590    }
591    }
592    break;
593  }
594  case ISD::INTRINSIC_WO_CHAIN:
595  case ISD::INTRINSIC_VOID: {
596    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
597    switch (IntNo) {
598    default:
599      break;
600    case Intrinsic::aarch64_neon_umaxv:
601    case Intrinsic::aarch64_neon_uminv: {
602      // Figure out the datatype of the vector operand. The UMINV instruction
603      // will zero extend the result, so we can mark as known zero all the
604      // bits larger than the element datatype. 32-bit or larget doesn't need
605      // this as those are legal types and will be handled by isel directly.
606      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
607      unsigned BitWidth = KnownZero.getBitWidth();
608      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
609        assert(BitWidth >= 8 && "Unexpected width!");
610        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
611        KnownZero |= Mask;
612      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
613        assert(BitWidth >= 16 && "Unexpected width!");
614        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
615        KnownZero |= Mask;
616      }
617      break;
618    } break;
619    }
620  }
621  }
622}
623
624MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
625  return MVT::i64;
626}
627
628unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
629  // FIXME: On AArch64, this depends on the type.
630  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
631  // and the offset has to be a multiple of the related size in bytes.
632  return 4095;
633}
634
635FastISel *
636AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
637                                      const TargetLibraryInfo *libInfo) const {
638  return AArch64::createFastISel(funcInfo, libInfo);
639}
640
641const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
642  switch (Opcode) {
643  default:
644    return nullptr;
645  case AArch64ISD::CALL:              return "AArch64ISD::CALL";
646  case AArch64ISD::ADRP:              return "AArch64ISD::ADRP";
647  case AArch64ISD::ADDlow:            return "AArch64ISD::ADDlow";
648  case AArch64ISD::LOADgot:           return "AArch64ISD::LOADgot";
649  case AArch64ISD::RET_FLAG:          return "AArch64ISD::RET_FLAG";
650  case AArch64ISD::BRCOND:            return "AArch64ISD::BRCOND";
651  case AArch64ISD::CSEL:              return "AArch64ISD::CSEL";
652  case AArch64ISD::FCSEL:             return "AArch64ISD::FCSEL";
653  case AArch64ISD::CSINV:             return "AArch64ISD::CSINV";
654  case AArch64ISD::CSNEG:             return "AArch64ISD::CSNEG";
655  case AArch64ISD::CSINC:             return "AArch64ISD::CSINC";
656  case AArch64ISD::THREAD_POINTER:    return "AArch64ISD::THREAD_POINTER";
657  case AArch64ISD::TLSDESC_CALL:      return "AArch64ISD::TLSDESC_CALL";
658  case AArch64ISD::ADC:               return "AArch64ISD::ADC";
659  case AArch64ISD::SBC:               return "AArch64ISD::SBC";
660  case AArch64ISD::ADDS:              return "AArch64ISD::ADDS";
661  case AArch64ISD::SUBS:              return "AArch64ISD::SUBS";
662  case AArch64ISD::ADCS:              return "AArch64ISD::ADCS";
663  case AArch64ISD::SBCS:              return "AArch64ISD::SBCS";
664  case AArch64ISD::ANDS:              return "AArch64ISD::ANDS";
665  case AArch64ISD::FCMP:              return "AArch64ISD::FCMP";
666  case AArch64ISD::FMIN:              return "AArch64ISD::FMIN";
667  case AArch64ISD::FMAX:              return "AArch64ISD::FMAX";
668  case AArch64ISD::DUP:               return "AArch64ISD::DUP";
669  case AArch64ISD::DUPLANE8:          return "AArch64ISD::DUPLANE8";
670  case AArch64ISD::DUPLANE16:         return "AArch64ISD::DUPLANE16";
671  case AArch64ISD::DUPLANE32:         return "AArch64ISD::DUPLANE32";
672  case AArch64ISD::DUPLANE64:         return "AArch64ISD::DUPLANE64";
673  case AArch64ISD::MOVI:              return "AArch64ISD::MOVI";
674  case AArch64ISD::MOVIshift:         return "AArch64ISD::MOVIshift";
675  case AArch64ISD::MOVIedit:          return "AArch64ISD::MOVIedit";
676  case AArch64ISD::MOVImsl:           return "AArch64ISD::MOVImsl";
677  case AArch64ISD::FMOV:              return "AArch64ISD::FMOV";
678  case AArch64ISD::MVNIshift:         return "AArch64ISD::MVNIshift";
679  case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
680  case AArch64ISD::BICi:              return "AArch64ISD::BICi";
681  case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
682  case AArch64ISD::BSL:               return "AArch64ISD::BSL";
683  case AArch64ISD::NEG:               return "AArch64ISD::NEG";
684  case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
685  case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
686  case AArch64ISD::ZIP2:              return "AArch64ISD::ZIP2";
687  case AArch64ISD::UZP1:              return "AArch64ISD::UZP1";
688  case AArch64ISD::UZP2:              return "AArch64ISD::UZP2";
689  case AArch64ISD::TRN1:              return "AArch64ISD::TRN1";
690  case AArch64ISD::TRN2:              return "AArch64ISD::TRN2";
691  case AArch64ISD::REV16:             return "AArch64ISD::REV16";
692  case AArch64ISD::REV32:             return "AArch64ISD::REV32";
693  case AArch64ISD::REV64:             return "AArch64ISD::REV64";
694  case AArch64ISD::EXT:               return "AArch64ISD::EXT";
695  case AArch64ISD::VSHL:              return "AArch64ISD::VSHL";
696  case AArch64ISD::VLSHR:             return "AArch64ISD::VLSHR";
697  case AArch64ISD::VASHR:             return "AArch64ISD::VASHR";
698  case AArch64ISD::CMEQ:              return "AArch64ISD::CMEQ";
699  case AArch64ISD::CMGE:              return "AArch64ISD::CMGE";
700  case AArch64ISD::CMGT:              return "AArch64ISD::CMGT";
701  case AArch64ISD::CMHI:              return "AArch64ISD::CMHI";
702  case AArch64ISD::CMHS:              return "AArch64ISD::CMHS";
703  case AArch64ISD::FCMEQ:             return "AArch64ISD::FCMEQ";
704  case AArch64ISD::FCMGE:             return "AArch64ISD::FCMGE";
705  case AArch64ISD::FCMGT:             return "AArch64ISD::FCMGT";
706  case AArch64ISD::CMEQz:             return "AArch64ISD::CMEQz";
707  case AArch64ISD::CMGEz:             return "AArch64ISD::CMGEz";
708  case AArch64ISD::CMGTz:             return "AArch64ISD::CMGTz";
709  case AArch64ISD::CMLEz:             return "AArch64ISD::CMLEz";
710  case AArch64ISD::CMLTz:             return "AArch64ISD::CMLTz";
711  case AArch64ISD::FCMEQz:            return "AArch64ISD::FCMEQz";
712  case AArch64ISD::FCMGEz:            return "AArch64ISD::FCMGEz";
713  case AArch64ISD::FCMGTz:            return "AArch64ISD::FCMGTz";
714  case AArch64ISD::FCMLEz:            return "AArch64ISD::FCMLEz";
715  case AArch64ISD::FCMLTz:            return "AArch64ISD::FCMLTz";
716  case AArch64ISD::NOT:               return "AArch64ISD::NOT";
717  case AArch64ISD::BIT:               return "AArch64ISD::BIT";
718  case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
719  case AArch64ISD::CBNZ:              return "AArch64ISD::CBNZ";
720  case AArch64ISD::TBZ:               return "AArch64ISD::TBZ";
721  case AArch64ISD::TBNZ:              return "AArch64ISD::TBNZ";
722  case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
723  case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
724  case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
725  case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
726  case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
727  case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
728  case AArch64ISD::URSHR_I:           return "AArch64ISD::URSHR_I";
729  case AArch64ISD::SQSHLU_I:          return "AArch64ISD::SQSHLU_I";
730  case AArch64ISD::WrapperLarge:      return "AArch64ISD::WrapperLarge";
731  case AArch64ISD::LD2post:           return "AArch64ISD::LD2post";
732  case AArch64ISD::LD3post:           return "AArch64ISD::LD3post";
733  case AArch64ISD::LD4post:           return "AArch64ISD::LD4post";
734  case AArch64ISD::ST2post:           return "AArch64ISD::ST2post";
735  case AArch64ISD::ST3post:           return "AArch64ISD::ST3post";
736  case AArch64ISD::ST4post:           return "AArch64ISD::ST4post";
737  case AArch64ISD::LD1x2post:         return "AArch64ISD::LD1x2post";
738  case AArch64ISD::LD1x3post:         return "AArch64ISD::LD1x3post";
739  case AArch64ISD::LD1x4post:         return "AArch64ISD::LD1x4post";
740  case AArch64ISD::ST1x2post:         return "AArch64ISD::ST1x2post";
741  case AArch64ISD::ST1x3post:         return "AArch64ISD::ST1x3post";
742  case AArch64ISD::ST1x4post:         return "AArch64ISD::ST1x4post";
743  case AArch64ISD::LD1DUPpost:        return "AArch64ISD::LD1DUPpost";
744  case AArch64ISD::LD2DUPpost:        return "AArch64ISD::LD2DUPpost";
745  case AArch64ISD::LD3DUPpost:        return "AArch64ISD::LD3DUPpost";
746  case AArch64ISD::LD4DUPpost:        return "AArch64ISD::LD4DUPpost";
747  case AArch64ISD::LD1LANEpost:       return "AArch64ISD::LD1LANEpost";
748  case AArch64ISD::LD2LANEpost:       return "AArch64ISD::LD2LANEpost";
749  case AArch64ISD::LD3LANEpost:       return "AArch64ISD::LD3LANEpost";
750  case AArch64ISD::LD4LANEpost:       return "AArch64ISD::LD4LANEpost";
751  case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
752  case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
753  case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
754  }
755}
756
757MachineBasicBlock *
758AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
759                                    MachineBasicBlock *MBB) const {
760  // We materialise the F128CSEL pseudo-instruction as some control flow and a
761  // phi node:
762
763  // OrigBB:
764  //     [... previous instrs leading to comparison ...]
765  //     b.ne TrueBB
766  //     b EndBB
767  // TrueBB:
768  //     ; Fallthrough
769  // EndBB:
770  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
771
772  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
773  MachineFunction *MF = MBB->getParent();
774  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
775  DebugLoc DL = MI->getDebugLoc();
776  MachineFunction::iterator It = MBB;
777  ++It;
778
779  unsigned DestReg = MI->getOperand(0).getReg();
780  unsigned IfTrueReg = MI->getOperand(1).getReg();
781  unsigned IfFalseReg = MI->getOperand(2).getReg();
782  unsigned CondCode = MI->getOperand(3).getImm();
783  bool NZCVKilled = MI->getOperand(4).isKill();
784
785  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
786  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
787  MF->insert(It, TrueBB);
788  MF->insert(It, EndBB);
789
790  // Transfer rest of current basic-block to EndBB
791  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
792                MBB->end());
793  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
794
795  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
796  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
797  MBB->addSuccessor(TrueBB);
798  MBB->addSuccessor(EndBB);
799
800  // TrueBB falls through to the end.
801  TrueBB->addSuccessor(EndBB);
802
803  if (!NZCVKilled) {
804    TrueBB->addLiveIn(AArch64::NZCV);
805    EndBB->addLiveIn(AArch64::NZCV);
806  }
807
808  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
809      .addReg(IfTrueReg)
810      .addMBB(TrueBB)
811      .addReg(IfFalseReg)
812      .addMBB(MBB);
813
814  MI->eraseFromParent();
815  return EndBB;
816}
817
818MachineBasicBlock *
819AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
820                                                 MachineBasicBlock *BB) const {
821  switch (MI->getOpcode()) {
822  default:
823#ifndef NDEBUG
824    MI->dump();
825#endif
826    llvm_unreachable("Unexpected instruction for custom inserter!");
827
828  case AArch64::F128CSEL:
829    return EmitF128CSEL(MI, BB);
830
831  case TargetOpcode::STACKMAP:
832  case TargetOpcode::PATCHPOINT:
833    return emitPatchPoint(MI, BB);
834  }
835}
836
837//===----------------------------------------------------------------------===//
838// AArch64 Lowering private implementation.
839//===----------------------------------------------------------------------===//
840
841//===----------------------------------------------------------------------===//
842// Lowering Code
843//===----------------------------------------------------------------------===//
844
845/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
846/// CC
847static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
848  switch (CC) {
849  default:
850    llvm_unreachable("Unknown condition code!");
851  case ISD::SETNE:
852    return AArch64CC::NE;
853  case ISD::SETEQ:
854    return AArch64CC::EQ;
855  case ISD::SETGT:
856    return AArch64CC::GT;
857  case ISD::SETGE:
858    return AArch64CC::GE;
859  case ISD::SETLT:
860    return AArch64CC::LT;
861  case ISD::SETLE:
862    return AArch64CC::LE;
863  case ISD::SETUGT:
864    return AArch64CC::HI;
865  case ISD::SETUGE:
866    return AArch64CC::HS;
867  case ISD::SETULT:
868    return AArch64CC::LO;
869  case ISD::SETULE:
870    return AArch64CC::LS;
871  }
872}
873
874/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
875static void changeFPCCToAArch64CC(ISD::CondCode CC,
876                                  AArch64CC::CondCode &CondCode,
877                                  AArch64CC::CondCode &CondCode2) {
878  CondCode2 = AArch64CC::AL;
879  switch (CC) {
880  default:
881    llvm_unreachable("Unknown FP condition!");
882  case ISD::SETEQ:
883  case ISD::SETOEQ:
884    CondCode = AArch64CC::EQ;
885    break;
886  case ISD::SETGT:
887  case ISD::SETOGT:
888    CondCode = AArch64CC::GT;
889    break;
890  case ISD::SETGE:
891  case ISD::SETOGE:
892    CondCode = AArch64CC::GE;
893    break;
894  case ISD::SETOLT:
895    CondCode = AArch64CC::MI;
896    break;
897  case ISD::SETOLE:
898    CondCode = AArch64CC::LS;
899    break;
900  case ISD::SETONE:
901    CondCode = AArch64CC::MI;
902    CondCode2 = AArch64CC::GT;
903    break;
904  case ISD::SETO:
905    CondCode = AArch64CC::VC;
906    break;
907  case ISD::SETUO:
908    CondCode = AArch64CC::VS;
909    break;
910  case ISD::SETUEQ:
911    CondCode = AArch64CC::EQ;
912    CondCode2 = AArch64CC::VS;
913    break;
914  case ISD::SETUGT:
915    CondCode = AArch64CC::HI;
916    break;
917  case ISD::SETUGE:
918    CondCode = AArch64CC::PL;
919    break;
920  case ISD::SETLT:
921  case ISD::SETULT:
922    CondCode = AArch64CC::LT;
923    break;
924  case ISD::SETLE:
925  case ISD::SETULE:
926    CondCode = AArch64CC::LE;
927    break;
928  case ISD::SETNE:
929  case ISD::SETUNE:
930    CondCode = AArch64CC::NE;
931    break;
932  }
933}
934
935/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
936/// CC usable with the vector instructions. Fewer operations are available
937/// without a real NZCV register, so we have to use less efficient combinations
938/// to get the same effect.
939static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
940                                        AArch64CC::CondCode &CondCode,
941                                        AArch64CC::CondCode &CondCode2,
942                                        bool &Invert) {
943  Invert = false;
944  switch (CC) {
945  default:
946    // Mostly the scalar mappings work fine.
947    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
948    break;
949  case ISD::SETUO:
950    Invert = true; // Fallthrough
951  case ISD::SETO:
952    CondCode = AArch64CC::MI;
953    CondCode2 = AArch64CC::GE;
954    break;
955  case ISD::SETUEQ:
956  case ISD::SETULT:
957  case ISD::SETULE:
958  case ISD::SETUGT:
959  case ISD::SETUGE:
960    // All of the compare-mask comparisons are ordered, but we can switch
961    // between the two by a double inversion. E.g. ULE == !OGT.
962    Invert = true;
963    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
964    break;
965  }
966}
967
968static bool isLegalArithImmed(uint64_t C) {
969  // Matches AArch64DAGToDAGISel::SelectArithImmed().
970  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
971}
972
973static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
974                              SDLoc dl, SelectionDAG &DAG) {
975  EVT VT = LHS.getValueType();
976
977  if (VT.isFloatingPoint())
978    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
979
980  // The CMP instruction is just an alias for SUBS, and representing it as
981  // SUBS means that it's possible to get CSE with subtract operations.
982  // A later phase can perform the optimization of setting the destination
983  // register to WZR/XZR if it ends up being unused.
984  unsigned Opcode = AArch64ISD::SUBS;
985
986  if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
987      cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
988      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
989    // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
990    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
991    // can be set differently by this operation. It comes down to whether
992    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
993    // everything is fine. If not then the optimization is wrong. Thus general
994    // comparisons are only valid if op2 != 0.
995
996    // So, finally, the only LLVM-native comparisons that don't mention C and V
997    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
998    // the absence of information about op2.
999    Opcode = AArch64ISD::ADDS;
1000    RHS = RHS.getOperand(1);
1001  } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
1002             cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
1003             !isUnsignedIntSetCC(CC)) {
1004    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1005    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1006    // of the signed comparisons.
1007    Opcode = AArch64ISD::ANDS;
1008    RHS = LHS.getOperand(1);
1009    LHS = LHS.getOperand(0);
1010  }
1011
1012  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
1013      .getValue(1);
1014}
1015
1016static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1017                             SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
1018  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1019    EVT VT = RHS.getValueType();
1020    uint64_t C = RHSC->getZExtValue();
1021    if (!isLegalArithImmed(C)) {
1022      // Constant does not fit, try adjusting it by one?
1023      switch (CC) {
1024      default:
1025        break;
1026      case ISD::SETLT:
1027      case ISD::SETGE:
1028        if ((VT == MVT::i32 && C != 0x80000000 &&
1029             isLegalArithImmed((uint32_t)(C - 1))) ||
1030            (VT == MVT::i64 && C != 0x80000000ULL &&
1031             isLegalArithImmed(C - 1ULL))) {
1032          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1033          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1034          RHS = DAG.getConstant(C, VT);
1035        }
1036        break;
1037      case ISD::SETULT:
1038      case ISD::SETUGE:
1039        if ((VT == MVT::i32 && C != 0 &&
1040             isLegalArithImmed((uint32_t)(C - 1))) ||
1041            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1042          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1043          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1044          RHS = DAG.getConstant(C, VT);
1045        }
1046        break;
1047      case ISD::SETLE:
1048      case ISD::SETGT:
1049        if ((VT == MVT::i32 && C != 0x7fffffff &&
1050             isLegalArithImmed((uint32_t)(C + 1))) ||
1051            (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
1052             isLegalArithImmed(C + 1ULL))) {
1053          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1054          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1055          RHS = DAG.getConstant(C, VT);
1056        }
1057        break;
1058      case ISD::SETULE:
1059      case ISD::SETUGT:
1060        if ((VT == MVT::i32 && C != 0xffffffff &&
1061             isLegalArithImmed((uint32_t)(C + 1))) ||
1062            (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
1063             isLegalArithImmed(C + 1ULL))) {
1064          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1065          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1066          RHS = DAG.getConstant(C, VT);
1067        }
1068        break;
1069      }
1070    }
1071  }
1072
1073  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
1074  AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
1075  AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
1076  return Cmp;
1077}
1078
1079static std::pair<SDValue, SDValue>
1080getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
1081  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
1082         "Unsupported value type");
1083  SDValue Value, Overflow;
1084  SDLoc DL(Op);
1085  SDValue LHS = Op.getOperand(0);
1086  SDValue RHS = Op.getOperand(1);
1087  unsigned Opc = 0;
1088  switch (Op.getOpcode()) {
1089  default:
1090    llvm_unreachable("Unknown overflow instruction!");
1091  case ISD::SADDO:
1092    Opc = AArch64ISD::ADDS;
1093    CC = AArch64CC::VS;
1094    break;
1095  case ISD::UADDO:
1096    Opc = AArch64ISD::ADDS;
1097    CC = AArch64CC::HS;
1098    break;
1099  case ISD::SSUBO:
1100    Opc = AArch64ISD::SUBS;
1101    CC = AArch64CC::VS;
1102    break;
1103  case ISD::USUBO:
1104    Opc = AArch64ISD::SUBS;
1105    CC = AArch64CC::LO;
1106    break;
1107  // Multiply needs a little bit extra work.
1108  case ISD::SMULO:
1109  case ISD::UMULO: {
1110    CC = AArch64CC::NE;
1111    bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
1112    if (Op.getValueType() == MVT::i32) {
1113      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1114      // For a 32 bit multiply with overflow check we want the instruction
1115      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
1116      // need to generate the following pattern:
1117      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
1118      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
1119      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
1120      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1121      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
1122                                DAG.getConstant(0, MVT::i64));
1123      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
1124      // operation. We need to clear out the upper 32 bits, because we used a
1125      // widening multiply that wrote all 64 bits. In the end this should be a
1126      // noop.
1127      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
1128      if (IsSigned) {
1129        // The signed overflow check requires more than just a simple check for
1130        // any bit set in the upper 32 bits of the result. These bits could be
1131        // just the sign bits of a negative number. To perform the overflow
1132        // check we have to arithmetic shift right the 32nd bit of the result by
1133        // 31 bits. Then we compare the result to the upper 32 bits.
1134        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
1135                                        DAG.getConstant(32, MVT::i64));
1136        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
1137        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
1138                                        DAG.getConstant(31, MVT::i64));
1139        // It is important that LowerBits is last, otherwise the arithmetic
1140        // shift will not be folded into the compare (SUBS).
1141        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
1142        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1143                       .getValue(1);
1144      } else {
1145        // The overflow check for unsigned multiply is easy. We only need to
1146        // check if any of the upper 32 bits are set. This can be done with a
1147        // CMP (shifted register). For that we need to generate the following
1148        // pattern:
1149        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
1150        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
1151                                        DAG.getConstant(32, MVT::i64));
1152        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1153        Overflow =
1154            DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
1155                        UpperBits).getValue(1);
1156      }
1157      break;
1158    }
1159    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
1160    // For the 64 bit multiply
1161    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1162    if (IsSigned) {
1163      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
1164      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
1165                                      DAG.getConstant(63, MVT::i64));
1166      // It is important that LowerBits is last, otherwise the arithmetic
1167      // shift will not be folded into the compare (SUBS).
1168      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1169      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1170                     .getValue(1);
1171    } else {
1172      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
1173      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1174      Overflow =
1175          DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
1176                      UpperBits).getValue(1);
1177    }
1178    break;
1179  }
1180  } // switch (...)
1181
1182  if (Opc) {
1183    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
1184
1185    // Emit the AArch64 operation with overflow check.
1186    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
1187    Overflow = Value.getValue(1);
1188  }
1189  return std::make_pair(Value, Overflow);
1190}
1191
1192SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
1193                                             RTLIB::Libcall Call) const {
1194  SmallVector<SDValue, 2> Ops;
1195  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
1196    Ops.push_back(Op.getOperand(i));
1197
1198  return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
1199                     SDLoc(Op)).first;
1200}
1201
1202static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
1203  SDValue Sel = Op.getOperand(0);
1204  SDValue Other = Op.getOperand(1);
1205
1206  // If neither operand is a SELECT_CC, give up.
1207  if (Sel.getOpcode() != ISD::SELECT_CC)
1208    std::swap(Sel, Other);
1209  if (Sel.getOpcode() != ISD::SELECT_CC)
1210    return Op;
1211
1212  // The folding we want to perform is:
1213  // (xor x, (select_cc a, b, cc, 0, -1) )
1214  //   -->
1215  // (csel x, (xor x, -1), cc ...)
1216  //
1217  // The latter will get matched to a CSINV instruction.
1218
1219  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
1220  SDValue LHS = Sel.getOperand(0);
1221  SDValue RHS = Sel.getOperand(1);
1222  SDValue TVal = Sel.getOperand(2);
1223  SDValue FVal = Sel.getOperand(3);
1224  SDLoc dl(Sel);
1225
1226  // FIXME: This could be generalized to non-integer comparisons.
1227  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
1228    return Op;
1229
1230  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
1231  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
1232
1233  // The the values aren't constants, this isn't the pattern we're looking for.
1234  if (!CFVal || !CTVal)
1235    return Op;
1236
1237  // We can commute the SELECT_CC by inverting the condition.  This
1238  // might be needed to make this fit into a CSINV pattern.
1239  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
1240    std::swap(TVal, FVal);
1241    std::swap(CTVal, CFVal);
1242    CC = ISD::getSetCCInverse(CC, true);
1243  }
1244
1245  // If the constants line up, perform the transform!
1246  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
1247    SDValue CCVal;
1248    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
1249
1250    FVal = Other;
1251    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
1252                       DAG.getConstant(-1ULL, Other.getValueType()));
1253
1254    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
1255                       CCVal, Cmp);
1256  }
1257
1258  return Op;
1259}
1260
1261static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
1262  EVT VT = Op.getValueType();
1263
1264  // Let legalize expand this if it isn't a legal type yet.
1265  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
1266    return SDValue();
1267
1268  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
1269
1270  unsigned Opc;
1271  bool ExtraOp = false;
1272  switch (Op.getOpcode()) {
1273  default:
1274    llvm_unreachable("Invalid code");
1275  case ISD::ADDC:
1276    Opc = AArch64ISD::ADDS;
1277    break;
1278  case ISD::SUBC:
1279    Opc = AArch64ISD::SUBS;
1280    break;
1281  case ISD::ADDE:
1282    Opc = AArch64ISD::ADCS;
1283    ExtraOp = true;
1284    break;
1285  case ISD::SUBE:
1286    Opc = AArch64ISD::SBCS;
1287    ExtraOp = true;
1288    break;
1289  }
1290
1291  if (!ExtraOp)
1292    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
1293  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
1294                     Op.getOperand(2));
1295}
1296
1297static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
1298  // Let legalize expand this if it isn't a legal type yet.
1299  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
1300    return SDValue();
1301
1302  AArch64CC::CondCode CC;
1303  // The actual operation that sets the overflow or carry flag.
1304  SDValue Value, Overflow;
1305  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
1306
1307  // We use 0 and 1 as false and true values.
1308  SDValue TVal = DAG.getConstant(1, MVT::i32);
1309  SDValue FVal = DAG.getConstant(0, MVT::i32);
1310
1311  // We use an inverted condition, because the conditional select is inverted
1312  // too. This will allow it to be selected to a single instruction:
1313  // CSINC Wd, WZR, WZR, invert(cond).
1314  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
1315  Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal,
1316                         CCVal, Overflow);
1317
1318  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
1319  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
1320}
1321
1322// Prefetch operands are:
1323// 1: Address to prefetch
1324// 2: bool isWrite
1325// 3: int locality (0 = no locality ... 3 = extreme locality)
1326// 4: bool isDataCache
1327static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
1328  SDLoc DL(Op);
1329  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
1330  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
1331  // The data thing is not used.
1332  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
1333
1334  bool IsStream = !Locality;
1335  // When the locality number is set
1336  if (Locality) {
1337    // The front-end should have filtered out the out-of-range values
1338    assert(Locality <= 3 && "Prefetch locality out-of-range");
1339    // The locality degree is the opposite of the cache speed.
1340    // Put the number the other way around.
1341    // The encoding starts at 0 for level 1
1342    Locality = 3 - Locality;
1343  }
1344
1345  // built the mask value encoding the expected behavior.
1346  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
1347                   (Locality << 1) |    // Cache level bits
1348                   (unsigned)IsStream;  // Stream bit
1349  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
1350                     DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
1351}
1352
1353SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
1354                                              SelectionDAG &DAG) const {
1355  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
1356
1357  RTLIB::Libcall LC;
1358  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
1359
1360  return LowerF128Call(Op, DAG, LC);
1361}
1362
1363SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
1364                                             SelectionDAG &DAG) const {
1365  if (Op.getOperand(0).getValueType() != MVT::f128) {
1366    // It's legal except when f128 is involved
1367    return Op;
1368  }
1369
1370  RTLIB::Libcall LC;
1371  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
1372
1373  // FP_ROUND node has a second operand indicating whether it is known to be
1374  // precise. That doesn't take part in the LibCall so we can't directly use
1375  // LowerF128Call.
1376  SDValue SrcVal = Op.getOperand(0);
1377  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
1378                     /*isSigned*/ false, SDLoc(Op)).first;
1379}
1380
1381static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
1382  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
1383  // Any additional optimization in this function should be recorded
1384  // in the cost tables.
1385  EVT InVT = Op.getOperand(0).getValueType();
1386  EVT VT = Op.getValueType();
1387
1388  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
1389    SDLoc dl(Op);
1390    SDValue Cv =
1391        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
1392                    Op.getOperand(0));
1393    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
1394  }
1395
1396  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
1397    SDLoc dl(Op);
1398    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
1399    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
1400  }
1401
1402  // Type changing conversions are illegal.
1403  return Op;
1404}
1405
1406SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
1407                                              SelectionDAG &DAG) const {
1408  if (Op.getOperand(0).getValueType().isVector())
1409    return LowerVectorFP_TO_INT(Op, DAG);
1410
1411  if (Op.getOperand(0).getValueType() != MVT::f128) {
1412    // It's legal except when f128 is involved
1413    return Op;
1414  }
1415
1416  RTLIB::Libcall LC;
1417  if (Op.getOpcode() == ISD::FP_TO_SINT)
1418    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
1419  else
1420    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
1421
1422  SmallVector<SDValue, 2> Ops;
1423  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
1424    Ops.push_back(Op.getOperand(i));
1425
1426  return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
1427                     SDLoc(Op)).first;
1428}
1429
1430static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
1431  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
1432  // Any additional optimization in this function should be recorded
1433  // in the cost tables.
1434  EVT VT = Op.getValueType();
1435  SDLoc dl(Op);
1436  SDValue In = Op.getOperand(0);
1437  EVT InVT = In.getValueType();
1438
1439  if (VT.getSizeInBits() < InVT.getSizeInBits()) {
1440    MVT CastVT =
1441        MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
1442                         InVT.getVectorNumElements());
1443    In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
1444    return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0));
1445  }
1446
1447  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
1448    unsigned CastOpc =
1449        Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1450    EVT CastVT = VT.changeVectorElementTypeToInteger();
1451    In = DAG.getNode(CastOpc, dl, CastVT, In);
1452    return DAG.getNode(Op.getOpcode(), dl, VT, In);
1453  }
1454
1455  return Op;
1456}
1457
1458SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
1459                                            SelectionDAG &DAG) const {
1460  if (Op.getValueType().isVector())
1461    return LowerVectorINT_TO_FP(Op, DAG);
1462
1463  // i128 conversions are libcalls.
1464  if (Op.getOperand(0).getValueType() == MVT::i128)
1465    return SDValue();
1466
1467  // Other conversions are legal, unless it's to the completely software-based
1468  // fp128.
1469  if (Op.getValueType() != MVT::f128)
1470    return Op;
1471
1472  RTLIB::Libcall LC;
1473  if (Op.getOpcode() == ISD::SINT_TO_FP)
1474    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
1475  else
1476    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
1477
1478  return LowerF128Call(Op, DAG, LC);
1479}
1480
1481SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
1482                                            SelectionDAG &DAG) const {
1483  // For iOS, we want to call an alternative entry point: __sincos_stret,
1484  // which returns the values in two S / D registers.
1485  SDLoc dl(Op);
1486  SDValue Arg = Op.getOperand(0);
1487  EVT ArgVT = Arg.getValueType();
1488  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
1489
1490  ArgListTy Args;
1491  ArgListEntry Entry;
1492
1493  Entry.Node = Arg;
1494  Entry.Ty = ArgTy;
1495  Entry.isSExt = false;
1496  Entry.isZExt = false;
1497  Args.push_back(Entry);
1498
1499  const char *LibcallName =
1500      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
1501  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
1502
1503  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
1504  TargetLowering::CallLoweringInfo CLI(DAG);
1505  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
1506    .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
1507
1508  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
1509  return CallResult.first;
1510}
1511
1512SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
1513                                              SelectionDAG &DAG) const {
1514  switch (Op.getOpcode()) {
1515  default:
1516    llvm_unreachable("unimplemented operand");
1517    return SDValue();
1518  case ISD::GlobalAddress:
1519    return LowerGlobalAddress(Op, DAG);
1520  case ISD::GlobalTLSAddress:
1521    return LowerGlobalTLSAddress(Op, DAG);
1522  case ISD::SETCC:
1523    return LowerSETCC(Op, DAG);
1524  case ISD::BR_CC:
1525    return LowerBR_CC(Op, DAG);
1526  case ISD::SELECT:
1527    return LowerSELECT(Op, DAG);
1528  case ISD::SELECT_CC:
1529    return LowerSELECT_CC(Op, DAG);
1530  case ISD::JumpTable:
1531    return LowerJumpTable(Op, DAG);
1532  case ISD::ConstantPool:
1533    return LowerConstantPool(Op, DAG);
1534  case ISD::BlockAddress:
1535    return LowerBlockAddress(Op, DAG);
1536  case ISD::VASTART:
1537    return LowerVASTART(Op, DAG);
1538  case ISD::VACOPY:
1539    return LowerVACOPY(Op, DAG);
1540  case ISD::VAARG:
1541    return LowerVAARG(Op, DAG);
1542  case ISD::ADDC:
1543  case ISD::ADDE:
1544  case ISD::SUBC:
1545  case ISD::SUBE:
1546    return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
1547  case ISD::SADDO:
1548  case ISD::UADDO:
1549  case ISD::SSUBO:
1550  case ISD::USUBO:
1551  case ISD::SMULO:
1552  case ISD::UMULO:
1553    return LowerXALUO(Op, DAG);
1554  case ISD::FADD:
1555    return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
1556  case ISD::FSUB:
1557    return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
1558  case ISD::FMUL:
1559    return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
1560  case ISD::FDIV:
1561    return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
1562  case ISD::FP_ROUND:
1563    return LowerFP_ROUND(Op, DAG);
1564  case ISD::FP_EXTEND:
1565    return LowerFP_EXTEND(Op, DAG);
1566  case ISD::FRAMEADDR:
1567    return LowerFRAMEADDR(Op, DAG);
1568  case ISD::RETURNADDR:
1569    return LowerRETURNADDR(Op, DAG);
1570  case ISD::INSERT_VECTOR_ELT:
1571    return LowerINSERT_VECTOR_ELT(Op, DAG);
1572  case ISD::EXTRACT_VECTOR_ELT:
1573    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
1574  case ISD::BUILD_VECTOR:
1575    return LowerBUILD_VECTOR(Op, DAG);
1576  case ISD::VECTOR_SHUFFLE:
1577    return LowerVECTOR_SHUFFLE(Op, DAG);
1578  case ISD::EXTRACT_SUBVECTOR:
1579    return LowerEXTRACT_SUBVECTOR(Op, DAG);
1580  case ISD::SRA:
1581  case ISD::SRL:
1582  case ISD::SHL:
1583    return LowerVectorSRA_SRL_SHL(Op, DAG);
1584  case ISD::SHL_PARTS:
1585    return LowerShiftLeftParts(Op, DAG);
1586  case ISD::SRL_PARTS:
1587  case ISD::SRA_PARTS:
1588    return LowerShiftRightParts(Op, DAG);
1589  case ISD::CTPOP:
1590    return LowerCTPOP(Op, DAG);
1591  case ISD::FCOPYSIGN:
1592    return LowerFCOPYSIGN(Op, DAG);
1593  case ISD::AND:
1594    return LowerVectorAND(Op, DAG);
1595  case ISD::OR:
1596    return LowerVectorOR(Op, DAG);
1597  case ISD::XOR:
1598    return LowerXOR(Op, DAG);
1599  case ISD::PREFETCH:
1600    return LowerPREFETCH(Op, DAG);
1601  case ISD::SINT_TO_FP:
1602  case ISD::UINT_TO_FP:
1603    return LowerINT_TO_FP(Op, DAG);
1604  case ISD::FP_TO_SINT:
1605  case ISD::FP_TO_UINT:
1606    return LowerFP_TO_INT(Op, DAG);
1607  case ISD::FSINCOS:
1608    return LowerFSINCOS(Op, DAG);
1609  }
1610}
1611
1612/// getFunctionAlignment - Return the Log2 alignment of this function.
1613unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
1614  return 2;
1615}
1616
1617//===----------------------------------------------------------------------===//
1618//                      Calling Convention Implementation
1619//===----------------------------------------------------------------------===//
1620
1621#include "AArch64GenCallingConv.inc"
1622
1623/// Selects the correct CCAssignFn for a the given CallingConvention
1624/// value.
1625CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1626                                                     bool IsVarArg) const {
1627  switch (CC) {
1628  default:
1629    llvm_unreachable("Unsupported calling convention.");
1630  case CallingConv::WebKit_JS:
1631    return CC_AArch64_WebKit_JS;
1632  case CallingConv::C:
1633  case CallingConv::Fast:
1634    if (!Subtarget->isTargetDarwin())
1635      return CC_AArch64_AAPCS;
1636    return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
1637  }
1638}
1639
1640SDValue AArch64TargetLowering::LowerFormalArguments(
1641    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1642    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
1643    SmallVectorImpl<SDValue> &InVals) const {
1644  MachineFunction &MF = DAG.getMachineFunction();
1645  MachineFrameInfo *MFI = MF.getFrameInfo();
1646
1647  // Assign locations to all of the incoming arguments.
1648  SmallVector<CCValAssign, 16> ArgLocs;
1649  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1650                 getTargetMachine(), ArgLocs, *DAG.getContext());
1651
1652  // At this point, Ins[].VT may already be promoted to i32. To correctly
1653  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
1654  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
1655  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
1656  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
1657  // LocVT.
1658  unsigned NumArgs = Ins.size();
1659  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
1660  unsigned CurArgIdx = 0;
1661  for (unsigned i = 0; i != NumArgs; ++i) {
1662    MVT ValVT = Ins[i].VT;
1663    std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
1664    CurArgIdx = Ins[i].OrigArgIndex;
1665
1666    // Get type of the original argument.
1667    EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
1668    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
1669    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
1670    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
1671      ValVT = MVT::i8;
1672    else if (ActualMVT == MVT::i16)
1673      ValVT = MVT::i16;
1674
1675    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
1676    bool Res =
1677        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
1678    assert(!Res && "Call operand has unhandled type");
1679    (void)Res;
1680  }
1681  assert(ArgLocs.size() == Ins.size());
1682  SmallVector<SDValue, 16> ArgValues;
1683  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1684    CCValAssign &VA = ArgLocs[i];
1685
1686    if (Ins[i].Flags.isByVal()) {
1687      // Byval is used for HFAs in the PCS, but the system should work in a
1688      // non-compliant manner for larger structs.
1689      EVT PtrTy = getPointerTy();
1690      int Size = Ins[i].Flags.getByValSize();
1691      unsigned NumRegs = (Size + 7) / 8;
1692
1693      // FIXME: This works on big-endian for composite byvals, which are the common
1694      // case. It should also work for fundamental types too.
1695      unsigned FrameIdx =
1696        MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
1697      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
1698      InVals.push_back(FrameIdxN);
1699
1700      continue;
1701    }
1702
1703    if (VA.isRegLoc()) {
1704      // Arguments stored in registers.
1705      EVT RegVT = VA.getLocVT();
1706
1707      SDValue ArgValue;
1708      const TargetRegisterClass *RC;
1709
1710      if (RegVT == MVT::i32)
1711        RC = &AArch64::GPR32RegClass;
1712      else if (RegVT == MVT::i64)
1713        RC = &AArch64::GPR64RegClass;
1714      else if (RegVT == MVT::f32)
1715        RC = &AArch64::FPR32RegClass;
1716      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
1717        RC = &AArch64::FPR64RegClass;
1718      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
1719        RC = &AArch64::FPR128RegClass;
1720      else
1721        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
1722
1723      // Transform the arguments in physical registers into virtual ones.
1724      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1725      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
1726
1727      // If this is an 8, 16 or 32-bit value, it is really passed promoted
1728      // to 64 bits.  Insert an assert[sz]ext to capture this, then
1729      // truncate to the right size.
1730      switch (VA.getLocInfo()) {
1731      default:
1732        llvm_unreachable("Unknown loc info!");
1733      case CCValAssign::Full:
1734        break;
1735      case CCValAssign::BCvt:
1736        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
1737        break;
1738      case CCValAssign::AExt:
1739      case CCValAssign::SExt:
1740      case CCValAssign::ZExt:
1741        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
1742        // nodes after our lowering.
1743        assert(RegVT == Ins[i].VT && "incorrect register location selected");
1744        break;
1745      }
1746
1747      InVals.push_back(ArgValue);
1748
1749    } else { // VA.isRegLoc()
1750      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
1751      unsigned ArgOffset = VA.getLocMemOffset();
1752      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
1753
1754      uint32_t BEAlign = 0;
1755      if (ArgSize < 8 && !Subtarget->isLittleEndian())
1756        BEAlign = 8 - ArgSize;
1757
1758      int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
1759
1760      // Create load nodes to retrieve arguments from the stack.
1761      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1762      SDValue ArgValue;
1763
1764      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1765      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1766      MVT MemVT = VA.getValVT();
1767
1768      switch (VA.getLocInfo()) {
1769      default:
1770        break;
1771      case CCValAssign::BCvt:
1772        MemVT = VA.getLocVT();
1773        break;
1774      case CCValAssign::SExt:
1775        ExtType = ISD::SEXTLOAD;
1776        break;
1777      case CCValAssign::ZExt:
1778        ExtType = ISD::ZEXTLOAD;
1779        break;
1780      case CCValAssign::AExt:
1781        ExtType = ISD::EXTLOAD;
1782        break;
1783      }
1784
1785      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
1786                                MachinePointerInfo::getFixedStack(FI),
1787                                MemVT, false, false, false, nullptr);
1788
1789      InVals.push_back(ArgValue);
1790    }
1791  }
1792
1793  // varargs
1794  if (isVarArg) {
1795    if (!Subtarget->isTargetDarwin()) {
1796      // The AAPCS variadic function ABI is identical to the non-variadic
1797      // one. As a result there may be more arguments in registers and we should
1798      // save them for future reference.
1799      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
1800    }
1801
1802    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1803    // This will point to the next argument passed via stack.
1804    unsigned StackOffset = CCInfo.getNextStackOffset();
1805    // We currently pass all varargs at 8-byte alignment.
1806    StackOffset = ((StackOffset + 7) & ~7);
1807    AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
1808  }
1809
1810  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1811  unsigned StackArgSize = CCInfo.getNextStackOffset();
1812  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
1813  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
1814    // This is a non-standard ABI so by fiat I say we're allowed to make full
1815    // use of the stack area to be popped, which must be aligned to 16 bytes in
1816    // any case:
1817    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
1818
1819    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
1820    // a multiple of 16.
1821    FuncInfo->setArgumentStackToRestore(StackArgSize);
1822
1823    // This realignment carries over to the available bytes below. Our own
1824    // callers will guarantee the space is free by giving an aligned value to
1825    // CALLSEQ_START.
1826  }
1827  // Even if we're not expected to free up the space, it's useful to know how
1828  // much is there while considering tail calls (because we can reuse it).
1829  FuncInfo->setBytesInStackArgArea(StackArgSize);
1830
1831  return Chain;
1832}
1833
1834void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
1835                                                SelectionDAG &DAG, SDLoc DL,
1836                                                SDValue &Chain) const {
1837  MachineFunction &MF = DAG.getMachineFunction();
1838  MachineFrameInfo *MFI = MF.getFrameInfo();
1839  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1840
1841  SmallVector<SDValue, 8> MemOps;
1842
1843  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
1844                                          AArch64::X3, AArch64::X4, AArch64::X5,
1845                                          AArch64::X6, AArch64::X7 };
1846  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
1847  unsigned FirstVariadicGPR =
1848      CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
1849
1850  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
1851  int GPRIdx = 0;
1852  if (GPRSaveSize != 0) {
1853    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
1854
1855    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
1856
1857    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
1858      unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
1859      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
1860      SDValue Store =
1861          DAG.getStore(Val.getValue(1), DL, Val, FIN,
1862                       MachinePointerInfo::getStack(i * 8), false, false, 0);
1863      MemOps.push_back(Store);
1864      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1865                        DAG.getConstant(8, getPointerTy()));
1866    }
1867  }
1868  FuncInfo->setVarArgsGPRIndex(GPRIdx);
1869  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
1870
1871  if (Subtarget->hasFPARMv8()) {
1872    static const MCPhysReg FPRArgRegs[] = {
1873        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
1874        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
1875    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
1876    unsigned FirstVariadicFPR =
1877        CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
1878
1879    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
1880    int FPRIdx = 0;
1881    if (FPRSaveSize != 0) {
1882      FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
1883
1884      SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
1885
1886      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
1887        unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
1888        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
1889
1890        SDValue Store =
1891            DAG.getStore(Val.getValue(1), DL, Val, FIN,
1892                         MachinePointerInfo::getStack(i * 16), false, false, 0);
1893        MemOps.push_back(Store);
1894        FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1895                          DAG.getConstant(16, getPointerTy()));
1896      }
1897    }
1898    FuncInfo->setVarArgsFPRIndex(FPRIdx);
1899    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
1900  }
1901
1902  if (!MemOps.empty()) {
1903    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
1904  }
1905}
1906
1907/// LowerCallResult - Lower the result values of a call into the
1908/// appropriate copies out of appropriate physical registers.
1909SDValue AArch64TargetLowering::LowerCallResult(
1910    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
1911    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
1912    SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1913    SDValue ThisVal) const {
1914  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
1915                          ? RetCC_AArch64_WebKit_JS
1916                          : RetCC_AArch64_AAPCS;
1917  // Assign locations to each value returned by this call.
1918  SmallVector<CCValAssign, 16> RVLocs;
1919  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1920                 getTargetMachine(), RVLocs, *DAG.getContext());
1921  CCInfo.AnalyzeCallResult(Ins, RetCC);
1922
1923  // Copy all of the result registers out of their specified physreg.
1924  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1925    CCValAssign VA = RVLocs[i];
1926
1927    // Pass 'this' value directly from the argument to return value, to avoid
1928    // reg unit interference
1929    if (i == 0 && isThisReturn) {
1930      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
1931             "unexpected return calling convention register assignment");
1932      InVals.push_back(ThisVal);
1933      continue;
1934    }
1935
1936    SDValue Val =
1937        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
1938    Chain = Val.getValue(1);
1939    InFlag = Val.getValue(2);
1940
1941    switch (VA.getLocInfo()) {
1942    default:
1943      llvm_unreachable("Unknown loc info!");
1944    case CCValAssign::Full:
1945      break;
1946    case CCValAssign::BCvt:
1947      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
1948      break;
1949    }
1950
1951    InVals.push_back(Val);
1952  }
1953
1954  return Chain;
1955}
1956
1957bool AArch64TargetLowering::isEligibleForTailCallOptimization(
1958    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
1959    bool isCalleeStructRet, bool isCallerStructRet,
1960    const SmallVectorImpl<ISD::OutputArg> &Outs,
1961    const SmallVectorImpl<SDValue> &OutVals,
1962    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
1963  // For CallingConv::C this function knows whether the ABI needs
1964  // changing. That's not true for other conventions so they will have to opt in
1965  // manually.
1966  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1967    return false;
1968
1969  const MachineFunction &MF = DAG.getMachineFunction();
1970  const Function *CallerF = MF.getFunction();
1971  CallingConv::ID CallerCC = CallerF->getCallingConv();
1972  bool CCMatch = CallerCC == CalleeCC;
1973
1974  // Byval parameters hand the function a pointer directly into the stack area
1975  // we want to reuse during a tail call. Working around this *is* possible (see
1976  // X86) but less efficient and uglier in LowerCall.
1977  for (Function::const_arg_iterator i = CallerF->arg_begin(),
1978                                    e = CallerF->arg_end();
1979       i != e; ++i)
1980    if (i->hasByValAttr())
1981      return false;
1982
1983  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
1984    if (IsTailCallConvention(CalleeCC) && CCMatch)
1985      return true;
1986    return false;
1987  }
1988
1989  // Now we search for cases where we can use a tail call without changing the
1990  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
1991  // concept.
1992
1993  // I want anyone implementing a new calling convention to think long and hard
1994  // about this assert.
1995  assert((!isVarArg || CalleeCC == CallingConv::C) &&
1996         "Unexpected variadic calling convention");
1997
1998  if (isVarArg && !Outs.empty()) {
1999    // At least two cases here: if caller is fastcc then we can't have any
2000    // memory arguments (we'd be expected to clean up the stack afterwards). If
2001    // caller is C then we could potentially use its argument area.
2002
2003    // FIXME: for now we take the most conservative of these in both cases:
2004    // disallow all variadic memory operands.
2005    SmallVector<CCValAssign, 16> ArgLocs;
2006    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2007                   getTargetMachine(), ArgLocs, *DAG.getContext());
2008
2009    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
2010    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
2011      if (!ArgLocs[i].isRegLoc())
2012        return false;
2013  }
2014
2015  // If the calling conventions do not match, then we'd better make sure the
2016  // results are returned in the same way as what the caller expects.
2017  if (!CCMatch) {
2018    SmallVector<CCValAssign, 16> RVLocs1;
2019    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
2020                    getTargetMachine(), RVLocs1, *DAG.getContext());
2021    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
2022
2023    SmallVector<CCValAssign, 16> RVLocs2;
2024    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
2025                    getTargetMachine(), RVLocs2, *DAG.getContext());
2026    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
2027
2028    if (RVLocs1.size() != RVLocs2.size())
2029      return false;
2030    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2031      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2032        return false;
2033      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2034        return false;
2035      if (RVLocs1[i].isRegLoc()) {
2036        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2037          return false;
2038      } else {
2039        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2040          return false;
2041      }
2042    }
2043  }
2044
2045  // Nothing more to check if the callee is taking no arguments
2046  if (Outs.empty())
2047    return true;
2048
2049  SmallVector<CCValAssign, 16> ArgLocs;
2050  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2051                 getTargetMachine(), ArgLocs, *DAG.getContext());
2052
2053  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
2054
2055  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2056
2057  // If the stack arguments for this call would fit into our own save area then
2058  // the call can be made tail.
2059  return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
2060}
2061
2062SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
2063                                                   SelectionDAG &DAG,
2064                                                   MachineFrameInfo *MFI,
2065                                                   int ClobberedFI) const {
2066  SmallVector<SDValue, 8> ArgChains;
2067  int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
2068  int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
2069
2070  // Include the original chain at the beginning of the list. When this is
2071  // used by target LowerCall hooks, this helps legalize find the
2072  // CALLSEQ_BEGIN node.
2073  ArgChains.push_back(Chain);
2074
2075  // Add a chain value for each stack argument corresponding
2076  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
2077                            UE = DAG.getEntryNode().getNode()->use_end();
2078       U != UE; ++U)
2079    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
2080      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
2081        if (FI->getIndex() < 0) {
2082          int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
2083          int64_t InLastByte = InFirstByte;
2084          InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
2085
2086          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
2087              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
2088            ArgChains.push_back(SDValue(L, 1));
2089        }
2090
2091  // Build a tokenfactor for all the chains.
2092  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
2093}
2094
2095bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
2096                                                   bool TailCallOpt) const {
2097  return CallCC == CallingConv::Fast && TailCallOpt;
2098}
2099
2100bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
2101  return CallCC == CallingConv::Fast;
2102}
2103
2104/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
2105/// and add input and output parameter nodes.
2106SDValue
2107AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
2108                                 SmallVectorImpl<SDValue> &InVals) const {
2109  SelectionDAG &DAG = CLI.DAG;
2110  SDLoc &DL = CLI.DL;
2111  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2112  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2113  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2114  SDValue Chain = CLI.Chain;
2115  SDValue Callee = CLI.Callee;
2116  bool &IsTailCall = CLI.IsTailCall;
2117  CallingConv::ID CallConv = CLI.CallConv;
2118  bool IsVarArg = CLI.IsVarArg;
2119
2120  MachineFunction &MF = DAG.getMachineFunction();
2121  bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2122  bool IsThisReturn = false;
2123
2124  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2125  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2126  bool IsSibCall = false;
2127
2128  if (IsTailCall) {
2129    // Check if it's really possible to do a tail call.
2130    IsTailCall = isEligibleForTailCallOptimization(
2131        Callee, CallConv, IsVarArg, IsStructRet,
2132        MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
2133    if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
2134      report_fatal_error("failed to perform tail call elimination on a call "
2135                         "site marked musttail");
2136
2137    // A sibling call is one where we're under the usual C ABI and not planning
2138    // to change that but can still do a tail call:
2139    if (!TailCallOpt && IsTailCall)
2140      IsSibCall = true;
2141
2142    if (IsTailCall)
2143      ++NumTailCalls;
2144  }
2145
2146  // Analyze operands of the call, assigning locations to each operand.
2147  SmallVector<CCValAssign, 16> ArgLocs;
2148  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
2149                 getTargetMachine(), ArgLocs, *DAG.getContext());
2150
2151  if (IsVarArg) {
2152    // Handle fixed and variable vector arguments differently.
2153    // Variable vector arguments always go into memory.
2154    unsigned NumArgs = Outs.size();
2155
2156    for (unsigned i = 0; i != NumArgs; ++i) {
2157      MVT ArgVT = Outs[i].VT;
2158      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
2159      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
2160                                               /*IsVarArg=*/ !Outs[i].IsFixed);
2161      bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
2162      assert(!Res && "Call operand has unhandled type");
2163      (void)Res;
2164    }
2165  } else {
2166    // At this point, Outs[].VT may already be promoted to i32. To correctly
2167    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
2168    // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
2169    // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
2170    // we use a special version of AnalyzeCallOperands to pass in ValVT and
2171    // LocVT.
2172    unsigned NumArgs = Outs.size();
2173    for (unsigned i = 0; i != NumArgs; ++i) {
2174      MVT ValVT = Outs[i].VT;
2175      // Get type of the original argument.
2176      EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
2177                                  /*AllowUnknown*/ true);
2178      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
2179      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
2180      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
2181      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
2182        ValVT = MVT::i8;
2183      else if (ActualMVT == MVT::i16)
2184        ValVT = MVT::i16;
2185
2186      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
2187      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
2188      assert(!Res && "Call operand has unhandled type");
2189      (void)Res;
2190    }
2191  }
2192
2193  // Get a count of how many bytes are to be pushed on the stack.
2194  unsigned NumBytes = CCInfo.getNextStackOffset();
2195
2196  if (IsSibCall) {
2197    // Since we're not changing the ABI to make this a tail call, the memory
2198    // operands are already available in the caller's incoming argument space.
2199    NumBytes = 0;
2200  }
2201
2202  // FPDiff is the byte offset of the call's argument area from the callee's.
2203  // Stores to callee stack arguments will be placed in FixedStackSlots offset
2204  // by this amount for a tail call. In a sibling call it must be 0 because the
2205  // caller will deallocate the entire stack and the callee still expects its
2206  // arguments to begin at SP+0. Completely unused for non-tail calls.
2207  int FPDiff = 0;
2208
2209  if (IsTailCall && !IsSibCall) {
2210    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
2211
2212    // Since callee will pop argument stack as a tail call, we must keep the
2213    // popped size 16-byte aligned.
2214    NumBytes = RoundUpToAlignment(NumBytes, 16);
2215
2216    // FPDiff will be negative if this tail call requires more space than we
2217    // would automatically have in our incoming argument space. Positive if we
2218    // can actually shrink the stack.
2219    FPDiff = NumReusableBytes - NumBytes;
2220
2221    // The stack pointer must be 16-byte aligned at all times it's used for a
2222    // memory operation, which in practice means at *all* times and in
2223    // particular across call boundaries. Therefore our own arguments started at
2224    // a 16-byte aligned SP and the delta applied for the tail call should
2225    // satisfy the same constraint.
2226    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
2227  }
2228
2229  // Adjust the stack pointer for the new arguments...
2230  // These operations are automatically eliminated by the prolog/epilog pass
2231  if (!IsSibCall)
2232    Chain =
2233        DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
2234
2235  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());
2236
2237  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2238  SmallVector<SDValue, 8> MemOpChains;
2239
2240  // Walk the register/memloc assignments, inserting copies/loads.
2241  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2242       ++i, ++realArgIdx) {
2243    CCValAssign &VA = ArgLocs[i];
2244    SDValue Arg = OutVals[realArgIdx];
2245    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2246
2247    // Promote the value if needed.
2248    switch (VA.getLocInfo()) {
2249    default:
2250      llvm_unreachable("Unknown loc info!");
2251    case CCValAssign::Full:
2252      break;
2253    case CCValAssign::SExt:
2254      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2255      break;
2256    case CCValAssign::ZExt:
2257      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2258      break;
2259    case CCValAssign::AExt:
2260      if (Outs[realArgIdx].ArgVT == MVT::i1) {
2261        // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
2262        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
2263        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
2264      }
2265      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2266      break;
2267    case CCValAssign::BCvt:
2268      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2269      break;
2270    case CCValAssign::FPExt:
2271      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2272      break;
2273    }
2274
2275    if (VA.isRegLoc()) {
2276      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
2277        assert(VA.getLocVT() == MVT::i64 &&
2278               "unexpected calling convention register assignment");
2279        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
2280               "unexpected use of 'returned'");
2281        IsThisReturn = true;
2282      }
2283      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2284    } else {
2285      assert(VA.isMemLoc());
2286
2287      SDValue DstAddr;
2288      MachinePointerInfo DstInfo;
2289
2290      // FIXME: This works on big-endian for composite byvals, which are the
2291      // common case. It should also work for fundamental types too.
2292      uint32_t BEAlign = 0;
2293      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
2294                                        : VA.getLocVT().getSizeInBits();
2295      OpSize = (OpSize + 7) / 8;
2296      if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
2297        if (OpSize < 8)
2298          BEAlign = 8 - OpSize;
2299      }
2300      unsigned LocMemOffset = VA.getLocMemOffset();
2301      int32_t Offset = LocMemOffset + BEAlign;
2302      SDValue PtrOff = DAG.getIntPtrConstant(Offset);
2303      PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
2304
2305      if (IsTailCall) {
2306        Offset = Offset + FPDiff;
2307        int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2308
2309        DstAddr = DAG.getFrameIndex(FI, getPointerTy());
2310        DstInfo = MachinePointerInfo::getFixedStack(FI);
2311
2312        // Make sure any stack arguments overlapping with where we're storing
2313        // are loaded before this eventual operation. Otherwise they'll be
2314        // clobbered.
2315        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
2316      } else {
2317        SDValue PtrOff = DAG.getIntPtrConstant(Offset);
2318
2319        DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
2320        DstInfo = MachinePointerInfo::getStack(LocMemOffset);
2321      }
2322
2323      if (Outs[i].Flags.isByVal()) {
2324        SDValue SizeNode =
2325            DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
2326        SDValue Cpy = DAG.getMemcpy(
2327            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2328            /*isVolatile = */ false,
2329            /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
2330
2331        MemOpChains.push_back(Cpy);
2332      } else {
2333        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
2334        // promoted to a legal register type i32, we should truncate Arg back to
2335        // i1/i8/i16.
2336        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
2337            VA.getValVT() == MVT::i16)
2338          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
2339
2340        SDValue Store =
2341            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
2342        MemOpChains.push_back(Store);
2343      }
2344    }
2345  }
2346
2347  if (!MemOpChains.empty())
2348    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2349
2350  // Build a sequence of copy-to-reg nodes chained together with token chain
2351  // and flag operands which copy the outgoing args into the appropriate regs.
2352  SDValue InFlag;
2353  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2354    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
2355                             RegsToPass[i].second, InFlag);
2356    InFlag = Chain.getValue(1);
2357  }
2358
2359  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2360  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2361  // node so that legalize doesn't hack it.
2362  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
2363      Subtarget->isTargetMachO()) {
2364    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2365      const GlobalValue *GV = G->getGlobal();
2366      bool InternalLinkage = GV->hasInternalLinkage();
2367      if (InternalLinkage)
2368        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
2369      else {
2370        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
2371                                            AArch64II::MO_GOT);
2372        Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
2373      }
2374    } else if (ExternalSymbolSDNode *S =
2375                   dyn_cast<ExternalSymbolSDNode>(Callee)) {
2376      const char *Sym = S->getSymbol();
2377      Callee =
2378          DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
2379      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
2380    }
2381  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2382    const GlobalValue *GV = G->getGlobal();
2383    Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
2384  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2385    const char *Sym = S->getSymbol();
2386    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
2387  }
2388
2389  // We don't usually want to end the call-sequence here because we would tidy
2390  // the frame up *after* the call, however in the ABI-changing tail-call case
2391  // we've carefully laid out the parameters so that when sp is reset they'll be
2392  // in the correct location.
2393  if (IsTailCall && !IsSibCall) {
2394    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2395                               DAG.getIntPtrConstant(0, true), InFlag, DL);
2396    InFlag = Chain.getValue(1);
2397  }
2398
2399  std::vector<SDValue> Ops;
2400  Ops.push_back(Chain);
2401  Ops.push_back(Callee);
2402
2403  if (IsTailCall) {
2404    // Each tail call may have to adjust the stack by a different amount, so
2405    // this information must travel along with the operation for eventual
2406    // consumption by emitEpilogue.
2407    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
2408  }
2409
2410  // Add argument registers to the end of the list so that they are known live
2411  // into the call.
2412  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2413    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2414                                  RegsToPass[i].second.getValueType()));
2415
2416  // Add a register mask operand representing the call-preserved registers.
2417  const uint32_t *Mask;
2418  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2419  const AArch64RegisterInfo *ARI =
2420      static_cast<const AArch64RegisterInfo *>(TRI);
2421  if (IsThisReturn) {
2422    // For 'this' returns, use the X0-preserving mask if applicable
2423    Mask = ARI->getThisReturnPreservedMask(CallConv);
2424    if (!Mask) {
2425      IsThisReturn = false;
2426      Mask = ARI->getCallPreservedMask(CallConv);
2427    }
2428  } else
2429    Mask = ARI->getCallPreservedMask(CallConv);
2430
2431  assert(Mask && "Missing call preserved mask for calling convention");
2432  Ops.push_back(DAG.getRegisterMask(Mask));
2433
2434  if (InFlag.getNode())
2435    Ops.push_back(InFlag);
2436
2437  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2438
2439  // If we're doing a tall call, use a TC_RETURN here rather than an
2440  // actual call instruction.
2441  if (IsTailCall)
2442    return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
2443
2444  // Returns a chain and a flag for retval copy to use.
2445  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
2446  InFlag = Chain.getValue(1);
2447
2448  uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
2449                                ? RoundUpToAlignment(NumBytes, 16)
2450                                : 0;
2451
2452  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2453                             DAG.getIntPtrConstant(CalleePopBytes, true),
2454                             InFlag, DL);
2455  if (!Ins.empty())
2456    InFlag = Chain.getValue(1);
2457
2458  // Handle result values, copying them out of physregs into vregs that we
2459  // return.
2460  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2461                         InVals, IsThisReturn,
2462                         IsThisReturn ? OutVals[0] : SDValue());
2463}
2464
2465bool AArch64TargetLowering::CanLowerReturn(
2466    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2467    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2468  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
2469                          ? RetCC_AArch64_WebKit_JS
2470                          : RetCC_AArch64_AAPCS;
2471  SmallVector<CCValAssign, 16> RVLocs;
2472  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
2473  return CCInfo.CheckReturn(Outs, RetCC);
2474}
2475
2476SDValue
2477AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2478                                   bool isVarArg,
2479                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2480                                   const SmallVectorImpl<SDValue> &OutVals,
2481                                   SDLoc DL, SelectionDAG &DAG) const {
2482  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
2483                          ? RetCC_AArch64_WebKit_JS
2484                          : RetCC_AArch64_AAPCS;
2485  SmallVector<CCValAssign, 16> RVLocs;
2486  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
2487                 getTargetMachine(), RVLocs, *DAG.getContext());
2488  CCInfo.AnalyzeReturn(Outs, RetCC);
2489
2490  // Copy the result values into the output registers.
2491  SDValue Flag;
2492  SmallVector<SDValue, 4> RetOps(1, Chain);
2493  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
2494       ++i, ++realRVLocIdx) {
2495    CCValAssign &VA = RVLocs[i];
2496    assert(VA.isRegLoc() && "Can only return in registers!");
2497    SDValue Arg = OutVals[realRVLocIdx];
2498
2499    switch (VA.getLocInfo()) {
2500    default:
2501      llvm_unreachable("Unknown loc info!");
2502    case CCValAssign::Full:
2503      if (Outs[i].ArgVT == MVT::i1) {
2504        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
2505        // value. This is strictly redundant on Darwin (which uses "zeroext
2506        // i1"), but will be optimised out before ISel.
2507        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
2508        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2509      }
2510      break;
2511    case CCValAssign::BCvt:
2512      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2513      break;
2514    }
2515
2516    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2517    Flag = Chain.getValue(1);
2518    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2519  }
2520
2521  RetOps[0] = Chain; // Update chain.
2522
2523  // Add the flag if we have it.
2524  if (Flag.getNode())
2525    RetOps.push_back(Flag);
2526
2527  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
2528}
2529
2530//===----------------------------------------------------------------------===//
2531//  Other Lowering Code
2532//===----------------------------------------------------------------------===//
2533
2534SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
2535                                                  SelectionDAG &DAG) const {
2536  EVT PtrVT = getPointerTy();
2537  SDLoc DL(Op);
2538  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2539  unsigned char OpFlags =
2540      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
2541
2542  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
2543         "unexpected offset in global node");
2544
2545  // This also catched the large code model case for Darwin.
2546  if ((OpFlags & AArch64II::MO_GOT) != 0) {
2547    SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
2548    // FIXME: Once remat is capable of dealing with instructions with register
2549    // operands, expand this into two nodes instead of using a wrapper node.
2550    return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
2551  }
2552
2553  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2554    const unsigned char MO_NC = AArch64II::MO_NC;
2555    return DAG.getNode(
2556        AArch64ISD::WrapperLarge, DL, PtrVT,
2557        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
2558        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
2559        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
2560        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
2561  } else {
2562    // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
2563    // the only correct model on Darwin.
2564    SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2565                                            OpFlags | AArch64II::MO_PAGE);
2566    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
2567    SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
2568
2569    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
2570    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
2571  }
2572}
2573
2574/// \brief Convert a TLS address reference into the correct sequence of loads
2575/// and calls to compute the variable's address (for Darwin, currently) and
2576/// return an SDValue containing the final node.
2577
2578/// Darwin only has one TLS scheme which must be capable of dealing with the
2579/// fully general situation, in the worst case. This means:
2580///     + "extern __thread" declaration.
2581///     + Defined in a possibly unknown dynamic library.
2582///
2583/// The general system is that each __thread variable has a [3 x i64] descriptor
2584/// which contains information used by the runtime to calculate the address. The
2585/// only part of this the compiler needs to know about is the first xword, which
2586/// contains a function pointer that must be called with the address of the
2587/// entire descriptor in "x0".
2588///
2589/// Since this descriptor may be in a different unit, in general even the
2590/// descriptor must be accessed via an indirect load. The "ideal" code sequence
2591/// is:
2592///     adrp x0, _var@TLVPPAGE
2593///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
2594///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
2595///                                      ; the function pointer
2596///     blr x1                           ; Uses descriptor address in x0
2597///     ; Address of _var is now in x0.
2598///
2599/// If the address of _var's descriptor *is* known to the linker, then it can
2600/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
2601/// a slight efficiency gain.
2602SDValue
2603AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
2604                                                   SelectionDAG &DAG) const {
2605  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
2606
2607  SDLoc DL(Op);
2608  MVT PtrVT = getPointerTy();
2609  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2610
2611  SDValue TLVPAddr =
2612      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
2613  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
2614
2615  // The first entry in the descriptor is a function pointer that we must call
2616  // to obtain the address of the variable.
2617  SDValue Chain = DAG.getEntryNode();
2618  SDValue FuncTLVGet =
2619      DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
2620                  false, true, true, 8);
2621  Chain = FuncTLVGet.getValue(1);
2622
2623  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
2624  MFI->setAdjustsStack(true);
2625
2626  // TLS calls preserve all registers except those that absolutely must be
2627  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
2628  // silly).
2629  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2630  const AArch64RegisterInfo *ARI =
2631      static_cast<const AArch64RegisterInfo *>(TRI);
2632  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
2633
2634  // Finally, we can make the call. This is just a degenerate version of a
2635  // normal AArch64 call node: x0 takes the address of the descriptor, and
2636  // returns the address of the variable in this thread.
2637  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
2638  Chain =
2639      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
2640                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
2641                  DAG.getRegisterMask(Mask), Chain.getValue(1));
2642  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
2643}
2644
2645/// When accessing thread-local variables under either the general-dynamic or
2646/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
2647/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
2648/// is a function pointer to carry out the resolution. This function takes the
2649/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
2650/// other registers (except LR, NZCV) are preserved.
2651///
2652/// Thus, the ideal call sequence on AArch64 is:
2653///
2654///     adrp x0, :tlsdesc:thread_var
2655///     ldr x8, [x0, :tlsdesc_lo12:thread_var]
2656///     add x0, x0, :tlsdesc_lo12:thread_var
2657///     .tlsdesccall thread_var
2658///     blr x8
2659///     (TPIDR_EL0 offset now in x0).
2660///
2661/// The ".tlsdesccall" directive instructs the assembler to insert a particular
2662/// relocation to help the linker relax this sequence if it turns out to be too
2663/// conservative.
2664///
2665/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
2666/// is harmless.
2667SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
2668                                                   SDValue DescAddr, SDLoc DL,
2669                                                   SelectionDAG &DAG) const {
2670  EVT PtrVT = getPointerTy();
2671
2672  // The function we need to call is simply the first entry in the GOT for this
2673  // descriptor, load it in preparation.
2674  SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr);
2675
2676  // TLS calls preserve all registers except those that absolutely must be
2677  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
2678  // silly).
2679  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
2680  const AArch64RegisterInfo *ARI =
2681      static_cast<const AArch64RegisterInfo *>(TRI);
2682  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
2683
2684  // The function takes only one argument: the address of the descriptor itself
2685  // in X0.
2686  SDValue Glue, Chain;
2687  Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
2688  Glue = Chain.getValue(1);
2689
2690  // We're now ready to populate the argument list, as with a normal call:
2691  SmallVector<SDValue, 6> Ops;
2692  Ops.push_back(Chain);
2693  Ops.push_back(Func);
2694  Ops.push_back(SymAddr);
2695  Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
2696  Ops.push_back(DAG.getRegisterMask(Mask));
2697  Ops.push_back(Glue);
2698
2699  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2700  Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
2701  Glue = Chain.getValue(1);
2702
2703  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
2704}
2705
2706SDValue
2707AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
2708                                                SelectionDAG &DAG) const {
2709  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
2710  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
2711         "ELF TLS only supported in small memory model");
2712  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2713
2714  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
2715
2716  SDValue TPOff;
2717  EVT PtrVT = getPointerTy();
2718  SDLoc DL(Op);
2719  const GlobalValue *GV = GA->getGlobal();
2720
2721  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
2722
2723  if (Model == TLSModel::LocalExec) {
2724    SDValue HiVar = DAG.getTargetGlobalAddress(
2725        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
2726    SDValue LoVar = DAG.getTargetGlobalAddress(
2727        GV, DL, PtrVT, 0,
2728        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
2729
2730    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
2731                                       DAG.getTargetConstant(16, MVT::i32)),
2732                    0);
2733    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
2734                                       DAG.getTargetConstant(0, MVT::i32)),
2735                    0);
2736  } else if (Model == TLSModel::InitialExec) {
2737    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
2738    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
2739  } else if (Model == TLSModel::LocalDynamic) {
2740    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
2741    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
2742    // the beginning of the module's TLS region, followed by a DTPREL offset
2743    // calculation.
2744
2745    // These accesses will need deduplicating if there's more than one.
2746    AArch64FunctionInfo *MFI =
2747        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
2748    MFI->incNumLocalDynamicTLSAccesses();
2749
2750    // Accesses used in this sequence go via the TLS descriptor which lives in
2751    // the GOT. Prepare an address we can use to handle this.
2752    SDValue HiDesc = DAG.getTargetExternalSymbol(
2753        "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE);
2754    SDValue LoDesc = DAG.getTargetExternalSymbol(
2755        "_TLS_MODULE_BASE_", PtrVT,
2756        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2757
2758    // First argument to the descriptor call is the address of the descriptor
2759    // itself.
2760    SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
2761    DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
2762
2763    // The call needs a relocation too for linker relaxation. It doesn't make
2764    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
2765    // the address.
2766    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
2767                                                  AArch64II::MO_TLS);
2768
2769    // Now we can calculate the offset from TPIDR_EL0 to this module's
2770    // thread-local area.
2771    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
2772
2773    // Now use :dtprel_whatever: operations to calculate this variable's offset
2774    // in its thread-storage area.
2775    SDValue HiVar = DAG.getTargetGlobalAddress(
2776        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
2777    SDValue LoVar = DAG.getTargetGlobalAddress(
2778        GV, DL, MVT::i64, 0,
2779        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
2780
2781    SDValue DTPOff =
2782        SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
2783                                   DAG.getTargetConstant(16, MVT::i32)),
2784                0);
2785    DTPOff =
2786        SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
2787                                   DAG.getTargetConstant(0, MVT::i32)),
2788                0);
2789
2790    TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
2791  } else if (Model == TLSModel::GeneralDynamic) {
2792    // Accesses used in this sequence go via the TLS descriptor which lives in
2793    // the GOT. Prepare an address we can use to handle this.
2794    SDValue HiDesc = DAG.getTargetGlobalAddress(
2795        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE);
2796    SDValue LoDesc = DAG.getTargetGlobalAddress(
2797        GV, DL, PtrVT, 0,
2798        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2799
2800    // First argument to the descriptor call is the address of the descriptor
2801    // itself.
2802    SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
2803    DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
2804
2805    // The call needs a relocation too for linker relaxation. It doesn't make
2806    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
2807    // the address.
2808    SDValue SymAddr =
2809        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
2810
2811    // Finally we can make a call to calculate the offset from tpidr_el0.
2812    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
2813  } else
2814    llvm_unreachable("Unsupported ELF TLS access model");
2815
2816  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
2817}
2818
2819SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
2820                                                     SelectionDAG &DAG) const {
2821  if (Subtarget->isTargetDarwin())
2822    return LowerDarwinGlobalTLSAddress(Op, DAG);
2823  else if (Subtarget->isTargetELF())
2824    return LowerELFGlobalTLSAddress(Op, DAG);
2825
2826  llvm_unreachable("Unexpected platform trying to use TLS");
2827}
2828SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
2829  SDValue Chain = Op.getOperand(0);
2830  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
2831  SDValue LHS = Op.getOperand(2);
2832  SDValue RHS = Op.getOperand(3);
2833  SDValue Dest = Op.getOperand(4);
2834  SDLoc dl(Op);
2835
2836  // Handle f128 first, since lowering it will result in comparing the return
2837  // value of a libcall against zero, which is just what the rest of LowerBR_CC
2838  // is expecting to deal with.
2839  if (LHS.getValueType() == MVT::f128) {
2840    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
2841
2842    // If softenSetCCOperands returned a scalar, we need to compare the result
2843    // against zero to select between true and false values.
2844    if (!RHS.getNode()) {
2845      RHS = DAG.getConstant(0, LHS.getValueType());
2846      CC = ISD::SETNE;
2847    }
2848  }
2849
2850  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
2851  // instruction.
2852  unsigned Opc = LHS.getOpcode();
2853  if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
2854      cast<ConstantSDNode>(RHS)->isOne() &&
2855      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
2856       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
2857    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2858           "Unexpected condition code.");
2859    // Only lower legal XALUO ops.
2860    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
2861      return SDValue();
2862
2863    // The actual operation with overflow check.
2864    AArch64CC::CondCode OFCC;
2865    SDValue Value, Overflow;
2866    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
2867
2868    if (CC == ISD::SETNE)
2869      OFCC = getInvertedCondCode(OFCC);
2870    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
2871
2872    return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
2873                       CCVal, Overflow);
2874  }
2875
2876  if (LHS.getValueType().isInteger()) {
2877    assert((LHS.getValueType() == RHS.getValueType()) &&
2878           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
2879
2880    // If the RHS of the comparison is zero, we can potentially fold this
2881    // to a specialized branch.
2882    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
2883    if (RHSC && RHSC->getZExtValue() == 0) {
2884      if (CC == ISD::SETEQ) {
2885        // See if we can use a TBZ to fold in an AND as well.
2886        // TBZ has a smaller branch displacement than CBZ.  If the offset is
2887        // out of bounds, a late MI-layer pass rewrites branches.
2888        // 403.gcc is an example that hits this case.
2889        if (LHS.getOpcode() == ISD::AND &&
2890            isa<ConstantSDNode>(LHS.getOperand(1)) &&
2891            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
2892          SDValue Test = LHS.getOperand(0);
2893          uint64_t Mask = LHS.getConstantOperandVal(1);
2894
2895          // TBZ only operates on i64's, but the ext should be free.
2896          if (Test.getValueType() == MVT::i32)
2897            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
2898
2899          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
2900                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
2901        }
2902
2903        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
2904      } else if (CC == ISD::SETNE) {
2905        // See if we can use a TBZ to fold in an AND as well.
2906        // TBZ has a smaller branch displacement than CBZ.  If the offset is
2907        // out of bounds, a late MI-layer pass rewrites branches.
2908        // 403.gcc is an example that hits this case.
2909        if (LHS.getOpcode() == ISD::AND &&
2910            isa<ConstantSDNode>(LHS.getOperand(1)) &&
2911            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
2912          SDValue Test = LHS.getOperand(0);
2913          uint64_t Mask = LHS.getConstantOperandVal(1);
2914
2915          // TBNZ only operates on i64's, but the ext should be free.
2916          if (Test.getValueType() == MVT::i32)
2917            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
2918
2919          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
2920                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
2921        }
2922
2923        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
2924      }
2925    }
2926
2927    SDValue CCVal;
2928    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2929    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
2930                       Cmp);
2931  }
2932
2933  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
2934
2935  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
2936  // clean.  Some of them require two branches to implement.
2937  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2938  AArch64CC::CondCode CC1, CC2;
2939  changeFPCCToAArch64CC(CC, CC1, CC2);
2940  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
2941  SDValue BR1 =
2942      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
2943  if (CC2 != AArch64CC::AL) {
2944    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
2945    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
2946                       Cmp);
2947  }
2948
2949  return BR1;
2950}
2951
2952SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
2953                                              SelectionDAG &DAG) const {
2954  EVT VT = Op.getValueType();
2955  SDLoc DL(Op);
2956
2957  SDValue In1 = Op.getOperand(0);
2958  SDValue In2 = Op.getOperand(1);
2959  EVT SrcVT = In2.getValueType();
2960  if (SrcVT != VT) {
2961    if (SrcVT == MVT::f32 && VT == MVT::f64)
2962      In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
2963    else if (SrcVT == MVT::f64 && VT == MVT::f32)
2964      In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
2965    else
2966      // FIXME: Src type is different, bail out for now. Can VT really be a
2967      // vector type?
2968      return SDValue();
2969  }
2970
2971  EVT VecVT;
2972  EVT EltVT;
2973  SDValue EltMask, VecVal1, VecVal2;
2974  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
2975    EltVT = MVT::i32;
2976    VecVT = MVT::v4i32;
2977    EltMask = DAG.getConstant(0x80000000ULL, EltVT);
2978
2979    if (!VT.isVector()) {
2980      VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
2981                                          DAG.getUNDEF(VecVT), In1);
2982      VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
2983                                          DAG.getUNDEF(VecVT), In2);
2984    } else {
2985      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
2986      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
2987    }
2988  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
2989    EltVT = MVT::i64;
2990    VecVT = MVT::v2i64;
2991
2992    // We want to materialize a mask with the the high bit set, but the AdvSIMD
2993    // immediate moves cannot materialize that in a single instruction for
2994    // 64-bit elements. Instead, materialize zero and then negate it.
2995    EltMask = DAG.getConstant(0, EltVT);
2996
2997    if (!VT.isVector()) {
2998      VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
2999                                          DAG.getUNDEF(VecVT), In1);
3000      VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
3001                                          DAG.getUNDEF(VecVT), In2);
3002    } else {
3003      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
3004      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
3005    }
3006  } else {
3007    llvm_unreachable("Invalid type for copysign!");
3008  }
3009
3010  std::vector<SDValue> BuildVectorOps;
3011  for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
3012    BuildVectorOps.push_back(EltMask);
3013
3014  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
3015
3016  // If we couldn't materialize the mask above, then the mask vector will be
3017  // the zero vector, and we need to negate it here.
3018  if (VT == MVT::f64 || VT == MVT::v2f64) {
3019    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
3020    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
3021    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
3022  }
3023
3024  SDValue Sel =
3025      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
3026
3027  if (VT == MVT::f32)
3028    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
3029  else if (VT == MVT::f64)
3030    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
3031  else
3032    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
3033}
3034
3035SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
3036  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
3037          AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
3038    return SDValue();
3039
3040  // While there is no integer popcount instruction, it can
3041  // be more efficiently lowered to the following sequence that uses
3042  // AdvSIMD registers/instructions as long as the copies to/from
3043  // the AdvSIMD registers are cheap.
3044  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
3045  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
3046  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
3047  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
3048  SDValue Val = Op.getOperand(0);
3049  SDLoc DL(Op);
3050  EVT VT = Op.getValueType();
3051  SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
3052
3053  SDValue VecVal;
3054  if (VT == MVT::i32) {
3055    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
3056    VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
3057                                       VecVal);
3058  } else {
3059    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
3060  }
3061
3062  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
3063  SDValue UaddLV = DAG.getNode(
3064      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
3065      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);
3066
3067  if (VT == MVT::i64)
3068    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
3069  return UaddLV;
3070}
3071
3072SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
3073
3074  if (Op.getValueType().isVector())
3075    return LowerVSETCC(Op, DAG);
3076
3077  SDValue LHS = Op.getOperand(0);
3078  SDValue RHS = Op.getOperand(1);
3079  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
3080  SDLoc dl(Op);
3081
3082  // We chose ZeroOrOneBooleanContents, so use zero and one.
3083  EVT VT = Op.getValueType();
3084  SDValue TVal = DAG.getConstant(1, VT);
3085  SDValue FVal = DAG.getConstant(0, VT);
3086
3087  // Handle f128 first, since one possible outcome is a normal integer
3088  // comparison which gets picked up by the next if statement.
3089  if (LHS.getValueType() == MVT::f128) {
3090    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
3091
3092    // If softenSetCCOperands returned a scalar, use it.
3093    if (!RHS.getNode()) {
3094      assert(LHS.getValueType() == Op.getValueType() &&
3095             "Unexpected setcc expansion!");
3096      return LHS;
3097    }
3098  }
3099
3100  if (LHS.getValueType().isInteger()) {
3101    SDValue CCVal;
3102    SDValue Cmp =
3103        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
3104
3105    // Note that we inverted the condition above, so we reverse the order of
3106    // the true and false operands here.  This will allow the setcc to be
3107    // matched to a single CSINC instruction.
3108    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
3109  }
3110
3111  // Now we know we're dealing with FP values.
3112  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
3113
3114  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
3115  // and do the comparison.
3116  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3117
3118  AArch64CC::CondCode CC1, CC2;
3119  changeFPCCToAArch64CC(CC, CC1, CC2);
3120  if (CC2 == AArch64CC::AL) {
3121    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
3122    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
3123
3124    // Note that we inverted the condition above, so we reverse the order of
3125    // the true and false operands here.  This will allow the setcc to be
3126    // matched to a single CSINC instruction.
3127    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
3128  } else {
3129    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
3130    // totally clean.  Some of them require two CSELs to implement.  As is in
3131    // this case, we emit the first CSEL and then emit a second using the output
3132    // of the first as the RHS.  We're effectively OR'ing the two CC's together.
3133
3134    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
3135    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
3136    SDValue CS1 =
3137        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
3138
3139    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
3140    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
3141  }
3142}
3143
3144/// A SELECT_CC operation is really some kind of max or min if both values being
3145/// compared are, in some sense, equal to the results in either case. However,
3146/// it is permissible to compare f32 values and produce directly extended f64
3147/// values.
3148///
3149/// Extending the comparison operands would also be allowed, but is less likely
3150/// to happen in practice since their use is right here. Note that truncate
3151/// operations would *not* be semantically equivalent.
3152static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
3153  if (Cmp == Result)
3154    return true;
3155
3156  ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
3157  ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
3158  if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
3159      Result.getValueType() == MVT::f64) {
3160    bool Lossy;
3161    APFloat CmpVal = CCmp->getValueAPF();
3162    CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
3163    return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
3164  }
3165
3166  return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
3167}
3168
3169SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
3170                                           SelectionDAG &DAG) const {
3171  SDValue CC = Op->getOperand(0);
3172  SDValue TVal = Op->getOperand(1);
3173  SDValue FVal = Op->getOperand(2);
3174  SDLoc DL(Op);
3175
3176  unsigned Opc = CC.getOpcode();
3177  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
3178  // instruction.
3179  if (CC.getResNo() == 1 &&
3180      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
3181       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
3182    // Only lower legal XALUO ops.
3183    if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
3184      return SDValue();
3185
3186    AArch64CC::CondCode OFCC;
3187    SDValue Value, Overflow;
3188    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
3189    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
3190
3191    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
3192                       CCVal, Overflow);
3193  }
3194
3195  if (CC.getOpcode() == ISD::SETCC)
3196    return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
3197                           cast<CondCodeSDNode>(CC.getOperand(2))->get());
3198  else
3199    return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
3200                           FVal, ISD::SETNE);
3201}
3202
3203SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
3204                                              SelectionDAG &DAG) const {
3205  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
3206  SDValue LHS = Op.getOperand(0);
3207  SDValue RHS = Op.getOperand(1);
3208  SDValue TVal = Op.getOperand(2);
3209  SDValue FVal = Op.getOperand(3);
3210  SDLoc dl(Op);
3211
3212  // Handle f128 first, because it will result in a comparison of some RTLIB
3213  // call result against zero.
3214  if (LHS.getValueType() == MVT::f128) {
3215    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
3216
3217    // If softenSetCCOperands returned a scalar, we need to compare the result
3218    // against zero to select between true and false values.
3219    if (!RHS.getNode()) {
3220      RHS = DAG.getConstant(0, LHS.getValueType());
3221      CC = ISD::SETNE;
3222    }
3223  }
3224
3225  // Handle integers first.
3226  if (LHS.getValueType().isInteger()) {
3227    assert((LHS.getValueType() == RHS.getValueType()) &&
3228           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
3229
3230    unsigned Opcode = AArch64ISD::CSEL;
3231
3232    // If both the TVal and the FVal are constants, see if we can swap them in
3233    // order to for a CSINV or CSINC out of them.
3234    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3235    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3236
3237    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
3238      std::swap(TVal, FVal);
3239      std::swap(CTVal, CFVal);
3240      CC = ISD::getSetCCInverse(CC, true);
3241    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
3242      std::swap(TVal, FVal);
3243      std::swap(CTVal, CFVal);
3244      CC = ISD::getSetCCInverse(CC, true);
3245    } else if (TVal.getOpcode() == ISD::XOR) {
3246      // If TVal is a NOT we want to swap TVal and FVal so that we can match
3247      // with a CSINV rather than a CSEL.
3248      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
3249
3250      if (CVal && CVal->isAllOnesValue()) {
3251        std::swap(TVal, FVal);
3252        std::swap(CTVal, CFVal);
3253        CC = ISD::getSetCCInverse(CC, true);
3254      }
3255    } else if (TVal.getOpcode() == ISD::SUB) {
3256      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
3257      // that we can match with a CSNEG rather than a CSEL.
3258      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
3259
3260      if (CVal && CVal->isNullValue()) {
3261        std::swap(TVal, FVal);
3262        std::swap(CTVal, CFVal);
3263        CC = ISD::getSetCCInverse(CC, true);
3264      }
3265    } else if (CTVal && CFVal) {
3266      const int64_t TrueVal = CTVal->getSExtValue();
3267      const int64_t FalseVal = CFVal->getSExtValue();
3268      bool Swap = false;
3269
3270      // If both TVal and FVal are constants, see if FVal is the
3271      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
3272      // instead of a CSEL in that case.
3273      if (TrueVal == ~FalseVal) {
3274        Opcode = AArch64ISD::CSINV;
3275      } else if (TrueVal == -FalseVal) {
3276        Opcode = AArch64ISD::CSNEG;
3277      } else if (TVal.getValueType() == MVT::i32) {
3278        // If our operands are only 32-bit wide, make sure we use 32-bit
3279        // arithmetic for the check whether we can use CSINC. This ensures that
3280        // the addition in the check will wrap around properly in case there is
3281        // an overflow (which would not be the case if we do the check with
3282        // 64-bit arithmetic).
3283        const uint32_t TrueVal32 = CTVal->getZExtValue();
3284        const uint32_t FalseVal32 = CFVal->getZExtValue();
3285
3286        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
3287          Opcode = AArch64ISD::CSINC;
3288
3289          if (TrueVal32 > FalseVal32) {
3290            Swap = true;
3291          }
3292        }
3293        // 64-bit check whether we can use CSINC.
3294      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
3295        Opcode = AArch64ISD::CSINC;
3296
3297        if (TrueVal > FalseVal) {
3298          Swap = true;
3299        }
3300      }
3301
3302      // Swap TVal and FVal if necessary.
3303      if (Swap) {
3304        std::swap(TVal, FVal);
3305        std::swap(CTVal, CFVal);
3306        CC = ISD::getSetCCInverse(CC, true);
3307      }
3308
3309      if (Opcode != AArch64ISD::CSEL) {
3310        // Drop FVal since we can get its value by simply inverting/negating
3311        // TVal.
3312        FVal = TVal;
3313      }
3314    }
3315
3316    SDValue CCVal;
3317    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3318
3319    EVT VT = Op.getValueType();
3320    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
3321  }
3322
3323  // Now we know we're dealing with FP values.
3324  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
3325  assert(LHS.getValueType() == RHS.getValueType());
3326  EVT VT = Op.getValueType();
3327
3328  // Try to match this select into a max/min operation, which have dedicated
3329  // opcode in the instruction set.
3330  // FIXME: This is not correct in the presence of NaNs, so we only enable this
3331  // in no-NaNs mode.
3332  if (getTargetMachine().Options.NoNaNsFPMath) {
3333    SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
3334    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
3335        selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
3336      CC = ISD::getSetCCSwappedOperands(CC);
3337      std::swap(MinMaxLHS, MinMaxRHS);
3338    }
3339
3340    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
3341        selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
3342      switch (CC) {
3343      default:
3344        break;
3345      case ISD::SETGT:
3346      case ISD::SETGE:
3347      case ISD::SETUGT:
3348      case ISD::SETUGE:
3349      case ISD::SETOGT:
3350      case ISD::SETOGE:
3351        return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
3352        break;
3353      case ISD::SETLT:
3354      case ISD::SETLE:
3355      case ISD::SETULT:
3356      case ISD::SETULE:
3357      case ISD::SETOLT:
3358      case ISD::SETOLE:
3359        return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
3360        break;
3361      }
3362    }
3363  }
3364
3365  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
3366  // and do the comparison.
3367  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3368
3369  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
3370  // clean.  Some of them require two CSELs to implement.
3371  AArch64CC::CondCode CC1, CC2;
3372  changeFPCCToAArch64CC(CC, CC1, CC2);
3373  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
3374  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
3375
3376  // If we need a second CSEL, emit it, using the output of the first as the
3377  // RHS.  We're effectively OR'ing the two CC's together.
3378  if (CC2 != AArch64CC::AL) {
3379    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
3380    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
3381  }
3382
3383  // Otherwise, return the output of the first CSEL.
3384  return CS1;
3385}
3386
3387SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
3388                                              SelectionDAG &DAG) const {
3389  // Jump table entries as PC relative offsets. No additional tweaking
3390  // is necessary here. Just get the address of the jump table.
3391  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
3392  EVT PtrVT = getPointerTy();
3393  SDLoc DL(Op);
3394
3395  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3396      !Subtarget->isTargetMachO()) {
3397    const unsigned char MO_NC = AArch64II::MO_NC;
3398    return DAG.getNode(
3399        AArch64ISD::WrapperLarge, DL, PtrVT,
3400        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
3401        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
3402        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
3403        DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
3404                               AArch64II::MO_G0 | MO_NC));
3405  }
3406
3407  SDValue Hi =
3408      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
3409  SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
3410                                      AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
3411  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
3412  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
3413}
3414
3415SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
3416                                                 SelectionDAG &DAG) const {
3417  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3418  EVT PtrVT = getPointerTy();
3419  SDLoc DL(Op);
3420
3421  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
3422    // Use the GOT for the large code model on iOS.
3423    if (Subtarget->isTargetMachO()) {
3424      SDValue GotAddr = DAG.getTargetConstantPool(
3425          CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
3426          AArch64II::MO_GOT);
3427      return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
3428    }
3429
3430    const unsigned char MO_NC = AArch64II::MO_NC;
3431    return DAG.getNode(
3432        AArch64ISD::WrapperLarge, DL, PtrVT,
3433        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
3434                                  CP->getOffset(), AArch64II::MO_G3),
3435        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
3436                                  CP->getOffset(), AArch64II::MO_G2 | MO_NC),
3437        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
3438                                  CP->getOffset(), AArch64II::MO_G1 | MO_NC),
3439        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
3440                                  CP->getOffset(), AArch64II::MO_G0 | MO_NC));
3441  } else {
3442    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
3443    // ELF, the only valid one on Darwin.
3444    SDValue Hi =
3445        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
3446                                  CP->getOffset(), AArch64II::MO_PAGE);
3447    SDValue Lo = DAG.getTargetConstantPool(
3448        CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
3449        AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
3450
3451    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
3452    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
3453  }
3454}
3455
3456SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
3457                                               SelectionDAG &DAG) const {
3458  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3459  EVT PtrVT = getPointerTy();
3460  SDLoc DL(Op);
3461  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3462      !Subtarget->isTargetMachO()) {
3463    const unsigned char MO_NC = AArch64II::MO_NC;
3464    return DAG.getNode(
3465        AArch64ISD::WrapperLarge, DL, PtrVT,
3466        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
3467        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
3468        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
3469        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
3470  } else {
3471    SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
3472    SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
3473                                                             AArch64II::MO_NC);
3474    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
3475    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
3476  }
3477}
3478
3479SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
3480                                                 SelectionDAG &DAG) const {
3481  AArch64FunctionInfo *FuncInfo =
3482      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
3483
3484  SDLoc DL(Op);
3485  SDValue FR =
3486      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
3487  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3488  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
3489                      MachinePointerInfo(SV), false, false, 0);
3490}
3491
3492SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
3493                                                SelectionDAG &DAG) const {
3494  // The layout of the va_list struct is specified in the AArch64 Procedure Call
3495  // Standard, section B.3.
3496  MachineFunction &MF = DAG.getMachineFunction();
3497  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3498  SDLoc DL(Op);
3499
3500  SDValue Chain = Op.getOperand(0);
3501  SDValue VAList = Op.getOperand(1);
3502  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3503  SmallVector<SDValue, 4> MemOps;
3504
3505  // void *__stack at offset 0
3506  SDValue Stack =
3507      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
3508  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
3509                                MachinePointerInfo(SV), false, false, 8));
3510
3511  // void *__gr_top at offset 8
3512  int GPRSize = FuncInfo->getVarArgsGPRSize();
3513  if (GPRSize > 0) {
3514    SDValue GRTop, GRTopAddr;
3515
3516    GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3517                            DAG.getConstant(8, getPointerTy()));
3518
3519    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
3520    GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
3521                        DAG.getConstant(GPRSize, getPointerTy()));
3522
3523    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
3524                                  MachinePointerInfo(SV, 8), false, false, 8));
3525  }
3526
3527  // void *__vr_top at offset 16
3528  int FPRSize = FuncInfo->getVarArgsFPRSize();
3529  if (FPRSize > 0) {
3530    SDValue VRTop, VRTopAddr;
3531    VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3532                            DAG.getConstant(16, getPointerTy()));
3533
3534    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
3535    VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
3536                        DAG.getConstant(FPRSize, getPointerTy()));
3537
3538    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
3539                                  MachinePointerInfo(SV, 16), false, false, 8));
3540  }
3541
3542  // int __gr_offs at offset 24
3543  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3544                                   DAG.getConstant(24, getPointerTy()));
3545  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
3546                                GROffsAddr, MachinePointerInfo(SV, 24), false,
3547                                false, 4));
3548
3549  // int __vr_offs at offset 28
3550  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3551                                   DAG.getConstant(28, getPointerTy()));
3552  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
3553                                VROffsAddr, MachinePointerInfo(SV, 28), false,
3554                                false, 4));
3555
3556  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3557}
3558
3559SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
3560                                            SelectionDAG &DAG) const {
3561  return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
3562                                     : LowerAAPCS_VASTART(Op, DAG);
3563}
3564
3565SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
3566                                           SelectionDAG &DAG) const {
3567  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
3568  // pointer.
3569  unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
3570  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
3571  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
3572
3573  return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
3574                       Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
3575                       8, false, false, MachinePointerInfo(DestSV),
3576                       MachinePointerInfo(SrcSV));
3577}
3578
3579SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3580  assert(Subtarget->isTargetDarwin() &&
3581         "automatic va_arg instruction only works on Darwin");
3582
3583  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3584  EVT VT = Op.getValueType();
3585  SDLoc DL(Op);
3586  SDValue Chain = Op.getOperand(0);
3587  SDValue Addr = Op.getOperand(1);
3588  unsigned Align = Op.getConstantOperandVal(3);
3589
3590  SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
3591                               MachinePointerInfo(V), false, false, false, 0);
3592  Chain = VAList.getValue(1);
3593
3594  if (Align > 8) {
3595    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
3596    VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3597                         DAG.getConstant(Align - 1, getPointerTy()));
3598    VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
3599                         DAG.getConstant(-(int64_t)Align, getPointerTy()));
3600  }
3601
3602  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
3603  uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
3604
3605  // Scalar integer and FP values smaller than 64 bits are implicitly extended
3606  // up to 64 bits.  At the very least, we have to increase the striding of the
3607  // vaargs list to match this, and for FP values we need to introduce
3608  // FP_ROUND nodes as well.
3609  if (VT.isInteger() && !VT.isVector())
3610    ArgSize = 8;
3611  bool NeedFPTrunc = false;
3612  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
3613    ArgSize = 8;
3614    NeedFPTrunc = true;
3615  }
3616
3617  // Increment the pointer, VAList, to the next vaarg
3618  SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
3619                               DAG.getConstant(ArgSize, getPointerTy()));
3620  // Store the incremented VAList to the legalized pointer
3621  SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
3622                                 false, false, 0);
3623
3624  // Load the actual argument out of the pointer VAList
3625  if (NeedFPTrunc) {
3626    // Load the value as an f64.
3627    SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
3628                                 MachinePointerInfo(), false, false, false, 0);
3629    // Round the value down to an f32.
3630    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
3631                                   DAG.getIntPtrConstant(1));
3632    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
3633    // Merge the rounded value with the chain output of the load.
3634    return DAG.getMergeValues(Ops, DL);
3635  }
3636
3637  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
3638                     false, false, 0);
3639}
3640
3641SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
3642                                              SelectionDAG &DAG) const {
3643  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3644  MFI->setFrameAddressIsTaken(true);
3645
3646  EVT VT = Op.getValueType();
3647  SDLoc DL(Op);
3648  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3649  SDValue FrameAddr =
3650      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
3651  while (Depth--)
3652    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
3653                            MachinePointerInfo(), false, false, false, 0);
3654  return FrameAddr;
3655}
3656
3657// FIXME? Maybe this could be a TableGen attribute on some registers and
3658// this table could be generated automatically from RegInfo.
3659unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
3660                                                  EVT VT) const {
3661  unsigned Reg = StringSwitch<unsigned>(RegName)
3662                       .Case("sp", AArch64::SP)
3663                       .Default(0);
3664  if (Reg)
3665    return Reg;
3666  report_fatal_error("Invalid register name global variable");
3667}
3668
3669SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
3670                                               SelectionDAG &DAG) const {
3671  MachineFunction &MF = DAG.getMachineFunction();
3672  MachineFrameInfo *MFI = MF.getFrameInfo();
3673  MFI->setReturnAddressIsTaken(true);
3674
3675  EVT VT = Op.getValueType();
3676  SDLoc DL(Op);
3677  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3678  if (Depth) {
3679    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
3680    SDValue Offset = DAG.getConstant(8, getPointerTy());
3681    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
3682                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
3683                       MachinePointerInfo(), false, false, false, 0);
3684  }
3685
3686  // Return LR, which contains the return address. Mark it an implicit live-in.
3687  unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
3688  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
3689}
3690
3691/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
3692/// i64 values and take a 2 x i64 value to shift plus a shift amount.
3693SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
3694                                                    SelectionDAG &DAG) const {
3695  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3696  EVT VT = Op.getValueType();
3697  unsigned VTBits = VT.getSizeInBits();
3698  SDLoc dl(Op);
3699  SDValue ShOpLo = Op.getOperand(0);
3700  SDValue ShOpHi = Op.getOperand(1);
3701  SDValue ShAmt = Op.getOperand(2);
3702  SDValue ARMcc;
3703  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
3704
3705  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
3706
3707  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
3708                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
3709  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
3710  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
3711                                   DAG.getConstant(VTBits, MVT::i64));
3712  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
3713
3714  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
3715                               ISD::SETGE, dl, DAG);
3716  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
3717
3718  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3719  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
3720  SDValue Lo =
3721      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
3722
3723  // AArch64 shifts larger than the register width are wrapped rather than
3724  // clamped, so we can't just emit "hi >> x".
3725  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
3726  SDValue TrueValHi = Opc == ISD::SRA
3727                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
3728                                        DAG.getConstant(VTBits - 1, MVT::i64))
3729                          : DAG.getConstant(0, VT);
3730  SDValue Hi =
3731      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
3732
3733  SDValue Ops[2] = { Lo, Hi };
3734  return DAG.getMergeValues(Ops, dl);
3735}
3736
3737/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
3738/// i64 values and take a 2 x i64 value to shift plus a shift amount.
3739SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
3740                                                 SelectionDAG &DAG) const {
3741  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3742  EVT VT = Op.getValueType();
3743  unsigned VTBits = VT.getSizeInBits();
3744  SDLoc dl(Op);
3745  SDValue ShOpLo = Op.getOperand(0);
3746  SDValue ShOpHi = Op.getOperand(1);
3747  SDValue ShAmt = Op.getOperand(2);
3748  SDValue ARMcc;
3749
3750  assert(Op.getOpcode() == ISD::SHL_PARTS);
3751  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
3752                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
3753  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
3754  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
3755                                   DAG.getConstant(VTBits, MVT::i64));
3756  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
3757  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
3758
3759  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3760
3761  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
3762                               ISD::SETGE, dl, DAG);
3763  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
3764  SDValue Hi =
3765      DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
3766
3767  // AArch64 shifts of larger than register sizes are wrapped rather than
3768  // clamped, so we can't just emit "lo << a" if a is too big.
3769  SDValue TrueValLo = DAG.getConstant(0, VT);
3770  SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
3771  SDValue Lo =
3772      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
3773
3774  SDValue Ops[2] = { Lo, Hi };
3775  return DAG.getMergeValues(Ops, dl);
3776}
3777
3778bool AArch64TargetLowering::isOffsetFoldingLegal(
3779    const GlobalAddressSDNode *GA) const {
3780  // The AArch64 target doesn't support folding offsets into global addresses.
3781  return false;
3782}
3783
3784bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3785  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
3786  // FIXME: We should be able to handle f128 as well with a clever lowering.
3787  if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
3788    return true;
3789
3790  if (VT == MVT::f64)
3791    return AArch64_AM::getFP64Imm(Imm) != -1;
3792  else if (VT == MVT::f32)
3793    return AArch64_AM::getFP32Imm(Imm) != -1;
3794  return false;
3795}
3796
3797//===----------------------------------------------------------------------===//
3798//                          AArch64 Optimization Hooks
3799//===----------------------------------------------------------------------===//
3800
3801//===----------------------------------------------------------------------===//
3802//                          AArch64 Inline Assembly Support
3803//===----------------------------------------------------------------------===//
3804
3805// Table of Constraints
3806// TODO: This is the current set of constraints supported by ARM for the
3807// compiler, not all of them may make sense, e.g. S may be difficult to support.
3808//
3809// r - A general register
3810// w - An FP/SIMD register of some size in the range v0-v31
3811// x - An FP/SIMD register of some size in the range v0-v15
3812// I - Constant that can be used with an ADD instruction
3813// J - Constant that can be used with a SUB instruction
3814// K - Constant that can be used with a 32-bit logical instruction
3815// L - Constant that can be used with a 64-bit logical instruction
3816// M - Constant that can be used as a 32-bit MOV immediate
3817// N - Constant that can be used as a 64-bit MOV immediate
3818// Q - A memory reference with base register and no offset
3819// S - A symbolic address
3820// Y - Floating point constant zero
3821// Z - Integer constant zero
3822//
3823//   Note that general register operands will be output using their 64-bit x
3824// register name, whatever the size of the variable, unless the asm operand
3825// is prefixed by the %w modifier. Floating-point and SIMD register operands
3826// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
3827// %q modifier.
3828
3829/// getConstraintType - Given a constraint letter, return the type of
3830/// constraint it is for this target.
3831AArch64TargetLowering::ConstraintType
3832AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
3833  if (Constraint.size() == 1) {
3834    switch (Constraint[0]) {
3835    default:
3836      break;
3837    case 'z':
3838      return C_Other;
3839    case 'x':
3840    case 'w':
3841      return C_RegisterClass;
3842    // An address with a single base register. Due to the way we
3843    // currently handle addresses it is the same as 'r'.
3844    case 'Q':
3845      return C_Memory;
3846    }
3847  }
3848  return TargetLowering::getConstraintType(Constraint);
3849}
3850
3851/// Examine constraint type and operand type and determine a weight value.
3852/// This object must already have been set up with the operand type
3853/// and the current alternative constraint selected.
3854TargetLowering::ConstraintWeight
3855AArch64TargetLowering::getSingleConstraintMatchWeight(
3856    AsmOperandInfo &info, const char *constraint) const {
3857  ConstraintWeight weight = CW_Invalid;
3858  Value *CallOperandVal = info.CallOperandVal;
3859  // If we don't have a value, we can't do a match,
3860  // but allow it at the lowest weight.
3861  if (!CallOperandVal)
3862    return CW_Default;
3863  Type *type = CallOperandVal->getType();
3864  // Look at the constraint type.
3865  switch (*constraint) {
3866  default:
3867    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
3868    break;
3869  case 'x':
3870  case 'w':
3871    if (type->isFloatingPointTy() || type->isVectorTy())
3872      weight = CW_Register;
3873    break;
3874  case 'z':
3875    weight = CW_Constant;
3876    break;
3877  }
3878  return weight;
3879}
3880
3881std::pair<unsigned, const TargetRegisterClass *>
3882AArch64TargetLowering::getRegForInlineAsmConstraint(
3883    const std::string &Constraint, MVT VT) const {
3884  if (Constraint.size() == 1) {
3885    switch (Constraint[0]) {
3886    case 'r':
3887      if (VT.getSizeInBits() == 64)
3888        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
3889      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
3890    case 'w':
3891      if (VT == MVT::f32)
3892        return std::make_pair(0U, &AArch64::FPR32RegClass);
3893      if (VT.getSizeInBits() == 64)
3894        return std::make_pair(0U, &AArch64::FPR64RegClass);
3895      if (VT.getSizeInBits() == 128)
3896        return std::make_pair(0U, &AArch64::FPR128RegClass);
3897      break;
3898    // The instructions that this constraint is designed for can
3899    // only take 128-bit registers so just use that regclass.
3900    case 'x':
3901      if (VT.getSizeInBits() == 128)
3902        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
3903      break;
3904    }
3905  }
3906  if (StringRef("{cc}").equals_lower(Constraint))
3907    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
3908
3909  // Use the default implementation in TargetLowering to convert the register
3910  // constraint into a member of a register class.
3911  std::pair<unsigned, const TargetRegisterClass *> Res;
3912  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
3913
3914  // Not found as a standard register?
3915  if (!Res.second) {
3916    unsigned Size = Constraint.size();
3917    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
3918        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
3919      const std::string Reg =
3920          std::string(&Constraint[2], &Constraint[Size - 1]);
3921      int RegNo = atoi(Reg.c_str());
3922      if (RegNo >= 0 && RegNo <= 31) {
3923        // v0 - v31 are aliases of q0 - q31.
3924        // By default we'll emit v0-v31 for this unless there's a modifier where
3925        // we'll emit the correct register as well.
3926        Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
3927        Res.second = &AArch64::FPR128RegClass;
3928      }
3929    }
3930  }
3931
3932  return Res;
3933}
3934
3935/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
3936/// vector.  If it is invalid, don't add anything to Ops.
3937void AArch64TargetLowering::LowerAsmOperandForConstraint(
3938    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
3939    SelectionDAG &DAG) const {
3940  SDValue Result;
3941
3942  // Currently only support length 1 constraints.
3943  if (Constraint.length() != 1)
3944    return;
3945
3946  char ConstraintLetter = Constraint[0];
3947  switch (ConstraintLetter) {
3948  default:
3949    break;
3950
3951  // This set of constraints deal with valid constants for various instructions.
3952  // Validate and return a target constant for them if we can.
3953  case 'z': {
3954    // 'z' maps to xzr or wzr so it needs an input of 0.
3955    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
3956    if (!C || C->getZExtValue() != 0)
3957      return;
3958
3959    if (Op.getValueType() == MVT::i64)
3960      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
3961    else
3962      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
3963    break;
3964  }
3965
3966  case 'I':
3967  case 'J':
3968  case 'K':
3969  case 'L':
3970  case 'M':
3971  case 'N':
3972    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
3973    if (!C)
3974      return;
3975
3976    // Grab the value and do some validation.
3977    uint64_t CVal = C->getZExtValue();
3978    switch (ConstraintLetter) {
3979    // The I constraint applies only to simple ADD or SUB immediate operands:
3980    // i.e. 0 to 4095 with optional shift by 12
3981    // The J constraint applies only to ADD or SUB immediates that would be
3982    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
3983    // instruction [or vice versa], in other words -1 to -4095 with optional
3984    // left shift by 12.
3985    case 'I':
3986      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
3987        break;
3988      return;
3989    case 'J': {
3990      uint64_t NVal = -C->getSExtValue();
3991      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
3992        break;
3993      return;
3994    }
3995    // The K and L constraints apply *only* to logical immediates, including
3996    // what used to be the MOVI alias for ORR (though the MOVI alias has now
3997    // been removed and MOV should be used). So these constraints have to
3998    // distinguish between bit patterns that are valid 32-bit or 64-bit
3999    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
4000    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
4001    // versa.
4002    case 'K':
4003      if (AArch64_AM::isLogicalImmediate(CVal, 32))
4004        break;
4005      return;
4006    case 'L':
4007      if (AArch64_AM::isLogicalImmediate(CVal, 64))
4008        break;
4009      return;
4010    // The M and N constraints are a superset of K and L respectively, for use
4011    // with the MOV (immediate) alias. As well as the logical immediates they
4012    // also match 32 or 64-bit immediates that can be loaded either using a
4013    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
4014    // (M) or 64-bit 0x1234000000000000 (N) etc.
4015    // As a note some of this code is liberally stolen from the asm parser.
4016    case 'M': {
4017      if (!isUInt<32>(CVal))
4018        return;
4019      if (AArch64_AM::isLogicalImmediate(CVal, 32))
4020        break;
4021      if ((CVal & 0xFFFF) == CVal)
4022        break;
4023      if ((CVal & 0xFFFF0000ULL) == CVal)
4024        break;
4025      uint64_t NCVal = ~(uint32_t)CVal;
4026      if ((NCVal & 0xFFFFULL) == NCVal)
4027        break;
4028      if ((NCVal & 0xFFFF0000ULL) == NCVal)
4029        break;
4030      return;
4031    }
4032    case 'N': {
4033      if (AArch64_AM::isLogicalImmediate(CVal, 64))
4034        break;
4035      if ((CVal & 0xFFFFULL) == CVal)
4036        break;
4037      if ((CVal & 0xFFFF0000ULL) == CVal)
4038        break;
4039      if ((CVal & 0xFFFF00000000ULL) == CVal)
4040        break;
4041      if ((CVal & 0xFFFF000000000000ULL) == CVal)
4042        break;
4043      uint64_t NCVal = ~CVal;
4044      if ((NCVal & 0xFFFFULL) == NCVal)
4045        break;
4046      if ((NCVal & 0xFFFF0000ULL) == NCVal)
4047        break;
4048      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
4049        break;
4050      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
4051        break;
4052      return;
4053    }
4054    default:
4055      return;
4056    }
4057
4058    // All assembler immediates are 64-bit integers.
4059    Result = DAG.getTargetConstant(CVal, MVT::i64);
4060    break;
4061  }
4062
4063  if (Result.getNode()) {
4064    Ops.push_back(Result);
4065    return;
4066  }
4067
4068  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
4069}
4070
4071//===----------------------------------------------------------------------===//
4072//                     AArch64 Advanced SIMD Support
4073//===----------------------------------------------------------------------===//
4074
4075/// WidenVector - Given a value in the V64 register class, produce the
4076/// equivalent value in the V128 register class.
4077static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
4078  EVT VT = V64Reg.getValueType();
4079  unsigned NarrowSize = VT.getVectorNumElements();
4080  MVT EltTy = VT.getVectorElementType().getSimpleVT();
4081  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
4082  SDLoc DL(V64Reg);
4083
4084  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
4085                     V64Reg, DAG.getConstant(0, MVT::i32));
4086}
4087
4088/// getExtFactor - Determine the adjustment factor for the position when
4089/// generating an "extract from vector registers" instruction.
4090static unsigned getExtFactor(SDValue &V) {
4091  EVT EltType = V.getValueType().getVectorElementType();
4092  return EltType.getSizeInBits() / 8;
4093}
4094
4095/// NarrowVector - Given a value in the V128 register class, produce the
4096/// equivalent value in the V64 register class.
4097static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
4098  EVT VT = V128Reg.getValueType();
4099  unsigned WideSize = VT.getVectorNumElements();
4100  MVT EltTy = VT.getVectorElementType().getSimpleVT();
4101  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
4102  SDLoc DL(V128Reg);
4103
4104  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
4105}
4106
4107// Gather data to see if the operation can be modelled as a
4108// shuffle in combination with VEXTs.
4109SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
4110                                                  SelectionDAG &DAG) const {
4111  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
4112  SDLoc dl(Op);
4113  EVT VT = Op.getValueType();
4114  unsigned NumElts = VT.getVectorNumElements();
4115
4116  SmallVector<SDValue, 2> SourceVecs;
4117  SmallVector<unsigned, 2> MinElts;
4118  SmallVector<unsigned, 2> MaxElts;
4119
4120  for (unsigned i = 0; i < NumElts; ++i) {
4121    SDValue V = Op.getOperand(i);
4122    if (V.getOpcode() == ISD::UNDEF)
4123      continue;
4124    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
4125      // A shuffle can only come from building a vector from various
4126      // elements of other vectors.
4127      return SDValue();
4128    }
4129
4130    // Record this extraction against the appropriate vector if possible...
4131    SDValue SourceVec = V.getOperand(0);
4132    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
4133    bool FoundSource = false;
4134    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
4135      if (SourceVecs[j] == SourceVec) {
4136        if (MinElts[j] > EltNo)
4137          MinElts[j] = EltNo;
4138        if (MaxElts[j] < EltNo)
4139          MaxElts[j] = EltNo;
4140        FoundSource = true;
4141        break;
4142      }
4143    }
4144
4145    // Or record a new source if not...
4146    if (!FoundSource) {
4147      SourceVecs.push_back(SourceVec);
4148      MinElts.push_back(EltNo);
4149      MaxElts.push_back(EltNo);
4150    }
4151  }
4152
4153  // Currently only do something sane when at most two source vectors
4154  // involved.
4155  if (SourceVecs.size() > 2)
4156    return SDValue();
4157
4158  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
4159  int VEXTOffsets[2] = { 0, 0 };
4160  int OffsetMultipliers[2] = { 1, 1 };
4161
4162  // This loop extracts the usage patterns of the source vectors
4163  // and prepares appropriate SDValues for a shuffle if possible.
4164  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
4165    unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
4166    SDValue CurSource = SourceVecs[i];
4167    if (SourceVecs[i].getValueType().getVectorElementType() !=
4168        VT.getVectorElementType()) {
4169      // It may hit this case if SourceVecs[i] is AssertSext/AssertZext.
4170      // Then bitcast it to the vector which holds asserted element type,
4171      // and record the multiplier of element width between SourceVecs and
4172      // Build_vector which is needed to extract the correct lanes later.
4173      EVT CastVT =
4174          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
4175                           SourceVecs[i].getValueSizeInBits() /
4176                               VT.getVectorElementType().getSizeInBits());
4177
4178      CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
4179      OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
4180      NumSrcElts *= OffsetMultipliers[i];
4181      MaxElts[i] *= OffsetMultipliers[i];
4182      MinElts[i] *= OffsetMultipliers[i];
4183    }
4184
4185    if (CurSource.getValueType() == VT) {
4186      // No VEXT necessary
4187      ShuffleSrcs[i] = CurSource;
4188      VEXTOffsets[i] = 0;
4189      continue;
4190    } else if (NumSrcElts < NumElts) {
4191      // We can pad out the smaller vector for free, so if it's part of a
4192      // shuffle...
4193      ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
4194                                   DAG.getUNDEF(CurSource.getValueType()));
4195      continue;
4196    }
4197
4198    // Since only 64-bit and 128-bit vectors are legal on ARM and
4199    // we've eliminated the other cases...
4200    assert(NumSrcElts == 2 * NumElts &&
4201           "unexpected vector sizes in ReconstructShuffle");
4202
4203    if (MaxElts[i] - MinElts[i] >= NumElts) {
4204      // Span too large for a VEXT to cope
4205      return SDValue();
4206    }
4207
4208    if (MinElts[i] >= NumElts) {
4209      // The extraction can just take the second half
4210      VEXTOffsets[i] = NumElts;
4211      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
4212                                   DAG.getIntPtrConstant(NumElts));
4213    } else if (MaxElts[i] < NumElts) {
4214      // The extraction can just take the first half
4215      VEXTOffsets[i] = 0;
4216      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
4217                                   DAG.getIntPtrConstant(0));
4218    } else {
4219      // An actual VEXT is needed
4220      VEXTOffsets[i] = MinElts[i];
4221      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
4222                                     DAG.getIntPtrConstant(0));
4223      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
4224                                     DAG.getIntPtrConstant(NumElts));
4225      unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
4226      ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
4227                                   DAG.getConstant(Imm, MVT::i32));
4228    }
4229  }
4230
4231  SmallVector<int, 8> Mask;
4232
4233  for (unsigned i = 0; i < NumElts; ++i) {
4234    SDValue Entry = Op.getOperand(i);
4235    if (Entry.getOpcode() == ISD::UNDEF) {
4236      Mask.push_back(-1);
4237      continue;
4238    }
4239
4240    SDValue ExtractVec = Entry.getOperand(0);
4241    int ExtractElt =
4242        cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
4243    if (ExtractVec == SourceVecs[0]) {
4244      Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
4245    } else {
4246      Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
4247                     VEXTOffsets[1]);
4248    }
4249  }
4250
4251  // Final check before we try to produce nonsense...
4252  if (isShuffleMaskLegal(Mask, VT))
4253    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
4254                                &Mask[0]);
4255
4256  return SDValue();
4257}
4258
4259// check if an EXT instruction can handle the shuffle mask when the
4260// vector sources of the shuffle are the same.
4261static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
4262  unsigned NumElts = VT.getVectorNumElements();
4263
4264  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
4265  if (M[0] < 0)
4266    return false;
4267
4268  Imm = M[0];
4269
4270  // If this is a VEXT shuffle, the immediate value is the index of the first
4271  // element.  The other shuffle indices must be the successive elements after
4272  // the first one.
4273  unsigned ExpectedElt = Imm;
4274  for (unsigned i = 1; i < NumElts; ++i) {
4275    // Increment the expected index.  If it wraps around, just follow it
4276    // back to index zero and keep going.
4277    ++ExpectedElt;
4278    if (ExpectedElt == NumElts)
4279      ExpectedElt = 0;
4280
4281    if (M[i] < 0)
4282      continue; // ignore UNDEF indices
4283    if (ExpectedElt != static_cast<unsigned>(M[i]))
4284      return false;
4285  }
4286
4287  return true;
4288}
4289
4290// check if an EXT instruction can handle the shuffle mask when the
4291// vector sources of the shuffle are different.
4292static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
4293                      unsigned &Imm) {
4294  // Look for the first non-undef element.
4295  const int *FirstRealElt = std::find_if(M.begin(), M.end(),
4296      [](int Elt) {return Elt >= 0;});
4297
4298  // Benefit form APInt to handle overflow when calculating expected element.
4299  unsigned NumElts = VT.getVectorNumElements();
4300  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
4301  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
4302  // The following shuffle indices must be the successive elements after the
4303  // first real element.
4304  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
4305      [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
4306  if (FirstWrongElt != M.end())
4307    return false;
4308
4309  // The index of an EXT is the first element if it is not UNDEF.
4310  // Watch out for the beginning UNDEFs. The EXT index should be the expected
4311  // value of the first element.  E.g.
4312  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
4313  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
4314  // ExpectedElt is the last mask index plus 1.
4315  Imm = ExpectedElt.getZExtValue();
4316
4317  // There are two difference cases requiring to reverse input vectors.
4318  // For example, for vector <4 x i32> we have the following cases,
4319  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
4320  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
4321  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
4322  // to reverse two input vectors.
4323  if (Imm < NumElts)
4324    ReverseEXT = true;
4325  else
4326    Imm -= NumElts;
4327
4328  return true;
4329}
4330
4331/// isREVMask - Check if a vector shuffle corresponds to a REV
4332/// instruction with the specified blocksize.  (The order of the elements
4333/// within each block of the vector is reversed.)
4334static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
4335  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
4336         "Only possible block sizes for REV are: 16, 32, 64");
4337
4338  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4339  if (EltSz == 64)
4340    return false;
4341
4342  unsigned NumElts = VT.getVectorNumElements();
4343  unsigned BlockElts = M[0] + 1;
4344  // If the first shuffle index is UNDEF, be optimistic.
4345  if (M[0] < 0)
4346    BlockElts = BlockSize / EltSz;
4347
4348  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
4349    return false;
4350
4351  for (unsigned i = 0; i < NumElts; ++i) {
4352    if (M[i] < 0)
4353      continue; // ignore UNDEF indices
4354    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
4355      return false;
4356  }
4357
4358  return true;
4359}
4360
4361static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4362  unsigned NumElts = VT.getVectorNumElements();
4363  WhichResult = (M[0] == 0 ? 0 : 1);
4364  unsigned Idx = WhichResult * NumElts / 2;
4365  for (unsigned i = 0; i != NumElts; i += 2) {
4366    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
4367        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
4368      return false;
4369    Idx += 1;
4370  }
4371
4372  return true;
4373}
4374
4375static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4376  unsigned NumElts = VT.getVectorNumElements();
4377  WhichResult = (M[0] == 0 ? 0 : 1);
4378  for (unsigned i = 0; i != NumElts; ++i) {
4379    if (M[i] < 0)
4380      continue; // ignore UNDEF indices
4381    if ((unsigned)M[i] != 2 * i + WhichResult)
4382      return false;
4383  }
4384
4385  return true;
4386}
4387
4388static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4389  unsigned NumElts = VT.getVectorNumElements();
4390  WhichResult = (M[0] == 0 ? 0 : 1);
4391  for (unsigned i = 0; i < NumElts; i += 2) {
4392    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
4393        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
4394      return false;
4395  }
4396  return true;
4397}
4398
4399/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
4400/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4401/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
4402static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4403  unsigned NumElts = VT.getVectorNumElements();
4404  WhichResult = (M[0] == 0 ? 0 : 1);
4405  unsigned Idx = WhichResult * NumElts / 2;
4406  for (unsigned i = 0; i != NumElts; i += 2) {
4407    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
4408        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
4409      return false;
4410    Idx += 1;
4411  }
4412
4413  return true;
4414}
4415
4416/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
4417/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4418/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
4419static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4420  unsigned Half = VT.getVectorNumElements() / 2;
4421  WhichResult = (M[0] == 0 ? 0 : 1);
4422  for (unsigned j = 0; j != 2; ++j) {
4423    unsigned Idx = WhichResult;
4424    for (unsigned i = 0; i != Half; ++i) {
4425      int MIdx = M[i + j * Half];
4426      if (MIdx >= 0 && (unsigned)MIdx != Idx)
4427        return false;
4428      Idx += 2;
4429    }
4430  }
4431
4432  return true;
4433}
4434
4435/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
4436/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4437/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
4438static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4439  unsigned NumElts = VT.getVectorNumElements();
4440  WhichResult = (M[0] == 0 ? 0 : 1);
4441  for (unsigned i = 0; i < NumElts; i += 2) {
4442    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
4443        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
4444      return false;
4445  }
4446  return true;
4447}
4448
4449static bool isINSMask(ArrayRef<int> M, int NumInputElements,
4450                      bool &DstIsLeft, int &Anomaly) {
4451  if (M.size() != static_cast<size_t>(NumInputElements))
4452    return false;
4453
4454  int NumLHSMatch = 0, NumRHSMatch = 0;
4455  int LastLHSMismatch = -1, LastRHSMismatch = -1;
4456
4457  for (int i = 0; i < NumInputElements; ++i) {
4458    if (M[i] == -1) {
4459      ++NumLHSMatch;
4460      ++NumRHSMatch;
4461      continue;
4462    }
4463
4464    if (M[i] == i)
4465      ++NumLHSMatch;
4466    else
4467      LastLHSMismatch = i;
4468
4469    if (M[i] == i + NumInputElements)
4470      ++NumRHSMatch;
4471    else
4472      LastRHSMismatch = i;
4473  }
4474
4475  if (NumLHSMatch == NumInputElements - 1) {
4476    DstIsLeft = true;
4477    Anomaly = LastLHSMismatch;
4478    return true;
4479  } else if (NumRHSMatch == NumInputElements - 1) {
4480    DstIsLeft = false;
4481    Anomaly = LastRHSMismatch;
4482    return true;
4483  }
4484
4485  return false;
4486}
4487
4488static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
4489  if (VT.getSizeInBits() != 128)
4490    return false;
4491
4492  unsigned NumElts = VT.getVectorNumElements();
4493
4494  for (int I = 0, E = NumElts / 2; I != E; I++) {
4495    if (Mask[I] != I)
4496      return false;
4497  }
4498
4499  int Offset = NumElts / 2;
4500  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
4501    if (Mask[I] != I + SplitLHS * Offset)
4502      return false;
4503  }
4504
4505  return true;
4506}
4507
4508static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
4509  SDLoc DL(Op);
4510  EVT VT = Op.getValueType();
4511  SDValue V0 = Op.getOperand(0);
4512  SDValue V1 = Op.getOperand(1);
4513  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
4514
4515  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
4516      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
4517    return SDValue();
4518
4519  bool SplitV0 = V0.getValueType().getSizeInBits() == 128;
4520
4521  if (!isConcatMask(Mask, VT, SplitV0))
4522    return SDValue();
4523
4524  EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
4525                                VT.getVectorNumElements() / 2);
4526  if (SplitV0) {
4527    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
4528                     DAG.getConstant(0, MVT::i64));
4529  }
4530  if (V1.getValueType().getSizeInBits() == 128) {
4531    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
4532                     DAG.getConstant(0, MVT::i64));
4533  }
4534  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
4535}
4536
4537/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
4538/// the specified operations to build the shuffle.
4539static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
4540                                      SDValue RHS, SelectionDAG &DAG,
4541                                      SDLoc dl) {
4542  unsigned OpNum = (PFEntry >> 26) & 0x0F;
4543  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
4544  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
4545
4546  enum {
4547    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
4548    OP_VREV,
4549    OP_VDUP0,
4550    OP_VDUP1,
4551    OP_VDUP2,
4552    OP_VDUP3,
4553    OP_VEXT1,
4554    OP_VEXT2,
4555    OP_VEXT3,
4556    OP_VUZPL, // VUZP, left result
4557    OP_VUZPR, // VUZP, right result
4558    OP_VZIPL, // VZIP, left result
4559    OP_VZIPR, // VZIP, right result
4560    OP_VTRNL, // VTRN, left result
4561    OP_VTRNR  // VTRN, right result
4562  };
4563
4564  if (OpNum == OP_COPY) {
4565    if (LHSID == (1 * 9 + 2) * 9 + 3)
4566      return LHS;
4567    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
4568    return RHS;
4569  }
4570
4571  SDValue OpLHS, OpRHS;
4572  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
4573  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
4574  EVT VT = OpLHS.getValueType();
4575
4576  switch (OpNum) {
4577  default:
4578    llvm_unreachable("Unknown shuffle opcode!");
4579  case OP_VREV:
4580    // VREV divides the vector in half and swaps within the half.
4581    if (VT.getVectorElementType() == MVT::i32 ||
4582        VT.getVectorElementType() == MVT::f32)
4583      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
4584    // vrev <4 x i16> -> REV32
4585    if (VT.getVectorElementType() == MVT::i16)
4586      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
4587    // vrev <4 x i8> -> REV16
4588    assert(VT.getVectorElementType() == MVT::i8);
4589    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
4590  case OP_VDUP0:
4591  case OP_VDUP1:
4592  case OP_VDUP2:
4593  case OP_VDUP3: {
4594    EVT EltTy = VT.getVectorElementType();
4595    unsigned Opcode;
4596    if (EltTy == MVT::i8)
4597      Opcode = AArch64ISD::DUPLANE8;
4598    else if (EltTy == MVT::i16)
4599      Opcode = AArch64ISD::DUPLANE16;
4600    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
4601      Opcode = AArch64ISD::DUPLANE32;
4602    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
4603      Opcode = AArch64ISD::DUPLANE64;
4604    else
4605      llvm_unreachable("Invalid vector element type?");
4606
4607    if (VT.getSizeInBits() == 64)
4608      OpLHS = WidenVector(OpLHS, DAG);
4609    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64);
4610    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
4611  }
4612  case OP_VEXT1:
4613  case OP_VEXT2:
4614  case OP_VEXT3: {
4615    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
4616    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
4617                       DAG.getConstant(Imm, MVT::i32));
4618  }
4619  case OP_VUZPL:
4620    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
4621                       OpRHS);
4622  case OP_VUZPR:
4623    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
4624                       OpRHS);
4625  case OP_VZIPL:
4626    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
4627                       OpRHS);
4628  case OP_VZIPR:
4629    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
4630                       OpRHS);
4631  case OP_VTRNL:
4632    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
4633                       OpRHS);
4634  case OP_VTRNR:
4635    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
4636                       OpRHS);
4637  }
4638}
4639
4640static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
4641                           SelectionDAG &DAG) {
4642  // Check to see if we can use the TBL instruction.
4643  SDValue V1 = Op.getOperand(0);
4644  SDValue V2 = Op.getOperand(1);
4645  SDLoc DL(Op);
4646
4647  EVT EltVT = Op.getValueType().getVectorElementType();
4648  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
4649
4650  SmallVector<SDValue, 8> TBLMask;
4651  for (int Val : ShuffleMask) {
4652    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
4653      unsigned Offset = Byte + Val * BytesPerElt;
4654      TBLMask.push_back(DAG.getConstant(Offset, MVT::i32));
4655    }
4656  }
4657
4658  MVT IndexVT = MVT::v8i8;
4659  unsigned IndexLen = 8;
4660  if (Op.getValueType().getSizeInBits() == 128) {
4661    IndexVT = MVT::v16i8;
4662    IndexLen = 16;
4663  }
4664
4665  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
4666  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
4667
4668  SDValue Shuffle;
4669  if (V2.getNode()->getOpcode() == ISD::UNDEF) {
4670    if (IndexLen == 8)
4671      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
4672    Shuffle = DAG.getNode(
4673        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
4674        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
4675        DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
4676                    makeArrayRef(TBLMask.data(), IndexLen)));
4677  } else {
4678    if (IndexLen == 8) {
4679      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
4680      Shuffle = DAG.getNode(
4681          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
4682          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
4683          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
4684                      makeArrayRef(TBLMask.data(), IndexLen)));
4685    } else {
4686      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
4687      // cannot currently represent the register constraints on the input
4688      // table registers.
4689      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
4690      //                   DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
4691      //                               &TBLMask[0], IndexLen));
4692      Shuffle = DAG.getNode(
4693          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
4694          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
4695          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
4696                      makeArrayRef(TBLMask.data(), IndexLen)));
4697    }
4698  }
4699  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
4700}
4701
4702static unsigned getDUPLANEOp(EVT EltType) {
4703  if (EltType == MVT::i8)
4704    return AArch64ISD::DUPLANE8;
4705  if (EltType == MVT::i16)
4706    return AArch64ISD::DUPLANE16;
4707  if (EltType == MVT::i32 || EltType == MVT::f32)
4708    return AArch64ISD::DUPLANE32;
4709  if (EltType == MVT::i64 || EltType == MVT::f64)
4710    return AArch64ISD::DUPLANE64;
4711
4712  llvm_unreachable("Invalid vector element type?");
4713}
4714
4715SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
4716                                                   SelectionDAG &DAG) const {
4717  SDLoc dl(Op);
4718  EVT VT = Op.getValueType();
4719
4720  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
4721
4722  // Convert shuffles that are directly supported on NEON to target-specific
4723  // DAG nodes, instead of keeping them as shuffles and matching them again
4724  // during code selection.  This is more efficient and avoids the possibility
4725  // of inconsistencies between legalization and selection.
4726  ArrayRef<int> ShuffleMask = SVN->getMask();
4727
4728  SDValue V1 = Op.getOperand(0);
4729  SDValue V2 = Op.getOperand(1);
4730
4731  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
4732                                       V1.getValueType().getSimpleVT())) {
4733    int Lane = SVN->getSplatIndex();
4734    // If this is undef splat, generate it via "just" vdup, if possible.
4735    if (Lane == -1)
4736      Lane = 0;
4737
4738    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
4739      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
4740                         V1.getOperand(0));
4741    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
4742    // constant. If so, we can just reference the lane's definition directly.
4743    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
4744        !isa<ConstantSDNode>(V1.getOperand(Lane)))
4745      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
4746
4747    // Otherwise, duplicate from the lane of the input vector.
4748    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
4749
4750    // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
4751    // to make a vector of the same size as this SHUFFLE. We can ignore the
4752    // extract entirely, and canonicalise the concat using WidenVector.
4753    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4754      Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
4755      V1 = V1.getOperand(0);
4756    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
4757      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
4758      Lane -= Idx * VT.getVectorNumElements() / 2;
4759      V1 = WidenVector(V1.getOperand(Idx), DAG);
4760    } else if (VT.getSizeInBits() == 64)
4761      V1 = WidenVector(V1, DAG);
4762
4763    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
4764  }
4765
4766  if (isREVMask(ShuffleMask, VT, 64))
4767    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
4768  if (isREVMask(ShuffleMask, VT, 32))
4769    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
4770  if (isREVMask(ShuffleMask, VT, 16))
4771    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
4772
4773  bool ReverseEXT = false;
4774  unsigned Imm;
4775  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
4776    if (ReverseEXT)
4777      std::swap(V1, V2);
4778    Imm *= getExtFactor(V1);
4779    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
4780                       DAG.getConstant(Imm, MVT::i32));
4781  } else if (V2->getOpcode() == ISD::UNDEF &&
4782             isSingletonEXTMask(ShuffleMask, VT, Imm)) {
4783    Imm *= getExtFactor(V1);
4784    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
4785                       DAG.getConstant(Imm, MVT::i32));
4786  }
4787
4788  unsigned WhichResult;
4789  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
4790    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
4791    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
4792  }
4793  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
4794    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
4795    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
4796  }
4797  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
4798    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
4799    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
4800  }
4801
4802  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
4803    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
4804    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
4805  }
4806  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
4807    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
4808    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
4809  }
4810  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
4811    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
4812    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
4813  }
4814
4815  SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
4816  if (Concat.getNode())
4817    return Concat;
4818
4819  bool DstIsLeft;
4820  int Anomaly;
4821  int NumInputElements = V1.getValueType().getVectorNumElements();
4822  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
4823    SDValue DstVec = DstIsLeft ? V1 : V2;
4824    SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64);
4825
4826    SDValue SrcVec = V1;
4827    int SrcLane = ShuffleMask[Anomaly];
4828    if (SrcLane >= NumInputElements) {
4829      SrcVec = V2;
4830      SrcLane -= VT.getVectorNumElements();
4831    }
4832    SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
4833
4834    EVT ScalarVT = VT.getVectorElementType();
4835    if (ScalarVT.getSizeInBits() < 32)
4836      ScalarVT = MVT::i32;
4837
4838    return DAG.getNode(
4839        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
4840        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
4841        DstLaneV);
4842  }
4843
4844  // If the shuffle is not directly supported and it has 4 elements, use
4845  // the PerfectShuffle-generated table to synthesize it from other shuffles.
4846  unsigned NumElts = VT.getVectorNumElements();
4847  if (NumElts == 4) {
4848    unsigned PFIndexes[4];
4849    for (unsigned i = 0; i != 4; ++i) {
4850      if (ShuffleMask[i] < 0)
4851        PFIndexes[i] = 8;
4852      else
4853        PFIndexes[i] = ShuffleMask[i];
4854    }
4855
4856    // Compute the index in the perfect shuffle table.
4857    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
4858                            PFIndexes[2] * 9 + PFIndexes[3];
4859    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
4860    unsigned Cost = (PFEntry >> 30);
4861
4862    if (Cost <= 4)
4863      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
4864  }
4865
4866  return GenerateTBL(Op, ShuffleMask, DAG);
4867}
4868
4869static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
4870                               APInt &UndefBits) {
4871  EVT VT = BVN->getValueType(0);
4872  APInt SplatBits, SplatUndef;
4873  unsigned SplatBitSize;
4874  bool HasAnyUndefs;
4875  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
4876    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
4877
4878    for (unsigned i = 0; i < NumSplats; ++i) {
4879      CnstBits <<= SplatBitSize;
4880      UndefBits <<= SplatBitSize;
4881      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
4882      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
4883    }
4884
4885    return true;
4886  }
4887
4888  return false;
4889}
4890
4891SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
4892                                              SelectionDAG &DAG) const {
4893  BuildVectorSDNode *BVN =
4894      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
4895  SDValue LHS = Op.getOperand(0);
4896  SDLoc dl(Op);
4897  EVT VT = Op.getValueType();
4898
4899  if (!BVN)
4900    return Op;
4901
4902  APInt CnstBits(VT.getSizeInBits(), 0);
4903  APInt UndefBits(VT.getSizeInBits(), 0);
4904  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
4905    // We only have BIC vector immediate instruction, which is and-not.
4906    CnstBits = ~CnstBits;
4907
4908    // We make use of a little bit of goto ickiness in order to avoid having to
4909    // duplicate the immediate matching logic for the undef toggled case.
4910    bool SecondTry = false;
4911  AttemptModImm:
4912
4913    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
4914      CnstBits = CnstBits.zextOrTrunc(64);
4915      uint64_t CnstVal = CnstBits.getZExtValue();
4916
4917      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
4918        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
4919        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
4920        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
4921                                  DAG.getConstant(CnstVal, MVT::i32),
4922                                  DAG.getConstant(0, MVT::i32));
4923        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
4924      }
4925
4926      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
4927        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
4928        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
4929        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
4930                                  DAG.getConstant(CnstVal, MVT::i32),
4931                                  DAG.getConstant(8, MVT::i32));
4932        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
4933      }
4934
4935      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
4936        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
4937        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
4938        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
4939                                  DAG.getConstant(CnstVal, MVT::i32),
4940                                  DAG.getConstant(16, MVT::i32));
4941        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
4942      }
4943
4944      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
4945        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
4946        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
4947        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
4948                                  DAG.getConstant(CnstVal, MVT::i32),
4949                                  DAG.getConstant(24, MVT::i32));
4950        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
4951      }
4952
4953      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
4954        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
4955        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
4956        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
4957                                  DAG.getConstant(CnstVal, MVT::i32),
4958                                  DAG.getConstant(0, MVT::i32));
4959        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
4960      }
4961
4962      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
4963        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
4964        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
4965        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
4966                                  DAG.getConstant(CnstVal, MVT::i32),
4967                                  DAG.getConstant(8, MVT::i32));
4968        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
4969      }
4970    }
4971
4972    if (SecondTry)
4973      goto FailedModImm;
4974    SecondTry = true;
4975    CnstBits = ~UndefBits;
4976    goto AttemptModImm;
4977  }
4978
4979// We can always fall back to a non-immediate AND.
4980FailedModImm:
4981  return Op;
4982}
4983
4984// Specialized code to quickly find if PotentialBVec is a BuildVector that
4985// consists of only the same constant int value, returned in reference arg
4986// ConstVal
4987static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
4988                                     uint64_t &ConstVal) {
4989  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
4990  if (!Bvec)
4991    return false;
4992  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
4993  if (!FirstElt)
4994    return false;
4995  EVT VT = Bvec->getValueType(0);
4996  unsigned NumElts = VT.getVectorNumElements();
4997  for (unsigned i = 1; i < NumElts; ++i)
4998    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
4999      return false;
5000  ConstVal = FirstElt->getZExtValue();
5001  return true;
5002}
5003
5004static unsigned getIntrinsicID(const SDNode *N) {
5005  unsigned Opcode = N->getOpcode();
5006  switch (Opcode) {
5007  default:
5008    return Intrinsic::not_intrinsic;
5009  case ISD::INTRINSIC_WO_CHAIN: {
5010    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
5011    if (IID < Intrinsic::num_intrinsics)
5012      return IID;
5013    return Intrinsic::not_intrinsic;
5014  }
5015  }
5016}
5017
5018// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
5019// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
5020// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
5021// Also, logical shift right -> sri, with the same structure.
5022static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
5023  EVT VT = N->getValueType(0);
5024
5025  if (!VT.isVector())
5026    return SDValue();
5027
5028  SDLoc DL(N);
5029
5030  // Is the first op an AND?
5031  const SDValue And = N->getOperand(0);
5032  if (And.getOpcode() != ISD::AND)
5033    return SDValue();
5034
5035  // Is the second op an shl or lshr?
5036  SDValue Shift = N->getOperand(1);
5037  // This will have been turned into: AArch64ISD::VSHL vector, #shift
5038  // or AArch64ISD::VLSHR vector, #shift
5039  unsigned ShiftOpc = Shift.getOpcode();
5040  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
5041    return SDValue();
5042  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
5043
5044  // Is the shift amount constant?
5045  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
5046  if (!C2node)
5047    return SDValue();
5048
5049  // Is the and mask vector all constant?
5050  uint64_t C1;
5051  if (!isAllConstantBuildVector(And.getOperand(1), C1))
5052    return SDValue();
5053
5054  // Is C1 == ~C2, taking into account how much one can shift elements of a
5055  // particular size?
5056  uint64_t C2 = C2node->getZExtValue();
5057  unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
5058  if (C2 > ElemSizeInBits)
5059    return SDValue();
5060  unsigned ElemMask = (1 << ElemSizeInBits) - 1;
5061  if ((C1 & ElemMask) != (~C2 & ElemMask))
5062    return SDValue();
5063
5064  SDValue X = And.getOperand(0);
5065  SDValue Y = Shift.getOperand(0);
5066
5067  unsigned Intrin =
5068      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
5069  SDValue ResultSLI =
5070      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
5071                  DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));
5072
5073  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
5074  DEBUG(N->dump(&DAG));
5075  DEBUG(dbgs() << "into: \n");
5076  DEBUG(ResultSLI->dump(&DAG));
5077
5078  ++NumShiftInserts;
5079  return ResultSLI;
5080}
5081
5082SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
5083                                             SelectionDAG &DAG) const {
5084  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
5085  if (EnableAArch64SlrGeneration) {
5086    SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
5087    if (Res.getNode())
5088      return Res;
5089  }
5090
5091  BuildVectorSDNode *BVN =
5092      dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
5093  SDValue LHS = Op.getOperand(1);
5094  SDLoc dl(Op);
5095  EVT VT = Op.getValueType();
5096
5097  // OR commutes, so try swapping the operands.
5098  if (!BVN) {
5099    LHS = Op.getOperand(0);
5100    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
5101  }
5102  if (!BVN)
5103    return Op;
5104
5105  APInt CnstBits(VT.getSizeInBits(), 0);
5106  APInt UndefBits(VT.getSizeInBits(), 0);
5107  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
5108    // We make use of a little bit of goto ickiness in order to avoid having to
5109    // duplicate the immediate matching logic for the undef toggled case.
5110    bool SecondTry = false;
5111  AttemptModImm:
5112
5113    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
5114      CnstBits = CnstBits.zextOrTrunc(64);
5115      uint64_t CnstVal = CnstBits.getZExtValue();
5116
5117      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
5118        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
5119        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5120        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
5121                                  DAG.getConstant(CnstVal, MVT::i32),
5122                                  DAG.getConstant(0, MVT::i32));
5123        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5124      }
5125
5126      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
5127        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
5128        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5129        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
5130                                  DAG.getConstant(CnstVal, MVT::i32),
5131                                  DAG.getConstant(8, MVT::i32));
5132        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5133      }
5134
5135      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
5136        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
5137        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5138        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
5139                                  DAG.getConstant(CnstVal, MVT::i32),
5140                                  DAG.getConstant(16, MVT::i32));
5141        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5142      }
5143
5144      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
5145        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
5146        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5147        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
5148                                  DAG.getConstant(CnstVal, MVT::i32),
5149                                  DAG.getConstant(24, MVT::i32));
5150        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5151      }
5152
5153      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
5154        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
5155        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
5156        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
5157                                  DAG.getConstant(CnstVal, MVT::i32),
5158                                  DAG.getConstant(0, MVT::i32));
5159        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5160      }
5161
5162      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
5163        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
5164        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
5165        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
5166                                  DAG.getConstant(CnstVal, MVT::i32),
5167                                  DAG.getConstant(8, MVT::i32));
5168        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5169      }
5170    }
5171
5172    if (SecondTry)
5173      goto FailedModImm;
5174    SecondTry = true;
5175    CnstBits = UndefBits;
5176    goto AttemptModImm;
5177  }
5178
5179// We can always fall back to a non-immediate OR.
5180FailedModImm:
5181  return Op;
5182}
5183
5184// Normalize the operands of BUILD_VECTOR. The value of constant operands will
5185// be truncated to fit element width.
5186static SDValue NormalizeBuildVector(SDValue Op,
5187                                    SelectionDAG &DAG) {
5188  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
5189  SDLoc dl(Op);
5190  EVT VT = Op.getValueType();
5191  EVT EltTy= VT.getVectorElementType();
5192
5193  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
5194    return Op;
5195
5196  SmallVector<SDValue, 16> Ops;
5197  for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
5198    SDValue Lane = Op.getOperand(I);
5199    if (Lane.getOpcode() == ISD::Constant) {
5200      APInt LowBits(EltTy.getSizeInBits(),
5201                    cast<ConstantSDNode>(Lane)->getZExtValue());
5202      Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32);
5203    }
5204    Ops.push_back(Lane);
5205  }
5206  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5207}
5208
5209SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
5210                                                 SelectionDAG &DAG) const {
5211  SDLoc dl(Op);
5212  EVT VT = Op.getValueType();
5213  Op = NormalizeBuildVector(Op, DAG);
5214  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
5215
5216  APInt CnstBits(VT.getSizeInBits(), 0);
5217  APInt UndefBits(VT.getSizeInBits(), 0);
5218  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
5219    // We make use of a little bit of goto ickiness in order to avoid having to
5220    // duplicate the immediate matching logic for the undef toggled case.
5221    bool SecondTry = false;
5222  AttemptModImm:
5223
5224    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
5225      CnstBits = CnstBits.zextOrTrunc(64);
5226      uint64_t CnstVal = CnstBits.getZExtValue();
5227
5228      // Certain magic vector constants (used to express things like NOT
5229      // and NEG) are passed through unmodified.  This allows codegen patterns
5230      // for these operations to match.  Special-purpose patterns will lower
5231      // these immediates to MOVIs if it proves necessary.
5232      if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
5233        return Op;
5234
5235      // The many faces of MOVI...
5236      if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
5237        CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
5238        if (VT.getSizeInBits() == 128) {
5239          SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
5240                                    DAG.getConstant(CnstVal, MVT::i32));
5241          return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5242        }
5243
5244        // Support the V64 version via subregister insertion.
5245        SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
5246                                  DAG.getConstant(CnstVal, MVT::i32));
5247        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5248      }
5249
5250      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
5251        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
5252        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5253        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
5254                                  DAG.getConstant(CnstVal, MVT::i32),
5255                                  DAG.getConstant(0, MVT::i32));
5256        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5257      }
5258
5259      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
5260        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
5261        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5262        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
5263                                  DAG.getConstant(CnstVal, MVT::i32),
5264                                  DAG.getConstant(8, MVT::i32));
5265        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5266      }
5267
5268      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
5269        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
5270        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5271        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
5272                                  DAG.getConstant(CnstVal, MVT::i32),
5273                                  DAG.getConstant(16, MVT::i32));
5274        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5275      }
5276
5277      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
5278        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
5279        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5280        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
5281                                  DAG.getConstant(CnstVal, MVT::i32),
5282                                  DAG.getConstant(24, MVT::i32));
5283        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5284      }
5285
5286      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
5287        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
5288        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
5289        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
5290                                  DAG.getConstant(CnstVal, MVT::i32),
5291                                  DAG.getConstant(0, MVT::i32));
5292        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5293      }
5294
5295      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
5296        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
5297        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
5298        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
5299                                  DAG.getConstant(CnstVal, MVT::i32),
5300                                  DAG.getConstant(8, MVT::i32));
5301        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5302      }
5303
5304      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
5305        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
5306        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5307        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
5308                                  DAG.getConstant(CnstVal, MVT::i32),
5309                                  DAG.getConstant(264, MVT::i32));
5310        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5311      }
5312
5313      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
5314        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
5315        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5316        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
5317                                  DAG.getConstant(CnstVal, MVT::i32),
5318                                  DAG.getConstant(272, MVT::i32));
5319        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5320      }
5321
5322      if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
5323        CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
5324        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
5325        SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
5326                                  DAG.getConstant(CnstVal, MVT::i32));
5327        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5328      }
5329
5330      // The few faces of FMOV...
5331      if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
5332        CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
5333        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
5334        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
5335                                  DAG.getConstant(CnstVal, MVT::i32));
5336        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5337      }
5338
5339      if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
5340          VT.getSizeInBits() == 128) {
5341        CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
5342        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
5343                                  DAG.getConstant(CnstVal, MVT::i32));
5344        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5345      }
5346
5347      // The many faces of MVNI...
5348      CnstVal = ~CnstVal;
5349      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
5350        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
5351        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5352        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
5353                                  DAG.getConstant(CnstVal, MVT::i32),
5354                                  DAG.getConstant(0, MVT::i32));
5355        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5356      }
5357
5358      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
5359        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
5360        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5361        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
5362                                  DAG.getConstant(CnstVal, MVT::i32),
5363                                  DAG.getConstant(8, MVT::i32));
5364        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5365      }
5366
5367      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
5368        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
5369        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5370        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
5371                                  DAG.getConstant(CnstVal, MVT::i32),
5372                                  DAG.getConstant(16, MVT::i32));
5373        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5374      }
5375
5376      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
5377        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
5378        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5379        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
5380                                  DAG.getConstant(CnstVal, MVT::i32),
5381                                  DAG.getConstant(24, MVT::i32));
5382        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5383      }
5384
5385      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
5386        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
5387        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
5388        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
5389                                  DAG.getConstant(CnstVal, MVT::i32),
5390                                  DAG.getConstant(0, MVT::i32));
5391        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5392      }
5393
5394      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
5395        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
5396        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
5397        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
5398                                  DAG.getConstant(CnstVal, MVT::i32),
5399                                  DAG.getConstant(8, MVT::i32));
5400        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5401      }
5402
5403      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
5404        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
5405        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5406        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
5407                                  DAG.getConstant(CnstVal, MVT::i32),
5408                                  DAG.getConstant(264, MVT::i32));
5409        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5410      }
5411
5412      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
5413        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
5414        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
5415        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
5416                                  DAG.getConstant(CnstVal, MVT::i32),
5417                                  DAG.getConstant(272, MVT::i32));
5418        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
5419      }
5420    }
5421
5422    if (SecondTry)
5423      goto FailedModImm;
5424    SecondTry = true;
5425    CnstBits = UndefBits;
5426    goto AttemptModImm;
5427  }
5428FailedModImm:
5429
5430  // Scan through the operands to find some interesting properties we can
5431  // exploit:
5432  //   1) If only one value is used, we can use a DUP, or
5433  //   2) if only the low element is not undef, we can just insert that, or
5434  //   3) if only one constant value is used (w/ some non-constant lanes),
5435  //      we can splat the constant value into the whole vector then fill
5436  //      in the non-constant lanes.
5437  //   4) FIXME: If different constant values are used, but we can intelligently
5438  //             select the values we'll be overwriting for the non-constant
5439  //             lanes such that we can directly materialize the vector
5440  //             some other way (MOVI, e.g.), we can be sneaky.
5441  unsigned NumElts = VT.getVectorNumElements();
5442  bool isOnlyLowElement = true;
5443  bool usesOnlyOneValue = true;
5444  bool usesOnlyOneConstantValue = true;
5445  bool isConstant = true;
5446  unsigned NumConstantLanes = 0;
5447  SDValue Value;
5448  SDValue ConstantValue;
5449  for (unsigned i = 0; i < NumElts; ++i) {
5450    SDValue V = Op.getOperand(i);
5451    if (V.getOpcode() == ISD::UNDEF)
5452      continue;
5453    if (i > 0)
5454      isOnlyLowElement = false;
5455    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
5456      isConstant = false;
5457
5458    if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
5459      ++NumConstantLanes;
5460      if (!ConstantValue.getNode())
5461        ConstantValue = V;
5462      else if (ConstantValue != V)
5463        usesOnlyOneConstantValue = false;
5464    }
5465
5466    if (!Value.getNode())
5467      Value = V;
5468    else if (V != Value)
5469      usesOnlyOneValue = false;
5470  }
5471
5472  if (!Value.getNode())
5473    return DAG.getUNDEF(VT);
5474
5475  if (isOnlyLowElement)
5476    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
5477
5478  // Use DUP for non-constant splats.  For f32 constant splats, reduce to
5479  // i32 and try again.
5480  if (usesOnlyOneValue) {
5481    if (!isConstant) {
5482      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5483          Value.getValueType() != VT)
5484        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
5485
5486      // This is actually a DUPLANExx operation, which keeps everything vectory.
5487
5488      // DUPLANE works on 128-bit vectors, widen it if necessary.
5489      SDValue Lane = Value.getOperand(1);
5490      Value = Value.getOperand(0);
5491      if (Value.getValueType().getSizeInBits() == 64)
5492        Value = WidenVector(Value, DAG);
5493
5494      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
5495      return DAG.getNode(Opcode, dl, VT, Value, Lane);
5496    }
5497
5498    if (VT.getVectorElementType().isFloatingPoint()) {
5499      SmallVector<SDValue, 8> Ops;
5500      MVT NewType =
5501          (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
5502      for (unsigned i = 0; i < NumElts; ++i)
5503        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
5504      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
5505      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
5506      Val = LowerBUILD_VECTOR(Val, DAG);
5507      if (Val.getNode())
5508        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
5509    }
5510  }
5511
5512  // If there was only one constant value used and for more than one lane,
5513  // start by splatting that value, then replace the non-constant lanes. This
5514  // is better than the default, which will perform a separate initialization
5515  // for each lane.
5516  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
5517    SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
5518    // Now insert the non-constant lanes.
5519    for (unsigned i = 0; i < NumElts; ++i) {
5520      SDValue V = Op.getOperand(i);
5521      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
5522      if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
5523        // Note that type legalization likely mucked about with the VT of the
5524        // source operand, so we may have to convert it here before inserting.
5525        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
5526      }
5527    }
5528    return Val;
5529  }
5530
5531  // If all elements are constants and the case above didn't get hit, fall back
5532  // to the default expansion, which will generate a load from the constant
5533  // pool.
5534  if (isConstant)
5535    return SDValue();
5536
5537  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
5538  if (NumElts >= 4) {
5539    SDValue shuffle = ReconstructShuffle(Op, DAG);
5540    if (shuffle != SDValue())
5541      return shuffle;
5542  }
5543
5544  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
5545  // know the default expansion would otherwise fall back on something even
5546  // worse. For a vector with one or two non-undef values, that's
5547  // scalar_to_vector for the elements followed by a shuffle (provided the
5548  // shuffle is valid for the target) and materialization element by element
5549  // on the stack followed by a load for everything else.
5550  if (!isConstant && !usesOnlyOneValue) {
5551    SDValue Vec = DAG.getUNDEF(VT);
5552    SDValue Op0 = Op.getOperand(0);
5553    unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
5554    unsigned i = 0;
5555    // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
5556    // a) Avoid a RMW dependency on the full vector register, and
5557    // b) Allow the register coalescer to fold away the copy if the
5558    //    value is already in an S or D register.
5559    if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
5560      unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
5561      MachineSDNode *N =
5562          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
5563                             DAG.getTargetConstant(SubIdx, MVT::i32));
5564      Vec = SDValue(N, 0);
5565      ++i;
5566    }
5567    for (; i < NumElts; ++i) {
5568      SDValue V = Op.getOperand(i);
5569      if (V.getOpcode() == ISD::UNDEF)
5570        continue;
5571      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
5572      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
5573    }
5574    return Vec;
5575  }
5576
5577  // Just use the default expansion. We failed to find a better alternative.
5578  return SDValue();
5579}
5580
5581SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
5582                                                      SelectionDAG &DAG) const {
5583  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
5584
5585  // Check for non-constant lane.
5586  if (!isa<ConstantSDNode>(Op.getOperand(2)))
5587    return SDValue();
5588
5589  EVT VT = Op.getOperand(0).getValueType();
5590
5591  // Insertion/extraction are legal for V128 types.
5592  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
5593      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
5594    return Op;
5595
5596  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
5597      VT != MVT::v1i64 && VT != MVT::v2f32)
5598    return SDValue();
5599
5600  // For V64 types, we perform insertion by expanding the value
5601  // to a V128 type and perform the insertion on that.
5602  SDLoc DL(Op);
5603  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
5604  EVT WideTy = WideVec.getValueType();
5605
5606  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
5607                             Op.getOperand(1), Op.getOperand(2));
5608  // Re-narrow the resultant vector.
5609  return NarrowVector(Node, DAG);
5610}
5611
5612SDValue
5613AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
5614                                               SelectionDAG &DAG) const {
5615  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
5616
5617  // Check for non-constant lane.
5618  if (!isa<ConstantSDNode>(Op.getOperand(1)))
5619    return SDValue();
5620
5621  EVT VT = Op.getOperand(0).getValueType();
5622
5623  // Insertion/extraction are legal for V128 types.
5624  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
5625      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
5626    return Op;
5627
5628  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
5629      VT != MVT::v1i64 && VT != MVT::v2f32)
5630    return SDValue();
5631
5632  // For V64 types, we perform extraction by expanding the value
5633  // to a V128 type and perform the extraction on that.
5634  SDLoc DL(Op);
5635  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
5636  EVT WideTy = WideVec.getValueType();
5637
5638  EVT ExtrTy = WideTy.getVectorElementType();
5639  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
5640    ExtrTy = MVT::i32;
5641
5642  // For extractions, we just return the result directly.
5643  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
5644                     Op.getOperand(1));
5645}
5646
5647SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
5648                                                      SelectionDAG &DAG) const {
5649  EVT VT = Op.getOperand(0).getValueType();
5650  SDLoc dl(Op);
5651  // Just in case...
5652  if (!VT.isVector())
5653    return SDValue();
5654
5655  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5656  if (!Cst)
5657    return SDValue();
5658  unsigned Val = Cst->getZExtValue();
5659
5660  unsigned Size = Op.getValueType().getSizeInBits();
5661  if (Val == 0) {
5662    switch (Size) {
5663    case 8:
5664      return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
5665                                        Op.getOperand(0));
5666    case 16:
5667      return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
5668                                        Op.getOperand(0));
5669    case 32:
5670      return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
5671                                        Op.getOperand(0));
5672    case 64:
5673      return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
5674                                        Op.getOperand(0));
5675    default:
5676      llvm_unreachable("Unexpected vector type in extract_subvector!");
5677    }
5678  }
5679  // If this is extracting the upper 64-bits of a 128-bit vector, we match
5680  // that directly.
5681  if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
5682    return Op;
5683
5684  return SDValue();
5685}
5686
5687bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
5688                                               EVT VT) const {
5689  if (VT.getVectorNumElements() == 4 &&
5690      (VT.is128BitVector() || VT.is64BitVector())) {
5691    unsigned PFIndexes[4];
5692    for (unsigned i = 0; i != 4; ++i) {
5693      if (M[i] < 0)
5694        PFIndexes[i] = 8;
5695      else
5696        PFIndexes[i] = M[i];
5697    }
5698
5699    // Compute the index in the perfect shuffle table.
5700    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
5701                            PFIndexes[2] * 9 + PFIndexes[3];
5702    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
5703    unsigned Cost = (PFEntry >> 30);
5704
5705    if (Cost <= 4)
5706      return true;
5707  }
5708
5709  bool DummyBool;
5710  int DummyInt;
5711  unsigned DummyUnsigned;
5712
5713  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
5714          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
5715          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
5716          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
5717          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
5718          isZIPMask(M, VT, DummyUnsigned) ||
5719          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
5720          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
5721          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
5722          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
5723          isConcatMask(M, VT, VT.getSizeInBits() == 128));
5724}
5725
5726/// getVShiftImm - Check if this is a valid build_vector for the immediate
5727/// operand of a vector shift operation, where all the elements of the
5728/// build_vector must have the same constant integer value.
5729static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
5730  // Ignore bit_converts.
5731  while (Op.getOpcode() == ISD::BITCAST)
5732    Op = Op.getOperand(0);
5733  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
5734  APInt SplatBits, SplatUndef;
5735  unsigned SplatBitSize;
5736  bool HasAnyUndefs;
5737  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
5738                                    HasAnyUndefs, ElementBits) ||
5739      SplatBitSize > ElementBits)
5740    return false;
5741  Cnt = SplatBits.getSExtValue();
5742  return true;
5743}
5744
5745/// isVShiftLImm - Check if this is a valid build_vector for the immediate
5746/// operand of a vector shift left operation.  That value must be in the range:
5747///   0 <= Value < ElementBits for a left shift; or
5748///   0 <= Value <= ElementBits for a long left shift.
5749static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
5750  assert(VT.isVector() && "vector shift count is not a vector type");
5751  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
5752  if (!getVShiftImm(Op, ElementBits, Cnt))
5753    return false;
5754  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
5755}
5756
5757/// isVShiftRImm - Check if this is a valid build_vector for the immediate
5758/// operand of a vector shift right operation.  For a shift opcode, the value
5759/// is positive, but for an intrinsic the value count must be negative. The
5760/// absolute value must be in the range:
5761///   1 <= |Value| <= ElementBits for a right shift; or
5762///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
5763static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
5764                         int64_t &Cnt) {
5765  assert(VT.isVector() && "vector shift count is not a vector type");
5766  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
5767  if (!getVShiftImm(Op, ElementBits, Cnt))
5768    return false;
5769  if (isIntrinsic)
5770    Cnt = -Cnt;
5771  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
5772}
5773
5774SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
5775                                                      SelectionDAG &DAG) const {
5776  EVT VT = Op.getValueType();
5777  SDLoc DL(Op);
5778  int64_t Cnt;
5779
5780  if (!Op.getOperand(1).getValueType().isVector())
5781    return Op;
5782  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5783
5784  switch (Op.getOpcode()) {
5785  default:
5786    llvm_unreachable("unexpected shift opcode");
5787
5788  case ISD::SHL:
5789    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
5790      return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
5791                         DAG.getConstant(Cnt, MVT::i32));
5792    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
5793                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32),
5794                       Op.getOperand(0), Op.getOperand(1));
5795  case ISD::SRA:
5796  case ISD::SRL:
5797    // Right shift immediate
5798    if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
5799        Cnt < EltSize) {
5800      unsigned Opc =
5801          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
5802      return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
5803                         DAG.getConstant(Cnt, MVT::i32));
5804    }
5805
5806    // Right shift register.  Note, there is not a shift right register
5807    // instruction, but the shift left register instruction takes a signed
5808    // value, where negative numbers specify a right shift.
5809    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
5810                                                : Intrinsic::aarch64_neon_ushl;
5811    // negate the shift amount
5812    SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
5813    SDValue NegShiftLeft =
5814        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
5815                    DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
5816    return NegShiftLeft;
5817  }
5818
5819  return SDValue();
5820}
5821
5822static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
5823                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
5824                                    SDLoc dl, SelectionDAG &DAG) {
5825  EVT SrcVT = LHS.getValueType();
5826
5827  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
5828  APInt CnstBits(VT.getSizeInBits(), 0);
5829  APInt UndefBits(VT.getSizeInBits(), 0);
5830  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
5831  bool IsZero = IsCnst && (CnstBits == 0);
5832
5833  if (SrcVT.getVectorElementType().isFloatingPoint()) {
5834    switch (CC) {
5835    default:
5836      return SDValue();
5837    case AArch64CC::NE: {
5838      SDValue Fcmeq;
5839      if (IsZero)
5840        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
5841      else
5842        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
5843      return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
5844    }
5845    case AArch64CC::EQ:
5846      if (IsZero)
5847        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
5848      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
5849    case AArch64CC::GE:
5850      if (IsZero)
5851        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
5852      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
5853    case AArch64CC::GT:
5854      if (IsZero)
5855        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
5856      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
5857    case AArch64CC::LS:
5858      if (IsZero)
5859        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
5860      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
5861    case AArch64CC::LT:
5862      if (!NoNans)
5863        return SDValue();
5864    // If we ignore NaNs then we can use to the MI implementation.
5865    // Fallthrough.
5866    case AArch64CC::MI:
5867      if (IsZero)
5868        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
5869      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
5870    }
5871  }
5872
5873  switch (CC) {
5874  default:
5875    return SDValue();
5876  case AArch64CC::NE: {
5877    SDValue Cmeq;
5878    if (IsZero)
5879      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
5880    else
5881      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
5882    return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
5883  }
5884  case AArch64CC::EQ:
5885    if (IsZero)
5886      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
5887    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
5888  case AArch64CC::GE:
5889    if (IsZero)
5890      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
5891    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
5892  case AArch64CC::GT:
5893    if (IsZero)
5894      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
5895    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
5896  case AArch64CC::LE:
5897    if (IsZero)
5898      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
5899    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
5900  case AArch64CC::LS:
5901    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
5902  case AArch64CC::LO:
5903    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
5904  case AArch64CC::LT:
5905    if (IsZero)
5906      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
5907    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
5908  case AArch64CC::HI:
5909    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
5910  case AArch64CC::HS:
5911    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
5912  }
5913}
5914
5915SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
5916                                           SelectionDAG &DAG) const {
5917  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
5918  SDValue LHS = Op.getOperand(0);
5919  SDValue RHS = Op.getOperand(1);
5920  SDLoc dl(Op);
5921
5922  if (LHS.getValueType().getVectorElementType().isInteger()) {
5923    assert(LHS.getValueType() == RHS.getValueType());
5924    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
5925    return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
5926                                dl, DAG);
5927  }
5928
5929  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
5930         LHS.getValueType().getVectorElementType() == MVT::f64);
5931
5932  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
5933  // clean.  Some of them require two branches to implement.
5934  AArch64CC::CondCode CC1, CC2;
5935  bool ShouldInvert;
5936  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
5937
5938  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
5939  SDValue Cmp =
5940      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
5941  if (!Cmp.getNode())
5942    return SDValue();
5943
5944  if (CC2 != AArch64CC::AL) {
5945    SDValue Cmp2 =
5946        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
5947    if (!Cmp2.getNode())
5948      return SDValue();
5949
5950    Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
5951  }
5952
5953  if (ShouldInvert)
5954    return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
5955
5956  return Cmp;
5957}
5958
5959/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
5960/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
5961/// specified in the intrinsic calls.
5962bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5963                                               const CallInst &I,
5964                                               unsigned Intrinsic) const {
5965  switch (Intrinsic) {
5966  case Intrinsic::aarch64_neon_ld2:
5967  case Intrinsic::aarch64_neon_ld3:
5968  case Intrinsic::aarch64_neon_ld4:
5969  case Intrinsic::aarch64_neon_ld1x2:
5970  case Intrinsic::aarch64_neon_ld1x3:
5971  case Intrinsic::aarch64_neon_ld1x4:
5972  case Intrinsic::aarch64_neon_ld2lane:
5973  case Intrinsic::aarch64_neon_ld3lane:
5974  case Intrinsic::aarch64_neon_ld4lane:
5975  case Intrinsic::aarch64_neon_ld2r:
5976  case Intrinsic::aarch64_neon_ld3r:
5977  case Intrinsic::aarch64_neon_ld4r: {
5978    Info.opc = ISD::INTRINSIC_W_CHAIN;
5979    // Conservatively set memVT to the entire set of vectors loaded.
5980    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
5981    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
5982    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
5983    Info.offset = 0;
5984    Info.align = 0;
5985    Info.vol = false; // volatile loads with NEON intrinsics not supported
5986    Info.readMem = true;
5987    Info.writeMem = false;
5988    return true;
5989  }
5990  case Intrinsic::aarch64_neon_st2:
5991  case Intrinsic::aarch64_neon_st3:
5992  case Intrinsic::aarch64_neon_st4:
5993  case Intrinsic::aarch64_neon_st1x2:
5994  case Intrinsic::aarch64_neon_st1x3:
5995  case Intrinsic::aarch64_neon_st1x4:
5996  case Intrinsic::aarch64_neon_st2lane:
5997  case Intrinsic::aarch64_neon_st3lane:
5998  case Intrinsic::aarch64_neon_st4lane: {
5999    Info.opc = ISD::INTRINSIC_VOID;
6000    // Conservatively set memVT to the entire set of vectors stored.
6001    unsigned NumElts = 0;
6002    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
6003      Type *ArgTy = I.getArgOperand(ArgI)->getType();
6004      if (!ArgTy->isVectorTy())
6005        break;
6006      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
6007    }
6008    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
6009    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
6010    Info.offset = 0;
6011    Info.align = 0;
6012    Info.vol = false; // volatile stores with NEON intrinsics not supported
6013    Info.readMem = false;
6014    Info.writeMem = true;
6015    return true;
6016  }
6017  case Intrinsic::aarch64_ldaxr:
6018  case Intrinsic::aarch64_ldxr: {
6019    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
6020    Info.opc = ISD::INTRINSIC_W_CHAIN;
6021    Info.memVT = MVT::getVT(PtrTy->getElementType());
6022    Info.ptrVal = I.getArgOperand(0);
6023    Info.offset = 0;
6024    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
6025    Info.vol = true;
6026    Info.readMem = true;
6027    Info.writeMem = false;
6028    return true;
6029  }
6030  case Intrinsic::aarch64_stlxr:
6031  case Intrinsic::aarch64_stxr: {
6032    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
6033    Info.opc = ISD::INTRINSIC_W_CHAIN;
6034    Info.memVT = MVT::getVT(PtrTy->getElementType());
6035    Info.ptrVal = I.getArgOperand(1);
6036    Info.offset = 0;
6037    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
6038    Info.vol = true;
6039    Info.readMem = false;
6040    Info.writeMem = true;
6041    return true;
6042  }
6043  case Intrinsic::aarch64_ldaxp:
6044  case Intrinsic::aarch64_ldxp: {
6045    Info.opc = ISD::INTRINSIC_W_CHAIN;
6046    Info.memVT = MVT::i128;
6047    Info.ptrVal = I.getArgOperand(0);
6048    Info.offset = 0;
6049    Info.align = 16;
6050    Info.vol = true;
6051    Info.readMem = true;
6052    Info.writeMem = false;
6053    return true;
6054  }
6055  case Intrinsic::aarch64_stlxp:
6056  case Intrinsic::aarch64_stxp: {
6057    Info.opc = ISD::INTRINSIC_W_CHAIN;
6058    Info.memVT = MVT::i128;
6059    Info.ptrVal = I.getArgOperand(2);
6060    Info.offset = 0;
6061    Info.align = 16;
6062    Info.vol = true;
6063    Info.readMem = false;
6064    Info.writeMem = true;
6065    return true;
6066  }
6067  default:
6068    break;
6069  }
6070
6071  return false;
6072}
6073
6074// Truncations from 64-bit GPR to 32-bit GPR is free.
6075bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
6076  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
6077    return false;
6078  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
6079  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
6080  return NumBits1 > NumBits2;
6081}
6082bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
6083  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
6084    return false;
6085  unsigned NumBits1 = VT1.getSizeInBits();
6086  unsigned NumBits2 = VT2.getSizeInBits();
6087  return NumBits1 > NumBits2;
6088}
6089
6090// All 32-bit GPR operations implicitly zero the high-half of the corresponding
6091// 64-bit GPR.
6092bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
6093  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
6094    return false;
6095  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
6096  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
6097  return NumBits1 == 32 && NumBits2 == 64;
6098}
6099bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
6100  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
6101    return false;
6102  unsigned NumBits1 = VT1.getSizeInBits();
6103  unsigned NumBits2 = VT2.getSizeInBits();
6104  return NumBits1 == 32 && NumBits2 == 64;
6105}
6106
6107bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
6108  EVT VT1 = Val.getValueType();
6109  if (isZExtFree(VT1, VT2)) {
6110    return true;
6111  }
6112
6113  if (Val.getOpcode() != ISD::LOAD)
6114    return false;
6115
6116  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
6117  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
6118          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
6119          VT1.getSizeInBits() <= 32);
6120}
6121
6122bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
6123                                          unsigned &RequiredAligment) const {
6124  if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
6125    return false;
6126  // Cyclone supports unaligned accesses.
6127  RequiredAligment = 0;
6128  unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
6129  return NumBits == 32 || NumBits == 64;
6130}
6131
6132bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
6133                                          unsigned &RequiredAligment) const {
6134  if (!LoadedType.isSimple() ||
6135      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
6136    return false;
6137  // Cyclone supports unaligned accesses.
6138  RequiredAligment = 0;
6139  unsigned NumBits = LoadedType.getSizeInBits();
6140  return NumBits == 32 || NumBits == 64;
6141}
6142
6143static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
6144                       unsigned AlignCheck) {
6145  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
6146          (DstAlign == 0 || DstAlign % AlignCheck == 0));
6147}
6148
6149EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
6150                                               unsigned SrcAlign, bool IsMemset,
6151                                               bool ZeroMemset,
6152                                               bool MemcpyStrSrc,
6153                                               MachineFunction &MF) const {
6154  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
6155  // instruction to materialize the v2i64 zero and one store (with restrictive
6156  // addressing mode). Just do two i64 store of zero-registers.
6157  bool Fast;
6158  const Function *F = MF.getFunction();
6159  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
6160      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
6161                                       Attribute::NoImplicitFloat) &&
6162      (memOpAlign(SrcAlign, DstAlign, 16) ||
6163       (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
6164    return MVT::f128;
6165
6166  return Size >= 8 ? MVT::i64 : MVT::i32;
6167}
6168
6169// 12-bit optionally shifted immediates are legal for adds.
6170bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
6171  if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
6172    return true;
6173  return false;
6174}
6175
6176// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
6177// immediates is the same as for an add or a sub.
6178bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
6179  if (Immed < 0)
6180    Immed *= -1;
6181  return isLegalAddImmediate(Immed);
6182}
6183
6184/// isLegalAddressingMode - Return true if the addressing mode represented
6185/// by AM is legal for this target, for a load/store of the specified type.
6186bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
6187                                                  Type *Ty) const {
6188  // AArch64 has five basic addressing modes:
6189  //  reg
6190  //  reg + 9-bit signed offset
6191  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
6192  //  reg1 + reg2
6193  //  reg + SIZE_IN_BYTES * reg
6194
6195  // No global is ever allowed as a base.
6196  if (AM.BaseGV)
6197    return false;
6198
6199  // No reg+reg+imm addressing.
6200  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
6201    return false;
6202
6203  // check reg + imm case:
6204  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
6205  uint64_t NumBytes = 0;
6206  if (Ty->isSized()) {
6207    uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
6208    NumBytes = NumBits / 8;
6209    if (!isPowerOf2_64(NumBits))
6210      NumBytes = 0;
6211  }
6212
6213  if (!AM.Scale) {
6214    int64_t Offset = AM.BaseOffs;
6215
6216    // 9-bit signed offset
6217    if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
6218      return true;
6219
6220    // 12-bit unsigned offset
6221    unsigned shift = Log2_64(NumBytes);
6222    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
6223        // Must be a multiple of NumBytes (NumBytes is a power of 2)
6224        (Offset >> shift) << shift == Offset)
6225      return true;
6226    return false;
6227  }
6228
6229  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
6230
6231  if (!AM.Scale || AM.Scale == 1 ||
6232      (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
6233    return true;
6234  return false;
6235}
6236
6237int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
6238                                                Type *Ty) const {
6239  // Scaling factors are not free at all.
6240  // Operands                     | Rt Latency
6241  // -------------------------------------------
6242  // Rt, [Xn, Xm]                 | 4
6243  // -------------------------------------------
6244  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
6245  // Rt, [Xn, Wm, <extend> #imm]  |
6246  if (isLegalAddressingMode(AM, Ty))
6247    // Scale represents reg2 * scale, thus account for 1 if
6248    // it is not equal to 0 or 1.
6249    return AM.Scale != 0 && AM.Scale != 1;
6250  return -1;
6251}
6252
6253bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
6254  VT = VT.getScalarType();
6255
6256  if (!VT.isSimple())
6257    return false;
6258
6259  switch (VT.getSimpleVT().SimpleTy) {
6260  case MVT::f32:
6261  case MVT::f64:
6262    return true;
6263  default:
6264    break;
6265  }
6266
6267  return false;
6268}
6269
6270const MCPhysReg *
6271AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
6272  // LR is a callee-save register, but we must treat it as clobbered by any call
6273  // site. Hence we include LR in the scratch registers, which are in turn added
6274  // as implicit-defs for stackmaps and patchpoints.
6275  static const MCPhysReg ScratchRegs[] = {
6276    AArch64::X16, AArch64::X17, AArch64::LR, 0
6277  };
6278  return ScratchRegs;
6279}
6280
6281bool
6282AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
6283  EVT VT = N->getValueType(0);
6284    // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
6285    // it with shift to let it be lowered to UBFX.
6286  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
6287      isa<ConstantSDNode>(N->getOperand(1))) {
6288    uint64_t TruncMask = N->getConstantOperandVal(1);
6289    if (isMask_64(TruncMask) &&
6290      N->getOperand(0).getOpcode() == ISD::SRL &&
6291      isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
6292      return false;
6293  }
6294  return true;
6295}
6296
6297bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
6298                                                              Type *Ty) const {
6299  assert(Ty->isIntegerTy());
6300
6301  unsigned BitSize = Ty->getPrimitiveSizeInBits();
6302  if (BitSize == 0)
6303    return false;
6304
6305  int64_t Val = Imm.getSExtValue();
6306  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
6307    return true;
6308
6309  if ((int64_t)Val < 0)
6310    Val = ~Val;
6311  if (BitSize == 32)
6312    Val &= (1LL << 32) - 1;
6313
6314  unsigned LZ = countLeadingZeros((uint64_t)Val);
6315  unsigned Shift = (63 - LZ) / 16;
6316  // MOVZ is free so return true for one or fewer MOVK.
6317  return (Shift < 3) ? true : false;
6318}
6319
6320// Generate SUBS and CSEL for integer abs.
6321static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
6322  EVT VT = N->getValueType(0);
6323
6324  SDValue N0 = N->getOperand(0);
6325  SDValue N1 = N->getOperand(1);
6326  SDLoc DL(N);
6327
6328  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
6329  // and change it to SUB and CSEL.
6330  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
6331      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
6332      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
6333    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
6334      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
6335        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
6336                                  N0.getOperand(0));
6337        // Generate SUBS & CSEL.
6338        SDValue Cmp =
6339            DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6340                        N0.getOperand(0), DAG.getConstant(0, VT));
6341        return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
6342                           DAG.getConstant(AArch64CC::PL, MVT::i32),
6343                           SDValue(Cmp.getNode(), 1));
6344      }
6345  return SDValue();
6346}
6347
6348// performXorCombine - Attempts to handle integer ABS.
6349static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
6350                                 TargetLowering::DAGCombinerInfo &DCI,
6351                                 const AArch64Subtarget *Subtarget) {
6352  if (DCI.isBeforeLegalizeOps())
6353    return SDValue();
6354
6355  return performIntegerAbsCombine(N, DAG);
6356}
6357
6358static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
6359                                 TargetLowering::DAGCombinerInfo &DCI,
6360                                 const AArch64Subtarget *Subtarget) {
6361  if (DCI.isBeforeLegalizeOps())
6362    return SDValue();
6363
6364  // Multiplication of a power of two plus/minus one can be done more
6365  // cheaply as as shift+add/sub. For now, this is true unilaterally. If
6366  // future CPUs have a cheaper MADD instruction, this may need to be
6367  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
6368  // 64-bit is 5 cycles, so this is always a win.
6369  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
6370    APInt Value = C->getAPIntValue();
6371    EVT VT = N->getValueType(0);
6372    if (Value.isNonNegative()) {
6373      // (mul x, 2^N + 1) => (add (shl x, N), x)
6374      APInt VM1 = Value - 1;
6375      if (VM1.isPowerOf2()) {
6376        SDValue ShiftedVal =
6377            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
6378                        DAG.getConstant(VM1.logBase2(), MVT::i64));
6379        return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal,
6380                           N->getOperand(0));
6381      }
6382      // (mul x, 2^N - 1) => (sub (shl x, N), x)
6383      APInt VP1 = Value + 1;
6384      if (VP1.isPowerOf2()) {
6385        SDValue ShiftedVal =
6386            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
6387                        DAG.getConstant(VP1.logBase2(), MVT::i64));
6388        return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal,
6389                           N->getOperand(0));
6390      }
6391    } else {
6392      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
6393      APInt VNM1 = -Value - 1;
6394      if (VNM1.isPowerOf2()) {
6395        SDValue ShiftedVal =
6396            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
6397                        DAG.getConstant(VNM1.logBase2(), MVT::i64));
6398        SDValue Add =
6399            DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
6400        return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
6401      }
6402      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
6403      APInt VNP1 = -Value + 1;
6404      if (VNP1.isPowerOf2()) {
6405        SDValue ShiftedVal =
6406            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
6407                        DAG.getConstant(VNP1.logBase2(), MVT::i64));
6408        return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
6409                           ShiftedVal);
6410      }
6411    }
6412  }
6413  return SDValue();
6414}
6415
6416static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
6417  EVT VT = N->getValueType(0);
6418  if (VT != MVT::f32 && VT != MVT::f64)
6419    return SDValue();
6420  // Only optimize when the source and destination types have the same width.
6421  if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
6422    return SDValue();
6423
6424  // If the result of an integer load is only used by an integer-to-float
6425  // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
6426  // This eliminates an "integer-to-vector-move UOP and improve throughput.
6427  SDValue N0 = N->getOperand(0);
6428  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
6429      // Do not change the width of a volatile load.
6430      !cast<LoadSDNode>(N0)->isVolatile()) {
6431    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
6432    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
6433                               LN0->getPointerInfo(), LN0->isVolatile(),
6434                               LN0->isNonTemporal(), LN0->isInvariant(),
6435                               LN0->getAlignment());
6436
6437    // Make sure successors of the original load stay after it by updating them
6438    // to use the new Chain.
6439    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
6440
6441    unsigned Opcode =
6442        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
6443    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
6444  }
6445
6446  return SDValue();
6447}
6448
6449/// An EXTR instruction is made up of two shifts, ORed together. This helper
6450/// searches for and classifies those shifts.
6451static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
6452                         bool &FromHi) {
6453  if (N.getOpcode() == ISD::SHL)
6454    FromHi = false;
6455  else if (N.getOpcode() == ISD::SRL)
6456    FromHi = true;
6457  else
6458    return false;
6459
6460  if (!isa<ConstantSDNode>(N.getOperand(1)))
6461    return false;
6462
6463  ShiftAmount = N->getConstantOperandVal(1);
6464  Src = N->getOperand(0);
6465  return true;
6466}
6467
6468/// EXTR instruction extracts a contiguous chunk of bits from two existing
6469/// registers viewed as a high/low pair. This function looks for the pattern:
6470/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
6471/// EXTR. Can't quite be done in TableGen because the two immediates aren't
6472/// independent.
6473static SDValue tryCombineToEXTR(SDNode *N,
6474                                TargetLowering::DAGCombinerInfo &DCI) {
6475  SelectionDAG &DAG = DCI.DAG;
6476  SDLoc DL(N);
6477  EVT VT = N->getValueType(0);
6478
6479  assert(N->getOpcode() == ISD::OR && "Unexpected root");
6480
6481  if (VT != MVT::i32 && VT != MVT::i64)
6482    return SDValue();
6483
6484  SDValue LHS;
6485  uint32_t ShiftLHS = 0;
6486  bool LHSFromHi = 0;
6487  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
6488    return SDValue();
6489
6490  SDValue RHS;
6491  uint32_t ShiftRHS = 0;
6492  bool RHSFromHi = 0;
6493  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
6494    return SDValue();
6495
6496  // If they're both trying to come from the high part of the register, they're
6497  // not really an EXTR.
6498  if (LHSFromHi == RHSFromHi)
6499    return SDValue();
6500
6501  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
6502    return SDValue();
6503
6504  if (LHSFromHi) {
6505    std::swap(LHS, RHS);
6506    std::swap(ShiftLHS, ShiftRHS);
6507  }
6508
6509  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
6510                     DAG.getConstant(ShiftRHS, MVT::i64));
6511}
6512
6513static SDValue tryCombineToBSL(SDNode *N,
6514                                TargetLowering::DAGCombinerInfo &DCI) {
6515  EVT VT = N->getValueType(0);
6516  SelectionDAG &DAG = DCI.DAG;
6517  SDLoc DL(N);
6518
6519  if (!VT.isVector())
6520    return SDValue();
6521
6522  SDValue N0 = N->getOperand(0);
6523  if (N0.getOpcode() != ISD::AND)
6524    return SDValue();
6525
6526  SDValue N1 = N->getOperand(1);
6527  if (N1.getOpcode() != ISD::AND)
6528    return SDValue();
6529
6530  // We only have to look for constant vectors here since the general, variable
6531  // case can be handled in TableGen.
6532  unsigned Bits = VT.getVectorElementType().getSizeInBits();
6533  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
6534  for (int i = 1; i >= 0; --i)
6535    for (int j = 1; j >= 0; --j) {
6536      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
6537      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
6538      if (!BVN0 || !BVN1)
6539        continue;
6540
6541      bool FoundMatch = true;
6542      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
6543        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
6544        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
6545        if (!CN0 || !CN1 ||
6546            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
6547          FoundMatch = false;
6548          break;
6549        }
6550      }
6551
6552      if (FoundMatch)
6553        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
6554                           N0->getOperand(1 - i), N1->getOperand(1 - j));
6555    }
6556
6557  return SDValue();
6558}
6559
6560static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
6561                                const AArch64Subtarget *Subtarget) {
6562  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
6563  if (!EnableAArch64ExtrGeneration)
6564    return SDValue();
6565  SelectionDAG &DAG = DCI.DAG;
6566  EVT VT = N->getValueType(0);
6567
6568  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
6569    return SDValue();
6570
6571  SDValue Res = tryCombineToEXTR(N, DCI);
6572  if (Res.getNode())
6573    return Res;
6574
6575  Res = tryCombineToBSL(N, DCI);
6576  if (Res.getNode())
6577    return Res;
6578
6579  return SDValue();
6580}
6581
6582static SDValue performBitcastCombine(SDNode *N,
6583                                     TargetLowering::DAGCombinerInfo &DCI,
6584                                     SelectionDAG &DAG) {
6585  // Wait 'til after everything is legalized to try this. That way we have
6586  // legal vector types and such.
6587  if (DCI.isBeforeLegalizeOps())
6588    return SDValue();
6589
6590  // Remove extraneous bitcasts around an extract_subvector.
6591  // For example,
6592  //    (v4i16 (bitconvert
6593  //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
6594  //  becomes
6595  //    (extract_subvector ((v8i16 ...), (i64 4)))
6596
6597  // Only interested in 64-bit vectors as the ultimate result.
6598  EVT VT = N->getValueType(0);
6599  if (!VT.isVector())
6600    return SDValue();
6601  if (VT.getSimpleVT().getSizeInBits() != 64)
6602    return SDValue();
6603  // Is the operand an extract_subvector starting at the beginning or halfway
6604  // point of the vector? A low half may also come through as an
6605  // EXTRACT_SUBREG, so look for that, too.
6606  SDValue Op0 = N->getOperand(0);
6607  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
6608      !(Op0->isMachineOpcode() &&
6609        Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
6610    return SDValue();
6611  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
6612  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6613    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
6614      return SDValue();
6615  } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
6616    if (idx != AArch64::dsub)
6617      return SDValue();
6618    // The dsub reference is equivalent to a lane zero subvector reference.
6619    idx = 0;
6620  }
6621  // Look through the bitcast of the input to the extract.
6622  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
6623    return SDValue();
6624  SDValue Source = Op0->getOperand(0)->getOperand(0);
6625  // If the source type has twice the number of elements as our destination
6626  // type, we know this is an extract of the high or low half of the vector.
6627  EVT SVT = Source->getValueType(0);
6628  if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
6629    return SDValue();
6630
6631  DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
6632
6633  // Create the simplified form to just extract the low or high half of the
6634  // vector directly rather than bothering with the bitcasts.
6635  SDLoc dl(N);
6636  unsigned NumElements = VT.getVectorNumElements();
6637  if (idx) {
6638    SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
6639    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
6640  } else {
6641    SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32);
6642    return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
6643                                      Source, SubReg),
6644                   0);
6645  }
6646}
6647
6648static SDValue performConcatVectorsCombine(SDNode *N,
6649                                           TargetLowering::DAGCombinerInfo &DCI,
6650                                           SelectionDAG &DAG) {
6651  // Wait 'til after everything is legalized to try this. That way we have
6652  // legal vector types and such.
6653  if (DCI.isBeforeLegalizeOps())
6654    return SDValue();
6655
6656  SDLoc dl(N);
6657  EVT VT = N->getValueType(0);
6658
6659  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
6660  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
6661  // canonicalise to that.
6662  if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
6663    assert(VT.getVectorElementType().getSizeInBits() == 64);
6664    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT,
6665                       WidenVector(N->getOperand(0), DAG),
6666                       DAG.getConstant(0, MVT::i64));
6667  }
6668
6669  // Canonicalise concat_vectors so that the right-hand vector has as few
6670  // bit-casts as possible before its real operation. The primary matching
6671  // destination for these operations will be the narrowing "2" instructions,
6672  // which depend on the operation being performed on this right-hand vector.
6673  // For example,
6674  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
6675  // becomes
6676  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
6677
6678  SDValue Op1 = N->getOperand(1);
6679  if (Op1->getOpcode() != ISD::BITCAST)
6680    return SDValue();
6681  SDValue RHS = Op1->getOperand(0);
6682  MVT RHSTy = RHS.getValueType().getSimpleVT();
6683  // If the RHS is not a vector, this is not the pattern we're looking for.
6684  if (!RHSTy.isVector())
6685    return SDValue();
6686
6687  DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
6688
6689  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
6690                                  RHSTy.getVectorNumElements() * 2);
6691  return DAG.getNode(
6692      ISD::BITCAST, dl, VT,
6693      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
6694                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
6695}
6696
6697static SDValue tryCombineFixedPointConvert(SDNode *N,
6698                                           TargetLowering::DAGCombinerInfo &DCI,
6699                                           SelectionDAG &DAG) {
6700  // Wait 'til after everything is legalized to try this. That way we have
6701  // legal vector types and such.
6702  if (DCI.isBeforeLegalizeOps())
6703    return SDValue();
6704  // Transform a scalar conversion of a value from a lane extract into a
6705  // lane extract of a vector conversion. E.g., from foo1 to foo2:
6706  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
6707  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
6708  //
6709  // The second form interacts better with instruction selection and the
6710  // register allocator to avoid cross-class register copies that aren't
6711  // coalescable due to a lane reference.
6712
6713  // Check the operand and see if it originates from a lane extract.
6714  SDValue Op1 = N->getOperand(1);
6715  if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
6716    // Yep, no additional predication needed. Perform the transform.
6717    SDValue IID = N->getOperand(0);
6718    SDValue Shift = N->getOperand(2);
6719    SDValue Vec = Op1.getOperand(0);
6720    SDValue Lane = Op1.getOperand(1);
6721    EVT ResTy = N->getValueType(0);
6722    EVT VecResTy;
6723    SDLoc DL(N);
6724
6725    // The vector width should be 128 bits by the time we get here, even
6726    // if it started as 64 bits (the extract_vector handling will have
6727    // done so).
6728    assert(Vec.getValueType().getSizeInBits() == 128 &&
6729           "unexpected vector size on extract_vector_elt!");
6730    if (Vec.getValueType() == MVT::v4i32)
6731      VecResTy = MVT::v4f32;
6732    else if (Vec.getValueType() == MVT::v2i64)
6733      VecResTy = MVT::v2f64;
6734    else
6735      llvm_unreachable("unexpected vector type!");
6736
6737    SDValue Convert =
6738        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
6739    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
6740  }
6741  return SDValue();
6742}
6743
6744// AArch64 high-vector "long" operations are formed by performing the non-high
6745// version on an extract_subvector of each operand which gets the high half:
6746//
6747//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
6748//
6749// However, there are cases which don't have an extract_high explicitly, but
6750// have another operation that can be made compatible with one for free. For
6751// example:
6752//
6753//  (dupv64 scalar) --> (extract_high (dup128 scalar))
6754//
6755// This routine does the actual conversion of such DUPs, once outer routines
6756// have determined that everything else is in order.
6757static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
6758  // We can handle most types of duplicate, but the lane ones have an extra
6759  // operand saying *which* lane, so we need to know.
6760  bool IsDUPLANE;
6761  switch (N.getOpcode()) {
6762  case AArch64ISD::DUP:
6763    IsDUPLANE = false;
6764    break;
6765  case AArch64ISD::DUPLANE8:
6766  case AArch64ISD::DUPLANE16:
6767  case AArch64ISD::DUPLANE32:
6768  case AArch64ISD::DUPLANE64:
6769    IsDUPLANE = true;
6770    break;
6771  default:
6772    return SDValue();
6773  }
6774
6775  MVT NarrowTy = N.getSimpleValueType();
6776  if (!NarrowTy.is64BitVector())
6777    return SDValue();
6778
6779  MVT ElementTy = NarrowTy.getVectorElementType();
6780  unsigned NumElems = NarrowTy.getVectorNumElements();
6781  MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
6782
6783  SDValue NewDUP;
6784  if (IsDUPLANE)
6785    NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
6786                         N.getOperand(1));
6787  else
6788    NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
6789
6790  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
6791                     NewDUP, DAG.getConstant(NumElems, MVT::i64));
6792}
6793
6794static bool isEssentiallyExtractSubvector(SDValue N) {
6795  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
6796    return true;
6797
6798  return N.getOpcode() == ISD::BITCAST &&
6799         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
6800}
6801
6802/// \brief Helper structure to keep track of ISD::SET_CC operands.
6803struct GenericSetCCInfo {
6804  const SDValue *Opnd0;
6805  const SDValue *Opnd1;
6806  ISD::CondCode CC;
6807};
6808
6809/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
6810struct AArch64SetCCInfo {
6811  const SDValue *Cmp;
6812  AArch64CC::CondCode CC;
6813};
6814
6815/// \brief Helper structure to keep track of SetCC information.
6816union SetCCInfo {
6817  GenericSetCCInfo Generic;
6818  AArch64SetCCInfo AArch64;
6819};
6820
6821/// \brief Helper structure to be able to read SetCC information.  If set to
6822/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
6823/// GenericSetCCInfo.
6824struct SetCCInfoAndKind {
6825  SetCCInfo Info;
6826  bool IsAArch64;
6827};
6828
6829/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
6830/// an
6831/// AArch64 lowered one.
6832/// \p SetCCInfo is filled accordingly.
6833/// \post SetCCInfo is meanginfull only when this function returns true.
6834/// \return True when Op is a kind of SET_CC operation.
6835static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
6836  // If this is a setcc, this is straight forward.
6837  if (Op.getOpcode() == ISD::SETCC) {
6838    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
6839    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
6840    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6841    SetCCInfo.IsAArch64 = false;
6842    return true;
6843  }
6844  // Otherwise, check if this is a matching csel instruction.
6845  // In other words:
6846  // - csel 1, 0, cc
6847  // - csel 0, 1, !cc
6848  if (Op.getOpcode() != AArch64ISD::CSEL)
6849    return false;
6850  // Set the information about the operands.
6851  // TODO: we want the operands of the Cmp not the csel
6852  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
6853  SetCCInfo.IsAArch64 = true;
6854  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
6855      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
6856
6857  // Check that the operands matches the constraints:
6858  // (1) Both operands must be constants.
6859  // (2) One must be 1 and the other must be 0.
6860  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
6861  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6862
6863  // Check (1).
6864  if (!TValue || !FValue)
6865    return false;
6866
6867  // Check (2).
6868  if (!TValue->isOne()) {
6869    // Update the comparison when we are interested in !cc.
6870    std::swap(TValue, FValue);
6871    SetCCInfo.Info.AArch64.CC =
6872        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
6873  }
6874  return TValue->isOne() && FValue->isNullValue();
6875}
6876
6877// Returns true if Op is setcc or zext of setcc.
6878static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
6879  if (isSetCC(Op, Info))
6880    return true;
6881  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
6882    isSetCC(Op->getOperand(0), Info));
6883}
6884
6885// The folding we want to perform is:
6886// (add x, [zext] (setcc cc ...) )
6887//   -->
6888// (csel x, (add x, 1), !cc ...)
6889//
6890// The latter will get matched to a CSINC instruction.
6891static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
6892  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
6893  SDValue LHS = Op->getOperand(0);
6894  SDValue RHS = Op->getOperand(1);
6895  SetCCInfoAndKind InfoAndKind;
6896
6897  // If neither operand is a SET_CC, give up.
6898  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
6899    std::swap(LHS, RHS);
6900    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
6901      return SDValue();
6902  }
6903
6904  // FIXME: This could be generatized to work for FP comparisons.
6905  EVT CmpVT = InfoAndKind.IsAArch64
6906                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
6907                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
6908  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
6909    return SDValue();
6910
6911  SDValue CCVal;
6912  SDValue Cmp;
6913  SDLoc dl(Op);
6914  if (InfoAndKind.IsAArch64) {
6915    CCVal = DAG.getConstant(
6916        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32);
6917    Cmp = *InfoAndKind.Info.AArch64.Cmp;
6918  } else
6919    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
6920                      *InfoAndKind.Info.Generic.Opnd1,
6921                      ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
6922                      CCVal, DAG, dl);
6923
6924  EVT VT = Op->getValueType(0);
6925  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
6926  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
6927}
6928
6929// The basic add/sub long vector instructions have variants with "2" on the end
6930// which act on the high-half of their inputs. They are normally matched by
6931// patterns like:
6932//
6933// (add (zeroext (extract_high LHS)),
6934//      (zeroext (extract_high RHS)))
6935// -> uaddl2 vD, vN, vM
6936//
6937// However, if one of the extracts is something like a duplicate, this
6938// instruction can still be used profitably. This function puts the DAG into a
6939// more appropriate form for those patterns to trigger.
6940static SDValue performAddSubLongCombine(SDNode *N,
6941                                        TargetLowering::DAGCombinerInfo &DCI,
6942                                        SelectionDAG &DAG) {
6943  if (DCI.isBeforeLegalizeOps())
6944    return SDValue();
6945
6946  MVT VT = N->getSimpleValueType(0);
6947  if (!VT.is128BitVector()) {
6948    if (N->getOpcode() == ISD::ADD)
6949      return performSetccAddFolding(N, DAG);
6950    return SDValue();
6951  }
6952
6953  // Make sure both branches are extended in the same way.
6954  SDValue LHS = N->getOperand(0);
6955  SDValue RHS = N->getOperand(1);
6956  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
6957       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
6958      LHS.getOpcode() != RHS.getOpcode())
6959    return SDValue();
6960
6961  unsigned ExtType = LHS.getOpcode();
6962
6963  // It's not worth doing if at least one of the inputs isn't already an
6964  // extract, but we don't know which it'll be so we have to try both.
6965  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
6966    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
6967    if (!RHS.getNode())
6968      return SDValue();
6969
6970    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
6971  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
6972    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
6973    if (!LHS.getNode())
6974      return SDValue();
6975
6976    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
6977  }
6978
6979  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
6980}
6981
6982// Massage DAGs which we can use the high-half "long" operations on into
6983// something isel will recognize better. E.g.
6984//
6985// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
6986//   (aarch64_neon_umull (extract_high (v2i64 vec)))
6987//                     (extract_high (v2i64 (dup128 scalar)))))
6988//
6989static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
6990                                       TargetLowering::DAGCombinerInfo &DCI,
6991                                       SelectionDAG &DAG) {
6992  if (DCI.isBeforeLegalizeOps())
6993    return SDValue();
6994
6995  SDValue LHS = N->getOperand(1);
6996  SDValue RHS = N->getOperand(2);
6997  assert(LHS.getValueType().is64BitVector() &&
6998         RHS.getValueType().is64BitVector() &&
6999         "unexpected shape for long operation");
7000
7001  // Either node could be a DUP, but it's not worth doing both of them (you'd
7002  // just as well use the non-high version) so look for a corresponding extract
7003  // operation on the other "wing".
7004  if (isEssentiallyExtractSubvector(LHS)) {
7005    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
7006    if (!RHS.getNode())
7007      return SDValue();
7008  } else if (isEssentiallyExtractSubvector(RHS)) {
7009    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
7010    if (!LHS.getNode())
7011      return SDValue();
7012  }
7013
7014  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
7015                     N->getOperand(0), LHS, RHS);
7016}
7017
7018static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
7019  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
7020  unsigned ElemBits = ElemTy.getSizeInBits();
7021
7022  int64_t ShiftAmount;
7023  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
7024    APInt SplatValue, SplatUndef;
7025    unsigned SplatBitSize;
7026    bool HasAnyUndefs;
7027    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
7028                              HasAnyUndefs, ElemBits) ||
7029        SplatBitSize != ElemBits)
7030      return SDValue();
7031
7032    ShiftAmount = SplatValue.getSExtValue();
7033  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
7034    ShiftAmount = CVN->getSExtValue();
7035  } else
7036    return SDValue();
7037
7038  unsigned Opcode;
7039  bool IsRightShift;
7040  switch (IID) {
7041  default:
7042    llvm_unreachable("Unknown shift intrinsic");
7043  case Intrinsic::aarch64_neon_sqshl:
7044    Opcode = AArch64ISD::SQSHL_I;
7045    IsRightShift = false;
7046    break;
7047  case Intrinsic::aarch64_neon_uqshl:
7048    Opcode = AArch64ISD::UQSHL_I;
7049    IsRightShift = false;
7050    break;
7051  case Intrinsic::aarch64_neon_srshl:
7052    Opcode = AArch64ISD::SRSHR_I;
7053    IsRightShift = true;
7054    break;
7055  case Intrinsic::aarch64_neon_urshl:
7056    Opcode = AArch64ISD::URSHR_I;
7057    IsRightShift = true;
7058    break;
7059  case Intrinsic::aarch64_neon_sqshlu:
7060    Opcode = AArch64ISD::SQSHLU_I;
7061    IsRightShift = false;
7062    break;
7063  }
7064
7065  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
7066    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
7067                       DAG.getConstant(-ShiftAmount, MVT::i32));
7068  else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits)
7069    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
7070                       DAG.getConstant(ShiftAmount, MVT::i32));
7071
7072  return SDValue();
7073}
7074
7075// The CRC32[BH] instructions ignore the high bits of their data operand. Since
7076// the intrinsics must be legal and take an i32, this means there's almost
7077// certainly going to be a zext in the DAG which we can eliminate.
7078static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
7079  SDValue AndN = N->getOperand(2);
7080  if (AndN.getOpcode() != ISD::AND)
7081    return SDValue();
7082
7083  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
7084  if (!CMask || CMask->getZExtValue() != Mask)
7085    return SDValue();
7086
7087  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
7088                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
7089}
7090
7091static SDValue performIntrinsicCombine(SDNode *N,
7092                                       TargetLowering::DAGCombinerInfo &DCI,
7093                                       const AArch64Subtarget *Subtarget) {
7094  SelectionDAG &DAG = DCI.DAG;
7095  unsigned IID = getIntrinsicID(N);
7096  switch (IID) {
7097  default:
7098    break;
7099  case Intrinsic::aarch64_neon_vcvtfxs2fp:
7100  case Intrinsic::aarch64_neon_vcvtfxu2fp:
7101    return tryCombineFixedPointConvert(N, DCI, DAG);
7102    break;
7103  case Intrinsic::aarch64_neon_fmax:
7104    return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
7105                       N->getOperand(1), N->getOperand(2));
7106  case Intrinsic::aarch64_neon_fmin:
7107    return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0),
7108                       N->getOperand(1), N->getOperand(2));
7109  case Intrinsic::aarch64_neon_smull:
7110  case Intrinsic::aarch64_neon_umull:
7111  case Intrinsic::aarch64_neon_pmull:
7112  case Intrinsic::aarch64_neon_sqdmull:
7113    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
7114  case Intrinsic::aarch64_neon_sqshl:
7115  case Intrinsic::aarch64_neon_uqshl:
7116  case Intrinsic::aarch64_neon_sqshlu:
7117  case Intrinsic::aarch64_neon_srshl:
7118  case Intrinsic::aarch64_neon_urshl:
7119    return tryCombineShiftImm(IID, N, DAG);
7120  case Intrinsic::aarch64_crc32b:
7121  case Intrinsic::aarch64_crc32cb:
7122    return tryCombineCRC32(0xff, N, DAG);
7123  case Intrinsic::aarch64_crc32h:
7124  case Intrinsic::aarch64_crc32ch:
7125    return tryCombineCRC32(0xffff, N, DAG);
7126  }
7127  return SDValue();
7128}
7129
7130static SDValue performExtendCombine(SDNode *N,
7131                                    TargetLowering::DAGCombinerInfo &DCI,
7132                                    SelectionDAG &DAG) {
7133  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
7134  // we can convert that DUP into another extract_high (of a bigger DUP), which
7135  // helps the backend to decide that an sabdl2 would be useful, saving a real
7136  // extract_high operation.
7137  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
7138      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
7139    SDNode *ABDNode = N->getOperand(0).getNode();
7140    unsigned IID = getIntrinsicID(ABDNode);
7141    if (IID == Intrinsic::aarch64_neon_sabd ||
7142        IID == Intrinsic::aarch64_neon_uabd) {
7143      SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
7144      if (!NewABD.getNode())
7145        return SDValue();
7146
7147      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
7148                         NewABD);
7149    }
7150  }
7151
7152  // This is effectively a custom type legalization for AArch64.
7153  //
7154  // Type legalization will split an extend of a small, legal, type to a larger
7155  // illegal type by first splitting the destination type, often creating
7156  // illegal source types, which then get legalized in isel-confusing ways,
7157  // leading to really terrible codegen. E.g.,
7158  //   %result = v8i32 sext v8i8 %value
7159  // becomes
7160  //   %losrc = extract_subreg %value, ...
7161  //   %hisrc = extract_subreg %value, ...
7162  //   %lo = v4i32 sext v4i8 %losrc
7163  //   %hi = v4i32 sext v4i8 %hisrc
7164  // Things go rapidly downhill from there.
7165  //
7166  // For AArch64, the [sz]ext vector instructions can only go up one element
7167  // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
7168  // take two instructions.
7169  //
7170  // This implies that the most efficient way to do the extend from v8i8
7171  // to two v4i32 values is to first extend the v8i8 to v8i16, then do
7172  // the normal splitting to happen for the v8i16->v8i32.
7173
7174  // This is pre-legalization to catch some cases where the default
7175  // type legalization will create ill-tempered code.
7176  if (!DCI.isBeforeLegalizeOps())
7177    return SDValue();
7178
7179  // We're only interested in cleaning things up for non-legal vector types
7180  // here. If both the source and destination are legal, things will just
7181  // work naturally without any fiddling.
7182  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7183  EVT ResVT = N->getValueType(0);
7184  if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
7185    return SDValue();
7186  // If the vector type isn't a simple VT, it's beyond the scope of what
7187  // we're  worried about here. Let legalization do its thing and hope for
7188  // the best.
7189  if (!ResVT.isSimple())
7190    return SDValue();
7191
7192  SDValue Src = N->getOperand(0);
7193  MVT SrcVT = Src->getValueType(0).getSimpleVT();
7194  // If the source VT is a 64-bit vector, we can play games and get the
7195  // better results we want.
7196  if (SrcVT.getSizeInBits() != 64)
7197    return SDValue();
7198
7199  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
7200  unsigned ElementCount = SrcVT.getVectorNumElements();
7201  SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
7202  SDLoc DL(N);
7203  Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
7204
7205  // Now split the rest of the operation into two halves, each with a 64
7206  // bit source.
7207  EVT LoVT, HiVT;
7208  SDValue Lo, Hi;
7209  unsigned NumElements = ResVT.getVectorNumElements();
7210  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
7211  LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
7212                                 ResVT.getVectorElementType(), NumElements / 2);
7213
7214  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
7215                               LoVT.getVectorNumElements());
7216  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
7217                   DAG.getIntPtrConstant(0));
7218  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
7219                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
7220  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
7221  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
7222
7223  // Now combine the parts back together so we still have a single result
7224  // like the combiner expects.
7225  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
7226}
7227
7228/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
7229/// value. The load store optimizer pass will merge them to store pair stores.
7230/// This has better performance than a splat of the scalar followed by a split
7231/// vector store. Even if the stores are not merged it is four stores vs a dup,
7232/// followed by an ext.b and two stores.
7233static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
7234  SDValue StVal = St->getValue();
7235  EVT VT = StVal.getValueType();
7236
7237  // Don't replace floating point stores, they possibly won't be transformed to
7238  // stp because of the store pair suppress pass.
7239  if (VT.isFloatingPoint())
7240    return SDValue();
7241
7242  // Check for insert vector elements.
7243  if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
7244    return SDValue();
7245
7246  // We can express a splat as store pair(s) for 2 or 4 elements.
7247  unsigned NumVecElts = VT.getVectorNumElements();
7248  if (NumVecElts != 4 && NumVecElts != 2)
7249    return SDValue();
7250  SDValue SplatVal = StVal.getOperand(1);
7251  unsigned RemainInsertElts = NumVecElts - 1;
7252
7253  // Check that this is a splat.
7254  while (--RemainInsertElts) {
7255    SDValue NextInsertElt = StVal.getOperand(0);
7256    if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
7257      return SDValue();
7258    if (NextInsertElt.getOperand(1) != SplatVal)
7259      return SDValue();
7260    StVal = NextInsertElt;
7261  }
7262  unsigned OrigAlignment = St->getAlignment();
7263  unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
7264  unsigned Alignment = std::min(OrigAlignment, EltOffset);
7265
7266  // Create scalar stores. This is at least as good as the code sequence for a
7267  // split unaligned store wich is a dup.s, ext.b, and two stores.
7268  // Most of the time the three stores should be replaced by store pair
7269  // instructions (stp).
7270  SDLoc DL(St);
7271  SDValue BasePtr = St->getBasePtr();
7272  SDValue NewST1 =
7273      DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
7274                   St->isVolatile(), St->isNonTemporal(), St->getAlignment());
7275
7276  unsigned Offset = EltOffset;
7277  while (--NumVecElts) {
7278    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
7279                                    DAG.getConstant(Offset, MVT::i64));
7280    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
7281                          St->getPointerInfo(), St->isVolatile(),
7282                          St->isNonTemporal(), Alignment);
7283    Offset += EltOffset;
7284  }
7285  return NewST1;
7286}
7287
7288static SDValue performSTORECombine(SDNode *N,
7289                                   TargetLowering::DAGCombinerInfo &DCI,
7290                                   SelectionDAG &DAG,
7291                                   const AArch64Subtarget *Subtarget) {
7292  if (!DCI.isBeforeLegalize())
7293    return SDValue();
7294
7295  StoreSDNode *S = cast<StoreSDNode>(N);
7296  if (S->isVolatile())
7297    return SDValue();
7298
7299  // Cyclone has bad performance on unaligned 16B stores when crossing line and
7300  // page boundries. We want to split such stores.
7301  if (!Subtarget->isCyclone())
7302    return SDValue();
7303
7304  // Don't split at Oz.
7305  MachineFunction &MF = DAG.getMachineFunction();
7306  bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
7307      AttributeSet::FunctionIndex, Attribute::MinSize);
7308  if (IsMinSize)
7309    return SDValue();
7310
7311  SDValue StVal = S->getValue();
7312  EVT VT = StVal.getValueType();
7313
7314  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
7315  // those up regresses performance on micro-benchmarks and olden/bh.
7316  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
7317    return SDValue();
7318
7319  // Split unaligned 16B stores. They are terrible for performance.
7320  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
7321  // extensions can use this to mark that it does not want splitting to happen
7322  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
7323  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
7324  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
7325      S->getAlignment() <= 2)
7326    return SDValue();
7327
7328  // If we get a splat of a scalar convert this vector store to a store of
7329  // scalars. They will be merged into store pairs thereby removing two
7330  // instructions.
7331  SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
7332  if (ReplacedSplat != SDValue())
7333    return ReplacedSplat;
7334
7335  SDLoc DL(S);
7336  unsigned NumElts = VT.getVectorNumElements() / 2;
7337  // Split VT into two.
7338  EVT HalfVT =
7339      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
7340  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
7341                                   DAG.getIntPtrConstant(0));
7342  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
7343                                   DAG.getIntPtrConstant(NumElts));
7344  SDValue BasePtr = S->getBasePtr();
7345  SDValue NewST1 =
7346      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
7347                   S->isVolatile(), S->isNonTemporal(), S->getAlignment());
7348  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
7349                                  DAG.getConstant(8, MVT::i64));
7350  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
7351                      S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
7352                      S->getAlignment());
7353}
7354
7355/// Target-specific DAG combine function for post-increment LD1 (lane) and
7356/// post-increment LD1R.
7357static SDValue performPostLD1Combine(SDNode *N,
7358                                     TargetLowering::DAGCombinerInfo &DCI,
7359                                     bool IsLaneOp) {
7360  if (DCI.isBeforeLegalizeOps())
7361    return SDValue();
7362
7363  SelectionDAG &DAG = DCI.DAG;
7364  EVT VT = N->getValueType(0);
7365
7366  unsigned LoadIdx = IsLaneOp ? 1 : 0;
7367  SDNode *LD = N->getOperand(LoadIdx).getNode();
7368  // If it is not LOAD, can not do such combine.
7369  if (LD->getOpcode() != ISD::LOAD)
7370    return SDValue();
7371
7372  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
7373  EVT MemVT = LoadSDN->getMemoryVT();
7374  // Check if memory operand is the same type as the vector element.
7375  if (MemVT != VT.getVectorElementType())
7376    return SDValue();
7377
7378  // Check if there are other uses. If so, do not combine as it will introduce
7379  // an extra load.
7380  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
7381       ++UI) {
7382    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
7383      continue;
7384    if (*UI != N)
7385      return SDValue();
7386  }
7387
7388  SDValue Addr = LD->getOperand(1);
7389  SDValue Vector = N->getOperand(0);
7390  // Search for a use of the address operand that is an increment.
7391  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
7392       Addr.getNode()->use_end(); UI != UE; ++UI) {
7393    SDNode *User = *UI;
7394    if (User->getOpcode() != ISD::ADD
7395        || UI.getUse().getResNo() != Addr.getResNo())
7396      continue;
7397
7398    // Check that the add is independent of the load.  Otherwise, folding it
7399    // would create a cycle.
7400    if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
7401      continue;
7402    // Also check that add is not used in the vector operand.  This would also
7403    // create a cycle.
7404    if (User->isPredecessorOf(Vector.getNode()))
7405      continue;
7406
7407    // If the increment is a constant, it must match the memory ref size.
7408    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
7409    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
7410      uint32_t IncVal = CInc->getZExtValue();
7411      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
7412      if (IncVal != NumBytes)
7413        continue;
7414      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
7415    }
7416
7417    SmallVector<SDValue, 8> Ops;
7418    Ops.push_back(LD->getOperand(0));  // Chain
7419    if (IsLaneOp) {
7420      Ops.push_back(Vector);           // The vector to be inserted
7421      Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
7422    }
7423    Ops.push_back(Addr);
7424    Ops.push_back(Inc);
7425
7426    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
7427    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
7428    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
7429    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
7430                                           MemVT,
7431                                           LoadSDN->getMemOperand());
7432
7433    // Update the uses.
7434    std::vector<SDValue> NewResults;
7435    NewResults.push_back(SDValue(LD, 0));             // The result of load
7436    NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
7437    DCI.CombineTo(LD, NewResults);
7438    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
7439    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
7440
7441    break;
7442  }
7443  return SDValue();
7444}
7445
7446/// Target-specific DAG combine function for NEON load/store intrinsics
7447/// to merge base address updates.
7448static SDValue performNEONPostLDSTCombine(SDNode *N,
7449                                          TargetLowering::DAGCombinerInfo &DCI,
7450                                          SelectionDAG &DAG) {
7451  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
7452    return SDValue();
7453
7454  unsigned AddrOpIdx = N->getNumOperands() - 1;
7455  SDValue Addr = N->getOperand(AddrOpIdx);
7456
7457  // Search for a use of the address operand that is an increment.
7458  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
7459       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
7460    SDNode *User = *UI;
7461    if (User->getOpcode() != ISD::ADD ||
7462        UI.getUse().getResNo() != Addr.getResNo())
7463      continue;
7464
7465    // Check that the add is independent of the load/store.  Otherwise, folding
7466    // it would create a cycle.
7467    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
7468      continue;
7469
7470    // Find the new opcode for the updating load/store.
7471    bool IsStore = false;
7472    bool IsLaneOp = false;
7473    bool IsDupOp = false;
7474    unsigned NewOpc = 0;
7475    unsigned NumVecs = 0;
7476    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
7477    switch (IntNo) {
7478    default: llvm_unreachable("unexpected intrinsic for Neon base update");
7479    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
7480      NumVecs = 2; break;
7481    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
7482      NumVecs = 3; break;
7483    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
7484      NumVecs = 4; break;
7485    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
7486      NumVecs = 2; IsStore = true; break;
7487    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
7488      NumVecs = 3; IsStore = true; break;
7489    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
7490      NumVecs = 4; IsStore = true; break;
7491    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
7492      NumVecs = 2; break;
7493    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
7494      NumVecs = 3; break;
7495    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
7496      NumVecs = 4; break;
7497    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
7498      NumVecs = 2; IsStore = true; break;
7499    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
7500      NumVecs = 3; IsStore = true; break;
7501    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
7502      NumVecs = 4; IsStore = true; break;
7503    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
7504      NumVecs = 2; IsDupOp = true; break;
7505    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
7506      NumVecs = 3; IsDupOp = true; break;
7507    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
7508      NumVecs = 4; IsDupOp = true; break;
7509    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
7510      NumVecs = 2; IsLaneOp = true; break;
7511    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
7512      NumVecs = 3; IsLaneOp = true; break;
7513    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
7514      NumVecs = 4; IsLaneOp = true; break;
7515    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
7516      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
7517    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
7518      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
7519    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
7520      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
7521    }
7522
7523    EVT VecTy;
7524    if (IsStore)
7525      VecTy = N->getOperand(2).getValueType();
7526    else
7527      VecTy = N->getValueType(0);
7528
7529    // If the increment is a constant, it must match the memory ref size.
7530    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
7531    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
7532      uint32_t IncVal = CInc->getZExtValue();
7533      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
7534      if (IsLaneOp || IsDupOp)
7535        NumBytes /= VecTy.getVectorNumElements();
7536      if (IncVal != NumBytes)
7537        continue;
7538      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
7539    }
7540    SmallVector<SDValue, 8> Ops;
7541    Ops.push_back(N->getOperand(0)); // Incoming chain
7542    // Load lane and store have vector list as input.
7543    if (IsLaneOp || IsStore)
7544      for (unsigned i = 2; i < AddrOpIdx; ++i)
7545        Ops.push_back(N->getOperand(i));
7546    Ops.push_back(Addr); // Base register
7547    Ops.push_back(Inc);
7548
7549    // Return Types.
7550    EVT Tys[6];
7551    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
7552    unsigned n;
7553    for (n = 0; n < NumResultVecs; ++n)
7554      Tys[n] = VecTy;
7555    Tys[n++] = MVT::i64;  // Type of write back register
7556    Tys[n] = MVT::Other;  // Type of the chain
7557    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
7558
7559    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
7560    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
7561                                           MemInt->getMemoryVT(),
7562                                           MemInt->getMemOperand());
7563
7564    // Update the uses.
7565    std::vector<SDValue> NewResults;
7566    for (unsigned i = 0; i < NumResultVecs; ++i) {
7567      NewResults.push_back(SDValue(UpdN.getNode(), i));
7568    }
7569    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
7570    DCI.CombineTo(N, NewResults);
7571    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
7572
7573    break;
7574  }
7575  return SDValue();
7576}
7577
7578// Optimize compare with zero and branch.
7579static SDValue performBRCONDCombine(SDNode *N,
7580                                    TargetLowering::DAGCombinerInfo &DCI,
7581                                    SelectionDAG &DAG) {
7582  SDValue Chain = N->getOperand(0);
7583  SDValue Dest = N->getOperand(1);
7584  SDValue CCVal = N->getOperand(2);
7585  SDValue Cmp = N->getOperand(3);
7586
7587  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
7588  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
7589  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
7590    return SDValue();
7591
7592  unsigned CmpOpc = Cmp.getOpcode();
7593  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
7594    return SDValue();
7595
7596  // Only attempt folding if there is only one use of the flag and no use of the
7597  // value.
7598  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
7599    return SDValue();
7600
7601  SDValue LHS = Cmp.getOperand(0);
7602  SDValue RHS = Cmp.getOperand(1);
7603
7604  assert(LHS.getValueType() == RHS.getValueType() &&
7605         "Expected the value type to be the same for both operands!");
7606  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
7607    return SDValue();
7608
7609  if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
7610    std::swap(LHS, RHS);
7611
7612  if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
7613    return SDValue();
7614
7615  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
7616      LHS.getOpcode() == ISD::SRL)
7617    return SDValue();
7618
7619  // Fold the compare into the branch instruction.
7620  SDValue BR;
7621  if (CC == AArch64CC::EQ)
7622    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
7623  else
7624    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
7625
7626  // Do not add new nodes to DAG combiner worklist.
7627  DCI.CombineTo(N, BR, false);
7628
7629  return SDValue();
7630}
7631
7632// vselect (v1i1 setcc) ->
7633//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
7634// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
7635// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
7636// such VSELECT.
7637static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
7638  SDValue N0 = N->getOperand(0);
7639  EVT CCVT = N0.getValueType();
7640
7641  if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
7642      CCVT.getVectorElementType() != MVT::i1)
7643    return SDValue();
7644
7645  EVT ResVT = N->getValueType(0);
7646  EVT CmpVT = N0.getOperand(0).getValueType();
7647  // Only combine when the result type is of the same size as the compared
7648  // operands.
7649  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
7650    return SDValue();
7651
7652  SDValue IfTrue = N->getOperand(1);
7653  SDValue IfFalse = N->getOperand(2);
7654  SDValue SetCC =
7655      DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
7656                   N0.getOperand(0), N0.getOperand(1),
7657                   cast<CondCodeSDNode>(N0.getOperand(2))->get());
7658  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
7659                     IfTrue, IfFalse);
7660}
7661
7662/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
7663/// the compare-mask instructions rather than going via NZCV, even if LHS and
7664/// RHS are really scalar. This replaces any scalar setcc in the above pattern
7665/// with a vector one followed by a DUP shuffle on the result.
7666static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
7667  SDValue N0 = N->getOperand(0);
7668  EVT ResVT = N->getValueType(0);
7669
7670  if (!N->getOperand(1).getValueType().isVector())
7671    return SDValue();
7672
7673  if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
7674    return SDValue();
7675
7676  SDLoc DL(N0);
7677
7678  EVT SrcVT = N0.getOperand(0).getValueType();
7679  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
7680                           ResVT.getSizeInBits() / SrcVT.getSizeInBits());
7681  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
7682
7683  // First perform a vector comparison, where lane 0 is the one we're interested
7684  // in.
7685  SDValue LHS =
7686      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
7687  SDValue RHS =
7688      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
7689  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
7690
7691  // Now duplicate the comparison mask we want across all other lanes.
7692  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
7693  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
7694  Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
7695                     Mask);
7696
7697  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
7698}
7699
7700SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
7701                                                 DAGCombinerInfo &DCI) const {
7702  SelectionDAG &DAG = DCI.DAG;
7703  switch (N->getOpcode()) {
7704  default:
7705    break;
7706  case ISD::ADD:
7707  case ISD::SUB:
7708    return performAddSubLongCombine(N, DCI, DAG);
7709  case ISD::XOR:
7710    return performXorCombine(N, DAG, DCI, Subtarget);
7711  case ISD::MUL:
7712    return performMulCombine(N, DAG, DCI, Subtarget);
7713  case ISD::SINT_TO_FP:
7714  case ISD::UINT_TO_FP:
7715    return performIntToFpCombine(N, DAG);
7716  case ISD::OR:
7717    return performORCombine(N, DCI, Subtarget);
7718  case ISD::INTRINSIC_WO_CHAIN:
7719    return performIntrinsicCombine(N, DCI, Subtarget);
7720  case ISD::ANY_EXTEND:
7721  case ISD::ZERO_EXTEND:
7722  case ISD::SIGN_EXTEND:
7723    return performExtendCombine(N, DCI, DAG);
7724  case ISD::BITCAST:
7725    return performBitcastCombine(N, DCI, DAG);
7726  case ISD::CONCAT_VECTORS:
7727    return performConcatVectorsCombine(N, DCI, DAG);
7728  case ISD::SELECT:
7729    return performSelectCombine(N, DAG);
7730  case ISD::VSELECT:
7731    return performVSelectCombine(N, DCI.DAG);
7732  case ISD::STORE:
7733    return performSTORECombine(N, DCI, DAG, Subtarget);
7734  case AArch64ISD::BRCOND:
7735    return performBRCONDCombine(N, DCI, DAG);
7736  case AArch64ISD::DUP:
7737    return performPostLD1Combine(N, DCI, false);
7738  case ISD::INSERT_VECTOR_ELT:
7739    return performPostLD1Combine(N, DCI, true);
7740  case ISD::INTRINSIC_VOID:
7741  case ISD::INTRINSIC_W_CHAIN:
7742    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
7743    case Intrinsic::aarch64_neon_ld2:
7744    case Intrinsic::aarch64_neon_ld3:
7745    case Intrinsic::aarch64_neon_ld4:
7746    case Intrinsic::aarch64_neon_ld1x2:
7747    case Intrinsic::aarch64_neon_ld1x3:
7748    case Intrinsic::aarch64_neon_ld1x4:
7749    case Intrinsic::aarch64_neon_ld2lane:
7750    case Intrinsic::aarch64_neon_ld3lane:
7751    case Intrinsic::aarch64_neon_ld4lane:
7752    case Intrinsic::aarch64_neon_ld2r:
7753    case Intrinsic::aarch64_neon_ld3r:
7754    case Intrinsic::aarch64_neon_ld4r:
7755    case Intrinsic::aarch64_neon_st2:
7756    case Intrinsic::aarch64_neon_st3:
7757    case Intrinsic::aarch64_neon_st4:
7758    case Intrinsic::aarch64_neon_st1x2:
7759    case Intrinsic::aarch64_neon_st1x3:
7760    case Intrinsic::aarch64_neon_st1x4:
7761    case Intrinsic::aarch64_neon_st2lane:
7762    case Intrinsic::aarch64_neon_st3lane:
7763    case Intrinsic::aarch64_neon_st4lane:
7764      return performNEONPostLDSTCombine(N, DCI, DAG);
7765    default:
7766      break;
7767    }
7768  }
7769  return SDValue();
7770}
7771
7772// Check if the return value is used as only a return value, as otherwise
7773// we can't perform a tail-call. In particular, we need to check for
7774// target ISD nodes that are returns and any other "odd" constructs
7775// that the generic analysis code won't necessarily catch.
7776bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
7777                                               SDValue &Chain) const {
7778  if (N->getNumValues() != 1)
7779    return false;
7780  if (!N->hasNUsesOfValue(1, 0))
7781    return false;
7782
7783  SDValue TCChain = Chain;
7784  SDNode *Copy = *N->use_begin();
7785  if (Copy->getOpcode() == ISD::CopyToReg) {
7786    // If the copy has a glue operand, we conservatively assume it isn't safe to
7787    // perform a tail call.
7788    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
7789        MVT::Glue)
7790      return false;
7791    TCChain = Copy->getOperand(0);
7792  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
7793    return false;
7794
7795  bool HasRet = false;
7796  for (SDNode *Node : Copy->uses()) {
7797    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
7798      return false;
7799    HasRet = true;
7800  }
7801
7802  if (!HasRet)
7803    return false;
7804
7805  Chain = TCChain;
7806  return true;
7807}
7808
7809// Return whether the an instruction can potentially be optimized to a tail
7810// call. This will cause the optimizers to attempt to move, or duplicate,
7811// return instructions to help enable tail call optimizations for this
7812// instruction.
7813bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
7814  if (!CI->isTailCall())
7815    return false;
7816
7817  return true;
7818}
7819
7820bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
7821                                                   SDValue &Offset,
7822                                                   ISD::MemIndexedMode &AM,
7823                                                   bool &IsInc,
7824                                                   SelectionDAG &DAG) const {
7825  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
7826    return false;
7827
7828  Base = Op->getOperand(0);
7829  // All of the indexed addressing mode instructions take a signed
7830  // 9 bit immediate offset.
7831  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
7832    int64_t RHSC = (int64_t)RHS->getZExtValue();
7833    if (RHSC >= 256 || RHSC <= -256)
7834      return false;
7835    IsInc = (Op->getOpcode() == ISD::ADD);
7836    Offset = Op->getOperand(1);
7837    return true;
7838  }
7839  return false;
7840}
7841
7842bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
7843                                                      SDValue &Offset,
7844                                                      ISD::MemIndexedMode &AM,
7845                                                      SelectionDAG &DAG) const {
7846  EVT VT;
7847  SDValue Ptr;
7848  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
7849    VT = LD->getMemoryVT();
7850    Ptr = LD->getBasePtr();
7851  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
7852    VT = ST->getMemoryVT();
7853    Ptr = ST->getBasePtr();
7854  } else
7855    return false;
7856
7857  bool IsInc;
7858  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
7859    return false;
7860  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
7861  return true;
7862}
7863
7864bool AArch64TargetLowering::getPostIndexedAddressParts(
7865    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
7866    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
7867  EVT VT;
7868  SDValue Ptr;
7869  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
7870    VT = LD->getMemoryVT();
7871    Ptr = LD->getBasePtr();
7872  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
7873    VT = ST->getMemoryVT();
7874    Ptr = ST->getBasePtr();
7875  } else
7876    return false;
7877
7878  bool IsInc;
7879  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
7880    return false;
7881  // Post-indexing updates the base, so it's not a valid transform
7882  // if that's not the same as the load's pointer.
7883  if (Ptr != Base)
7884    return false;
7885  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
7886  return true;
7887}
7888
7889void AArch64TargetLowering::ReplaceNodeResults(
7890    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
7891  switch (N->getOpcode()) {
7892  default:
7893    llvm_unreachable("Don't know how to custom expand this");
7894  case ISD::FP_TO_UINT:
7895  case ISD::FP_TO_SINT:
7896    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
7897    // Let normal code take care of it by not adding anything to Results.
7898    return;
7899  }
7900}
7901
7902bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
7903  // Loads and stores less than 128-bits are already atomic; ones above that
7904  // are doomed anyway, so defer to the default libcall and blame the OS when
7905  // things go wrong:
7906  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
7907    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
7908  else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
7909    return LI->getType()->getPrimitiveSizeInBits() == 128;
7910
7911  // For the real atomic operations, we have ldxr/stxr up to 128 bits.
7912  return Inst->getType()->getPrimitiveSizeInBits() <= 128;
7913}
7914
7915TargetLoweringBase::LegalizeTypeAction
7916AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
7917  MVT SVT = VT.getSimpleVT();
7918  // During type legalization, we prefer to widen v1i8, v1i16, v1i32  to v8i8,
7919  // v4i16, v2i32 instead of to promote.
7920  if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
7921      || SVT == MVT::v1f32)
7922    return TypeWidenVector;
7923
7924  return TargetLoweringBase::getPreferredVectorAction(VT);
7925}
7926
7927Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
7928                                             AtomicOrdering Ord) const {
7929  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
7930  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
7931  bool IsAcquire =
7932      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
7933
7934  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
7935  // intrinsic must return {i64, i64} and we have to recombine them into a
7936  // single i128 here.
7937  if (ValTy->getPrimitiveSizeInBits() == 128) {
7938    Intrinsic::ID Int =
7939        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
7940    Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
7941
7942    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
7943    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
7944
7945    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
7946    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
7947    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
7948    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
7949    return Builder.CreateOr(
7950        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
7951  }
7952
7953  Type *Tys[] = { Addr->getType() };
7954  Intrinsic::ID Int =
7955      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
7956  Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
7957
7958  return Builder.CreateTruncOrBitCast(
7959      Builder.CreateCall(Ldxr, Addr),
7960      cast<PointerType>(Addr->getType())->getElementType());
7961}
7962
7963Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
7964                                                   Value *Val, Value *Addr,
7965                                                   AtomicOrdering Ord) const {
7966  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
7967  bool IsRelease =
7968      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
7969
7970  // Since the intrinsics must have legal type, the i128 intrinsics take two
7971  // parameters: "i64, i64". We must marshal Val into the appropriate form
7972  // before the call.
7973  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
7974    Intrinsic::ID Int =
7975        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
7976    Function *Stxr = Intrinsic::getDeclaration(M, Int);
7977    Type *Int64Ty = Type::getInt64Ty(M->getContext());
7978
7979    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
7980    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
7981    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
7982    return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
7983  }
7984
7985  Intrinsic::ID Int =
7986      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
7987  Type *Tys[] = { Addr->getType() };
7988  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
7989
7990  return Builder.CreateCall2(
7991      Stxr, Builder.CreateZExtOrBitCast(
7992                Val, Stxr->getFunctionType()->getParamType(0)),
7993      Addr);
7994}
7995