AArch64ISelLowering.cpp revision 36c7806f4eacd676932ba630246f88e0e37b1cd4
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that AArch64 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "aarch64-isel"
16#include "AArch64.h"
17#include "AArch64ISelLowering.h"
18#include "AArch64MachineFunctionInfo.h"
19#include "AArch64TargetMachine.h"
20#include "AArch64TargetObjectFile.h"
21#include "Utils/AArch64BaseInfo.h"
22#include "llvm/CodeGen/Analysis.h"
23#include "llvm/CodeGen/CallingConvLower.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/CodeGen/MachineInstrBuilder.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
28#include "llvm/IR/CallingConv.h"
29
30using namespace llvm;
31
32static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
33  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
34
35  if (Subtarget->isTargetLinux())
36    return new AArch64LinuxTargetObjectFile();
37  if (Subtarget->isTargetELF())
38    return new TargetLoweringObjectFileELF();
39  llvm_unreachable("unknown subtarget type");
40}
41
42AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
43  : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
44
45  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
46
47  // SIMD compares set the entire lane's bits to 1
48  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
49
50  // Scalar register <-> type mapping
51  addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
52  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
53
54  if (Subtarget->hasFPARMv8()) {
55    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
56    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
57    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
58    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
59  }
60
61  if (Subtarget->hasNEON()) {
62    // And the vectors
63    addRegisterClass(MVT::v1i8,  &AArch64::FPR8RegClass);
64    addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
65    addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
66    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
67    addRegisterClass(MVT::v1f32, &AArch64::FPR32RegClass);
68    addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
69    addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass);
70    addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
71    addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
73    addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
74    addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
75    addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
76    addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
77    addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
78    addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
79    addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
80  }
81
82  computeRegisterProperties();
83
84  // We combine OR nodes for bitfield and NEON BSL operations.
85  setTargetDAGCombine(ISD::OR);
86
87  setTargetDAGCombine(ISD::AND);
88  setTargetDAGCombine(ISD::SRA);
89  setTargetDAGCombine(ISD::SRL);
90  setTargetDAGCombine(ISD::SHL);
91
92  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
93  setTargetDAGCombine(ISD::INTRINSIC_VOID);
94  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
95
96  // AArch64 does not have i1 loads, or much of anything for i1 really.
97  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
98  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
99  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
100
101  setStackPointerRegisterToSaveRestore(AArch64::XSP);
102  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
103  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
104  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
105
106  // We'll lower globals to wrappers for selection.
107  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
108  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
109
110  // A64 instructions have the comparison predicate attached to the user of the
111  // result, but having a separate comparison is valuable for matching.
112  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
113  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
114  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
115  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
116
117  setOperationAction(ISD::SELECT, MVT::i32, Custom);
118  setOperationAction(ISD::SELECT, MVT::i64, Custom);
119  setOperationAction(ISD::SELECT, MVT::f32, Custom);
120  setOperationAction(ISD::SELECT, MVT::f64, Custom);
121
122  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
123  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
124  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
125  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
126
127  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
128
129  setOperationAction(ISD::SETCC, MVT::i32, Custom);
130  setOperationAction(ISD::SETCC, MVT::i64, Custom);
131  setOperationAction(ISD::SETCC, MVT::f32, Custom);
132  setOperationAction(ISD::SETCC, MVT::f64, Custom);
133
134  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
135  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
136  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
137
138  setOperationAction(ISD::VASTART, MVT::Other, Custom);
139  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
140  setOperationAction(ISD::VAEND, MVT::Other, Expand);
141  setOperationAction(ISD::VAARG, MVT::Other, Expand);
142
143  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
144
145  setOperationAction(ISD::ROTL, MVT::i32, Expand);
146  setOperationAction(ISD::ROTL, MVT::i64, Expand);
147
148  setOperationAction(ISD::UREM, MVT::i32, Expand);
149  setOperationAction(ISD::UREM, MVT::i64, Expand);
150  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
151  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
152
153  setOperationAction(ISD::SREM, MVT::i32, Expand);
154  setOperationAction(ISD::SREM, MVT::i64, Expand);
155  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
156  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
157
158  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
159  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
160
161  // Legal floating-point operations.
162  setOperationAction(ISD::FABS, MVT::f32, Legal);
163  setOperationAction(ISD::FABS, MVT::f64, Legal);
164
165  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
166  setOperationAction(ISD::FCEIL, MVT::f64, Legal);
167
168  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
169  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
170
171  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
172  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
173
174  setOperationAction(ISD::FNEG, MVT::f32, Legal);
175  setOperationAction(ISD::FNEG, MVT::f64, Legal);
176
177  setOperationAction(ISD::FRINT, MVT::f32, Legal);
178  setOperationAction(ISD::FRINT, MVT::f64, Legal);
179
180  setOperationAction(ISD::FSQRT, MVT::f32, Legal);
181  setOperationAction(ISD::FSQRT, MVT::f64, Legal);
182
183  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
184  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
185
186  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
187  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
188  setOperationAction(ISD::ConstantFP, MVT::f128, Legal);
189
190  // Illegal floating-point operations.
191  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
192  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
193
194  setOperationAction(ISD::FCOS, MVT::f32, Expand);
195  setOperationAction(ISD::FCOS, MVT::f64, Expand);
196
197  setOperationAction(ISD::FEXP, MVT::f32, Expand);
198  setOperationAction(ISD::FEXP, MVT::f64, Expand);
199
200  setOperationAction(ISD::FEXP2, MVT::f32, Expand);
201  setOperationAction(ISD::FEXP2, MVT::f64, Expand);
202
203  setOperationAction(ISD::FLOG, MVT::f32, Expand);
204  setOperationAction(ISD::FLOG, MVT::f64, Expand);
205
206  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
207  setOperationAction(ISD::FLOG2, MVT::f64, Expand);
208
209  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
210  setOperationAction(ISD::FLOG10, MVT::f64, Expand);
211
212  setOperationAction(ISD::FPOW, MVT::f32, Expand);
213  setOperationAction(ISD::FPOW, MVT::f64, Expand);
214
215  setOperationAction(ISD::FPOWI, MVT::f32, Expand);
216  setOperationAction(ISD::FPOWI, MVT::f64, Expand);
217
218  setOperationAction(ISD::FREM, MVT::f32, Expand);
219  setOperationAction(ISD::FREM, MVT::f64, Expand);
220
221  setOperationAction(ISD::FSIN, MVT::f32, Expand);
222  setOperationAction(ISD::FSIN, MVT::f64, Expand);
223
224  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
225  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
226
227  // Virtually no operation on f128 is legal, but LLVM can't expand them when
228  // there's a valid register class, so we need custom operations in most cases.
229  setOperationAction(ISD::FABS,       MVT::f128, Expand);
230  setOperationAction(ISD::FADD,       MVT::f128, Custom);
231  setOperationAction(ISD::FCOPYSIGN,  MVT::f128, Expand);
232  setOperationAction(ISD::FCOS,       MVT::f128, Expand);
233  setOperationAction(ISD::FDIV,       MVT::f128, Custom);
234  setOperationAction(ISD::FMA,        MVT::f128, Expand);
235  setOperationAction(ISD::FMUL,       MVT::f128, Custom);
236  setOperationAction(ISD::FNEG,       MVT::f128, Expand);
237  setOperationAction(ISD::FP_EXTEND,  MVT::f128, Expand);
238  setOperationAction(ISD::FP_ROUND,   MVT::f128, Expand);
239  setOperationAction(ISD::FPOW,       MVT::f128, Expand);
240  setOperationAction(ISD::FREM,       MVT::f128, Expand);
241  setOperationAction(ISD::FRINT,      MVT::f128, Expand);
242  setOperationAction(ISD::FSIN,       MVT::f128, Expand);
243  setOperationAction(ISD::FSINCOS,    MVT::f128, Expand);
244  setOperationAction(ISD::FSQRT,      MVT::f128, Expand);
245  setOperationAction(ISD::FSUB,       MVT::f128, Custom);
246  setOperationAction(ISD::FTRUNC,     MVT::f128, Expand);
247  setOperationAction(ISD::SETCC,      MVT::f128, Custom);
248  setOperationAction(ISD::BR_CC,      MVT::f128, Custom);
249  setOperationAction(ISD::SELECT,     MVT::f128, Expand);
250  setOperationAction(ISD::SELECT_CC,  MVT::f128, Custom);
251  setOperationAction(ISD::FP_EXTEND,  MVT::f128, Custom);
252
253  // Lowering for many of the conversions is actually specified by the non-f128
254  // type. The LowerXXX function will be trivial when f128 isn't involved.
255  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
256  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
257  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
258  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
259  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
260  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
261  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
262  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
263  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
264  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
265  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
266  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
267  setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
268  setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
269
270  // This prevents LLVM trying to compress double constants into a floating
271  // constant-pool entry and trying to load from there. It's of doubtful benefit
272  // for A64: we'd need LDR followed by FCVT, I believe.
273  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
274  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
275  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
276
277  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
278  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
279  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
280  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
281  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
282  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
283
284  setExceptionPointerRegister(AArch64::X0);
285  setExceptionSelectorRegister(AArch64::X1);
286
287  if (Subtarget->hasNEON()) {
288    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
289    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
290    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
291    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom);
292    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
293    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
294    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom);
295    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
296    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
297    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
298    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
299    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f32, Custom);
300    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
301    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
302    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
303    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
304
305    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
306    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
307    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
308    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
309    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
310    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
311    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
312    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
313    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
314    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
315    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
316    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
317
318    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
319    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
320    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
321    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
325    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
326    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
327
328    setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
329    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
330    setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
331    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
332    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
333    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
334    setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
335    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
336    setOperationAction(ISD::SETCC, MVT::v1f32, Custom);
337    setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
338    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
339    setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
340    setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
341
342    setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
343    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
344    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
345
346    setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
347    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
348    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
349
350    setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
351    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
352    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
353
354    setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
355    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
356    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
357
358    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal);
359    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
360    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
361
362    setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
363    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
364    setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
365  }
366}
367
368EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
369  // It's reasonably important that this value matches the "natural" legal
370  // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
371  // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
372  if (!VT.isVector()) return MVT::i32;
373  return VT.changeVectorElementTypeToInteger();
374}
375
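// Select the exclusive load/store opcodes for an atomic operation of the given
// size and ordering. For illustration: Size == 4 with Acquire ordering picks
// LDAXR_word for the load but the plain STXR_word for the store, since only
// Release, AcquireRelease and SequentiallyConsistent orderings need the
// store-release form.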
376static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
377                                  unsigned &LdrOpc,
378                                  unsigned &StrOpc) {
379  static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword,
380                                       AArch64::LDXR_word, AArch64::LDXR_dword};
381  static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword,
382                                     AArch64::LDAXR_word, AArch64::LDAXR_dword};
383  static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword,
384                                       AArch64::STXR_word, AArch64::STXR_dword};
385  static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword,
386                                     AArch64::STLXR_word, AArch64::STLXR_dword};
387
388  const unsigned *LoadOps, *StoreOps;
389  if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
390    LoadOps = LoadAcqs;
391  else
392    LoadOps = LoadBares;
393
394  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
395    StoreOps = StoreRels;
396  else
397    StoreOps = StoreBares;
398
399  assert(isPowerOf2_32(Size) && Size <= 8 &&
400         "unsupported size for atomic binary op!");
401
402  LdrOpc = LoadOps[Log2_32(Size)];
403  StrOpc = StoreOps[Log2_32(Size)];
404}
405
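// Expand one of the pseudo ATOMIC_LOAD_* / ATOMIC_SWAP instructions into the
// canonical load-exclusive / modify / store-exclusive retry loop shown in the
// block comments below.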
406MachineBasicBlock *
407AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
408                                        unsigned Size,
409                                        unsigned BinOpcode) const {
410  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
411  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
412
413  const BasicBlock *LLVM_BB = BB->getBasicBlock();
414  MachineFunction *MF = BB->getParent();
415  MachineFunction::iterator It = BB;
416  ++It;
417
418  unsigned dest = MI->getOperand(0).getReg();
419  unsigned ptr = MI->getOperand(1).getReg();
420  unsigned incr = MI->getOperand(2).getReg();
421  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
422  DebugLoc dl = MI->getDebugLoc();
423
424  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
425
426  unsigned ldrOpc, strOpc;
427  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
428
429  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
430  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
431  MF->insert(It, loopMBB);
432  MF->insert(It, exitMBB);
433
434  // Transfer the remainder of BB and its successor edges to exitMBB.
435  exitMBB->splice(exitMBB->begin(), BB,
436                  llvm::next(MachineBasicBlock::iterator(MI)),
437                  BB->end());
438  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
439
440  const TargetRegisterClass *TRC
441    = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
442  unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
443
444  //  thisMBB:
445  //   ...
446  //   fallthrough --> loopMBB
447  BB->addSuccessor(loopMBB);
448
449  //  loopMBB:
450  //   ldxr dest, ptr
451  //   <binop> scratch, dest, incr
452  //   stxr stxr_status, scratch, ptr
453  //   cbnz stxr_status, loopMBB
454  //   fallthrough --> exitMBB
455  BB = loopMBB;
456  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
457  if (BinOpcode) {
458    // All arithmetic operations we'll be creating are designed to take an extra
459    // shift or extend operand, which we can conveniently set to zero.
460
461    // Operand order needs to go the other way for NAND.
462    if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl)
463      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
464        .addReg(incr).addReg(dest).addImm(0);
465    else
466      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
467        .addReg(dest).addReg(incr).addImm(0);
468  }
469
470  // From the stxr, the register is GPR32; from the cmp it's GPR32wsp
471  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
472  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
473
474  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
475  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
476    .addReg(stxr_status).addMBB(loopMBB);
477
478  BB->addSuccessor(loopMBB);
479  BB->addSuccessor(exitMBB);
480
481  //  exitMBB:
482  //   ...
483  BB = exitMBB;
484
485  MI->eraseFromParent();   // The instruction is gone now.
486
487  return BB;
488}
489
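// Expand an ATOMIC_LOAD_{MIN,MAX,UMIN,UMAX} pseudo. CmpOp compares the
// incoming value against the loaded one, and Cond is the condition under which
// the *loaded* value is kept: e.g. ATOMIC_LOAD_MIN_I32 uses a plain CMP with
// GT, so the CSEL below keeps the loaded value whenever incr is (signed)
// greater, i.e. it stores min(dest, incr).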
490MachineBasicBlock *
491AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
492                                              MachineBasicBlock *BB,
493                                              unsigned Size,
494                                              unsigned CmpOp,
495                                              A64CC::CondCodes Cond) const {
496  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
497
498  const BasicBlock *LLVM_BB = BB->getBasicBlock();
499  MachineFunction *MF = BB->getParent();
500  MachineFunction::iterator It = BB;
501  ++It;
502
503  unsigned dest = MI->getOperand(0).getReg();
504  unsigned ptr = MI->getOperand(1).getReg();
505  unsigned incr = MI->getOperand(2).getReg();
506  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
507
508  unsigned oldval = dest;
509  DebugLoc dl = MI->getDebugLoc();
510
511  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
512  const TargetRegisterClass *TRC, *TRCsp;
513  if (Size == 8) {
514    TRC = &AArch64::GPR64RegClass;
515    TRCsp = &AArch64::GPR64xspRegClass;
516  } else {
517    TRC = &AArch64::GPR32RegClass;
518    TRCsp = &AArch64::GPR32wspRegClass;
519  }
520
521  unsigned ldrOpc, strOpc;
522  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
523
524  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
525  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
526  MF->insert(It, loopMBB);
527  MF->insert(It, exitMBB);
528
529  // Transfer the remainder of BB and its successor edges to exitMBB.
530  exitMBB->splice(exitMBB->begin(), BB,
531                  llvm::next(MachineBasicBlock::iterator(MI)),
532                  BB->end());
533  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
534
535  unsigned scratch = MRI.createVirtualRegister(TRC);
536  MRI.constrainRegClass(scratch, TRCsp);
537
538  //  thisMBB:
539  //   ...
540  //   fallthrough --> loopMBB
541  BB->addSuccessor(loopMBB);
542
543  //  loopMBB:
544  //   ldxr dest, ptr
545  //   cmp incr, dest (, sign extend if necessary)
546  //   csel scratch, dest, incr, cond
547  //   stxr stxr_status, scratch, ptr
548  //   cbnz stxr_status, loopMBB
549  //   fallthrough --> exitMBB
550  BB = loopMBB;
551  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
552
553  // Build the compare and conditional-select (CSEL) instructions.
554  MRI.constrainRegClass(incr, TRCsp);
555  BuildMI(BB, dl, TII->get(CmpOp))
556    .addReg(incr).addReg(oldval).addImm(0);
557
558  BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc),
559          scratch)
560    .addReg(oldval).addReg(incr).addImm(Cond);
561
562  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
563  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
564
565  BuildMI(BB, dl, TII->get(strOpc), stxr_status)
566    .addReg(scratch).addReg(ptr);
567  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
568    .addReg(stxr_status).addMBB(loopMBB);
569
570  BB->addSuccessor(loopMBB);
571  BB->addSuccessor(exitMBB);
572
573  //  exitMBB:
574  //   ...
575  BB = exitMBB;
576
577  MI->eraseFromParent();   // The instruction is gone now.
578
579  return BB;
580}
581
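// Expand an ATOMIC_CMP_SWAP pseudo into the usual two-block loop: a
// load-exclusive followed by a compare that bails out to the exit block on a
// mismatch, then a store-exclusive whose failure (non-zero status) restarts
// the whole sequence from the load.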
582MachineBasicBlock *
583AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
584                                         MachineBasicBlock *BB,
585                                         unsigned Size) const {
586  unsigned dest    = MI->getOperand(0).getReg();
587  unsigned ptr     = MI->getOperand(1).getReg();
588  unsigned oldval  = MI->getOperand(2).getReg();
589  unsigned newval  = MI->getOperand(3).getReg();
590  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
591  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
592  DebugLoc dl = MI->getDebugLoc();
593
594  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
595  const TargetRegisterClass *TRCsp;
596  TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;
597
598  unsigned ldrOpc, strOpc;
599  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
600
601  MachineFunction *MF = BB->getParent();
602  const BasicBlock *LLVM_BB = BB->getBasicBlock();
603  MachineFunction::iterator It = BB;
604  ++It; // insert the new blocks after the current block
605
606  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
607  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
608  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
609  MF->insert(It, loop1MBB);
610  MF->insert(It, loop2MBB);
611  MF->insert(It, exitMBB);
612
613  // Transfer the remainder of BB and its successor edges to exitMBB.
614  exitMBB->splice(exitMBB->begin(), BB,
615                  llvm::next(MachineBasicBlock::iterator(MI)),
616                  BB->end());
617  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
618
619  //  thisMBB:
620  //   ...
621  //   fallthrough --> loop1MBB
622  BB->addSuccessor(loop1MBB);
623
624  // loop1MBB:
625  //   ldxr dest, [ptr]
626  //   cmp dest, oldval
627  //   b.ne exitMBB
628  BB = loop1MBB;
629  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
630
631  unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
632  MRI.constrainRegClass(dest, TRCsp);
633  BuildMI(BB, dl, TII->get(CmpOp))
634    .addReg(dest).addReg(oldval).addImm(0);
635  BuildMI(BB, dl, TII->get(AArch64::Bcc))
636    .addImm(A64CC::NE).addMBB(exitMBB);
637  BB->addSuccessor(loop2MBB);
638  BB->addSuccessor(exitMBB);
639
640  // loop2MBB:
641  //   stxr stxr_status, newval, [ptr]
642  //   cbnz stxr_status, loop1MBB
643  BB = loop2MBB;
644  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
645  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
646
647  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
648  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
649    .addReg(stxr_status).addMBB(loop1MBB);
650  BB->addSuccessor(loop1MBB);
651  BB->addSuccessor(exitMBB);
652
653  //  exitMBB:
654  //   ...
655  BB = exitMBB;
656
657  MI->eraseFromParent();   // The instruction is gone now.
658
659  return BB;
660}
661
662MachineBasicBlock *
663AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
664                                    MachineBasicBlock *MBB) const {
665  // We materialise the F128CSEL pseudo-instruction using conditional branches
666  // and loads, giving an instruction sequence like:
667  //     str q0, [sp]
668  //     b.ne IfTrue
669  //     b Finish
670  // IfTrue:
671  //     str q1, [sp]
672  // Finish:
673  //     ldr q0, [sp]
674  //
675  // Using virtual registers would probably not be beneficial since COPY
676  // instructions are expensive for f128 (there's no actual instruction to
677  // implement them).
678  //
679  // An alternative would be to do an integer-CSEL on some address. E.g.:
680  //     mov x0, sp
681  //     add x1, sp, #16
682  //     str q0, [x0]
683  //     str q1, [x1]
684  //     csel x0, x0, x1, ne
685  //     ldr q0, [x0]
686  //
687  // It's unclear which approach is actually optimal.
688  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
689  MachineFunction *MF = MBB->getParent();
690  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
691  DebugLoc DL = MI->getDebugLoc();
692  MachineFunction::iterator It = MBB;
693  ++It;
694
695  unsigned DestReg = MI->getOperand(0).getReg();
696  unsigned IfTrueReg = MI->getOperand(1).getReg();
697  unsigned IfFalseReg = MI->getOperand(2).getReg();
698  unsigned CondCode = MI->getOperand(3).getImm();
699  bool NZCVKilled = MI->getOperand(4).isKill();
700
701  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
702  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
703  MF->insert(It, TrueBB);
704  MF->insert(It, EndBB);
705
706  // Transfer rest of current basic-block to EndBB
707  EndBB->splice(EndBB->begin(), MBB,
708                llvm::next(MachineBasicBlock::iterator(MI)),
709                MBB->end());
710  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
711
712  // We need somewhere to store the f128 value needed.
713  int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16);
714
715  //     [... start of incoming MBB ...]
716  //     str qIFFALSE, [sp]
717  //     b.cc IfTrue
718  //     b Done
719  BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR))
720    .addReg(IfFalseReg)
721    .addFrameIndex(ScratchFI)
722    .addImm(0);
723  BuildMI(MBB, DL, TII->get(AArch64::Bcc))
724    .addImm(CondCode)
725    .addMBB(TrueBB);
726  BuildMI(MBB, DL, TII->get(AArch64::Bimm))
727    .addMBB(EndBB);
728  MBB->addSuccessor(TrueBB);
729  MBB->addSuccessor(EndBB);
730
731  if (!NZCVKilled) {
732    // NZCV is live-through TrueBB.
733    TrueBB->addLiveIn(AArch64::NZCV);
734    EndBB->addLiveIn(AArch64::NZCV);
735  }
736
737  // IfTrue:
738  //     str qIFTRUE, [sp]
739  BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
740    .addReg(IfTrueReg)
741    .addFrameIndex(ScratchFI)
742    .addImm(0);
743
744  // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the
745  // blocks.
746  TrueBB->addSuccessor(EndBB);
747
748  // Done:
749  //     ldr qDEST, [sp]
750  //     [... rest of incoming MBB ...]
751  MachineInstr *StartOfEnd = EndBB->begin();
752  BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
753    .addFrameIndex(ScratchFI)
754    .addImm(0);
755
756  MI->eraseFromParent();
757  return EndBB;
758}
759
760MachineBasicBlock *
761AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
762                                                 MachineBasicBlock *MBB) const {
763  switch (MI->getOpcode()) {
764  default: llvm_unreachable("Unhandled instruction with custom inserter");
765  case AArch64::F128CSEL:
766    return EmitF128CSEL(MI, MBB);
767  case AArch64::ATOMIC_LOAD_ADD_I8:
768    return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl);
769  case AArch64::ATOMIC_LOAD_ADD_I16:
770    return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl);
771  case AArch64::ATOMIC_LOAD_ADD_I32:
772    return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl);
773  case AArch64::ATOMIC_LOAD_ADD_I64:
774    return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl);
775
776  case AArch64::ATOMIC_LOAD_SUB_I8:
777    return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl);
778  case AArch64::ATOMIC_LOAD_SUB_I16:
779    return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl);
780  case AArch64::ATOMIC_LOAD_SUB_I32:
781    return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl);
782  case AArch64::ATOMIC_LOAD_SUB_I64:
783    return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl);
784
785  case AArch64::ATOMIC_LOAD_AND_I8:
786    return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl);
787  case AArch64::ATOMIC_LOAD_AND_I16:
788    return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl);
789  case AArch64::ATOMIC_LOAD_AND_I32:
790    return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl);
791  case AArch64::ATOMIC_LOAD_AND_I64:
792    return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl);
793
794  case AArch64::ATOMIC_LOAD_OR_I8:
795    return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl);
796  case AArch64::ATOMIC_LOAD_OR_I16:
797    return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl);
798  case AArch64::ATOMIC_LOAD_OR_I32:
799    return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl);
800  case AArch64::ATOMIC_LOAD_OR_I64:
801    return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl);
802
803  case AArch64::ATOMIC_LOAD_XOR_I8:
804    return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl);
805  case AArch64::ATOMIC_LOAD_XOR_I16:
806    return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl);
807  case AArch64::ATOMIC_LOAD_XOR_I32:
808    return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl);
809  case AArch64::ATOMIC_LOAD_XOR_I64:
810    return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl);
811
812  case AArch64::ATOMIC_LOAD_NAND_I8:
813    return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl);
814  case AArch64::ATOMIC_LOAD_NAND_I16:
815    return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl);
816  case AArch64::ATOMIC_LOAD_NAND_I32:
817    return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl);
818  case AArch64::ATOMIC_LOAD_NAND_I64:
819    return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl);
820
821  case AArch64::ATOMIC_LOAD_MIN_I8:
822    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT);
823  case AArch64::ATOMIC_LOAD_MIN_I16:
824    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT);
825  case AArch64::ATOMIC_LOAD_MIN_I32:
826    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT);
827  case AArch64::ATOMIC_LOAD_MIN_I64:
828    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT);
829
830  case AArch64::ATOMIC_LOAD_MAX_I8:
831    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT);
832  case AArch64::ATOMIC_LOAD_MAX_I16:
833    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT);
834  case AArch64::ATOMIC_LOAD_MAX_I32:
835    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT);
836  case AArch64::ATOMIC_LOAD_MAX_I64:
837    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT);
838
839  case AArch64::ATOMIC_LOAD_UMIN_I8:
840    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI);
841  case AArch64::ATOMIC_LOAD_UMIN_I16:
842    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI);
843  case AArch64::ATOMIC_LOAD_UMIN_I32:
844    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI);
845  case AArch64::ATOMIC_LOAD_UMIN_I64:
846    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI);
847
848  case AArch64::ATOMIC_LOAD_UMAX_I8:
849    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO);
850  case AArch64::ATOMIC_LOAD_UMAX_I16:
851    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO);
852  case AArch64::ATOMIC_LOAD_UMAX_I32:
853    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO);
854  case AArch64::ATOMIC_LOAD_UMAX_I64:
855    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO);
856
857  case AArch64::ATOMIC_SWAP_I8:
858    return emitAtomicBinary(MI, MBB, 1, 0);
859  case AArch64::ATOMIC_SWAP_I16:
860    return emitAtomicBinary(MI, MBB, 2, 0);
861  case AArch64::ATOMIC_SWAP_I32:
862    return emitAtomicBinary(MI, MBB, 4, 0);
863  case AArch64::ATOMIC_SWAP_I64:
864    return emitAtomicBinary(MI, MBB, 8, 0);
865
866  case AArch64::ATOMIC_CMP_SWAP_I8:
867    return emitAtomicCmpSwap(MI, MBB, 1);
868  case AArch64::ATOMIC_CMP_SWAP_I16:
869    return emitAtomicCmpSwap(MI, MBB, 2);
870  case AArch64::ATOMIC_CMP_SWAP_I32:
871    return emitAtomicCmpSwap(MI, MBB, 4);
872  case AArch64::ATOMIC_CMP_SWAP_I64:
873    return emitAtomicCmpSwap(MI, MBB, 8);
874  }
875}
876
877
878const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
879  switch (Opcode) {
880  case AArch64ISD::BR_CC:          return "AArch64ISD::BR_CC";
881  case AArch64ISD::Call:           return "AArch64ISD::Call";
882  case AArch64ISD::FPMOV:          return "AArch64ISD::FPMOV";
883  case AArch64ISD::GOTLoad:        return "AArch64ISD::GOTLoad";
884  case AArch64ISD::BFI:            return "AArch64ISD::BFI";
885  case AArch64ISD::EXTR:           return "AArch64ISD::EXTR";
886  case AArch64ISD::Ret:            return "AArch64ISD::Ret";
887  case AArch64ISD::SBFX:           return "AArch64ISD::SBFX";
888  case AArch64ISD::SELECT_CC:      return "AArch64ISD::SELECT_CC";
889  case AArch64ISD::SETCC:          return "AArch64ISD::SETCC";
890  case AArch64ISD::TC_RETURN:      return "AArch64ISD::TC_RETURN";
891  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
892  case AArch64ISD::TLSDESCCALL:    return "AArch64ISD::TLSDESCCALL";
893  case AArch64ISD::WrapperLarge:   return "AArch64ISD::WrapperLarge";
894  case AArch64ISD::WrapperSmall:   return "AArch64ISD::WrapperSmall";
895
896  case AArch64ISD::NEON_BSL:
897    return "AArch64ISD::NEON_BSL";
898  case AArch64ISD::NEON_MOVIMM:
899    return "AArch64ISD::NEON_MOVIMM";
900  case AArch64ISD::NEON_MVNIMM:
901    return "AArch64ISD::NEON_MVNIMM";
902  case AArch64ISD::NEON_FMOVIMM:
903    return "AArch64ISD::NEON_FMOVIMM";
904  case AArch64ISD::NEON_CMP:
905    return "AArch64ISD::NEON_CMP";
906  case AArch64ISD::NEON_CMPZ:
907    return "AArch64ISD::NEON_CMPZ";
908  case AArch64ISD::NEON_TST:
909    return "AArch64ISD::NEON_TST";
910  case AArch64ISD::NEON_QSHLs:
911    return "AArch64ISD::NEON_QSHLs";
912  case AArch64ISD::NEON_QSHLu:
913    return "AArch64ISD::NEON_QSHLu";
914  case AArch64ISD::NEON_VDUP:
915    return "AArch64ISD::NEON_VDUP";
916  case AArch64ISD::NEON_VDUPLANE:
917    return "AArch64ISD::NEON_VDUPLANE";
918  case AArch64ISD::NEON_REV16:
919    return "AArch64ISD::NEON_REV16";
920  case AArch64ISD::NEON_REV32:
921    return "AArch64ISD::NEON_REV32";
922  case AArch64ISD::NEON_REV64:
923    return "AArch64ISD::NEON_REV64";
924  case AArch64ISD::NEON_LD1_UPD:
925    return "AArch64ISD::NEON_LD1_UPD";
926  case AArch64ISD::NEON_LD2_UPD:
927    return "AArch64ISD::NEON_LD2_UPD";
928  case AArch64ISD::NEON_LD3_UPD:
929    return "AArch64ISD::NEON_LD3_UPD";
930  case AArch64ISD::NEON_LD4_UPD:
931    return "AArch64ISD::NEON_LD4_UPD";
932  case AArch64ISD::NEON_ST1_UPD:
933    return "AArch64ISD::NEON_ST1_UPD";
934  case AArch64ISD::NEON_ST2_UPD:
935    return "AArch64ISD::NEON_ST2_UPD";
936  case AArch64ISD::NEON_ST3_UPD:
937    return "AArch64ISD::NEON_ST3_UPD";
938  case AArch64ISD::NEON_ST4_UPD:
939    return "AArch64ISD::NEON_ST4_UPD";
940  case AArch64ISD::NEON_LD1x2_UPD:
941    return "AArch64ISD::NEON_LD1x2_UPD";
942  case AArch64ISD::NEON_LD1x3_UPD:
943    return "AArch64ISD::NEON_LD1x3_UPD";
944  case AArch64ISD::NEON_LD1x4_UPD:
945    return "AArch64ISD::NEON_LD1x4_UPD";
946  case AArch64ISD::NEON_ST1x2_UPD:
947    return "AArch64ISD::NEON_ST1x2_UPD";
948  case AArch64ISD::NEON_ST1x3_UPD:
949    return "AArch64ISD::NEON_ST1x3_UPD";
950  case AArch64ISD::NEON_ST1x4_UPD:
951    return "AArch64ISD::NEON_ST1x4_UPD";
952  case AArch64ISD::NEON_LD2DUP:
953    return "AArch64ISD::NEON_LD2DUP";
954  case AArch64ISD::NEON_LD3DUP:
955    return "AArch64ISD::NEON_LD3DUP";
956  case AArch64ISD::NEON_LD4DUP:
957    return "AArch64ISD::NEON_LD4DUP";
958  case AArch64ISD::NEON_LD2DUP_UPD:
959    return "AArch64ISD::NEON_LD2DUP_UPD";
960  case AArch64ISD::NEON_LD3DUP_UPD:
961    return "AArch64ISD::NEON_LD3DUP_UPD";
962  case AArch64ISD::NEON_LD4DUP_UPD:
963    return "AArch64ISD::NEON_LD4DUP_UPD";
964  case AArch64ISD::NEON_LD2LN_UPD:
965    return "AArch64ISD::NEON_LD2LN_UPD";
966  case AArch64ISD::NEON_LD3LN_UPD:
967    return "AArch64ISD::NEON_LD3LN_UPD";
968  case AArch64ISD::NEON_LD4LN_UPD:
969    return "AArch64ISD::NEON_LD4LN_UPD";
970  case AArch64ISD::NEON_ST2LN_UPD:
971    return "AArch64ISD::NEON_ST2LN_UPD";
972  case AArch64ISD::NEON_ST3LN_UPD:
973    return "AArch64ISD::NEON_ST3LN_UPD";
974  case AArch64ISD::NEON_ST4LN_UPD:
975    return "AArch64ISD::NEON_ST4LN_UPD";
976  case AArch64ISD::NEON_VEXTRACT:
977    return "AArch64ISD::NEON_VEXTRACT";
978  default:
979    return NULL;
980  }
981}
982
983static const uint16_t AArch64FPRArgRegs[] = {
984  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
985  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7
986};
987static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs);
988
989static const uint16_t AArch64ArgRegs[] = {
990  AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
991  AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7
992};
993static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs);
994
995static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
996                                 CCValAssign::LocInfo LocInfo,
997                                 ISD::ArgFlagsTy ArgFlags, CCState &State) {
998  // Mark all remaining general purpose registers as allocated. We don't
999  // backtrack: if (for example) an i128 gets put on the stack, no subsequent
1000  // i64 will go in registers (C.11).
1001  for (unsigned i = 0; i < NumArgRegs; ++i)
1002    State.AllocateReg(AArch64ArgRegs[i]);
1003
1004  return false;
1005}
1006
1007#include "AArch64GenCallingConv.inc"
1008
1009CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1010
1011  switch(CC) {
1012  default: llvm_unreachable("Unsupported calling convention");
1013  case CallingConv::Fast:
1014  case CallingConv::C:
1015    return CC_A64_APCS;
1016  }
1017}
1018
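// Spill the argument registers that weren't used for fixed arguments so that
// va_arg can later find them in memory. Roughly, a function with two fixed i64
// parameters and one fixed double parameter saves X2-X7 into a 48-byte GPR
// area and (when FP/SIMD is available) Q1-Q7 into a 112-byte FPR area; a
// register class that is fully consumed by fixed arguments gets no save area.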
1019void
1020AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
1021                                           SDLoc DL, SDValue &Chain) const {
1022  MachineFunction &MF = DAG.getMachineFunction();
1023  MachineFrameInfo *MFI = MF.getFrameInfo();
1024  AArch64MachineFunctionInfo *FuncInfo
1025    = MF.getInfo<AArch64MachineFunctionInfo>();
1026
1027  SmallVector<SDValue, 8> MemOps;
1028
1029  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs,
1030                                                         NumArgRegs);
1031  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs,
1032                                                         NumFPRArgRegs);
1033
1034  unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR);
1035  int GPRIdx = 0;
1036  if (GPRSaveSize != 0) {
1037    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
1038
1039    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
1040
1041    for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) {
1042      unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass);
1043      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
1044      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
1045                                   MachinePointerInfo::getStack(i * 8),
1046                                   false, false, 0);
1047      MemOps.push_back(Store);
1048      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1049                        DAG.getConstant(8, getPointerTy()));
1050    }
1051  }
1052
1053  if (getSubtarget()->hasFPARMv8()) {
1054    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
1055    int FPRIdx = 0;
1056    // According to the AArch64 Procedure Call Standard, section B.1/B.3, we
1057    // can omit a register save area if we know we'll never use registers of
1058    // that class.
1059    if (FPRSaveSize != 0) {
1060      FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
1061
1062      SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
1063
1064      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
1065        unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
1066            &AArch64::FPR128RegClass);
1067        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
1068        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
1069            MachinePointerInfo::getStack(i * 16),
1070            false, false, 0);
1071        MemOps.push_back(Store);
1072        FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1073            DAG.getConstant(16, getPointerTy()));
1074      }
1075    }
1076    FuncInfo->setVariadicFPRIdx(FPRIdx);
1077    FuncInfo->setVariadicFPRSize(FPRSaveSize);
1078  }
1079
1080  int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);
1081
1082  FuncInfo->setVariadicStackIdx(StackIdx);
1083  FuncInfo->setVariadicGPRIdx(GPRIdx);
1084  FuncInfo->setVariadicGPRSize(GPRSaveSize);
1085
1086  if (!MemOps.empty()) {
1087    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
1088                        MemOps.size());
1089  }
1090}
1091
1092
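// Lower the incoming formal arguments: byval aggregates become frame indices,
// register-assigned values are copied out of their physregs (extracting the
// relevant sub-register when the value was promoted), and stack-assigned
// values are loaded from fixed frame objects. Variadic functions additionally
// spill the unused argument registers via SaveVarArgRegisters.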
1093SDValue
1094AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
1095                                      CallingConv::ID CallConv, bool isVarArg,
1096                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1097                                      SDLoc dl, SelectionDAG &DAG,
1098                                      SmallVectorImpl<SDValue> &InVals) const {
1099  MachineFunction &MF = DAG.getMachineFunction();
1100  AArch64MachineFunctionInfo *FuncInfo
1101    = MF.getInfo<AArch64MachineFunctionInfo>();
1102  MachineFrameInfo *MFI = MF.getFrameInfo();
1103  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
1104
1105  SmallVector<CCValAssign, 16> ArgLocs;
1106  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1107                 getTargetMachine(), ArgLocs, *DAG.getContext());
1108  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1109
1110  SmallVector<SDValue, 16> ArgValues;
1111
1112  SDValue ArgValue;
1113  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1114    CCValAssign &VA = ArgLocs[i];
1115    ISD::ArgFlagsTy Flags = Ins[i].Flags;
1116
1117    if (Flags.isByVal()) {
1118      // Byval is used for small structs and HFAs in the PCS, but the system
1119      // should work in a non-compliant manner for larger structs.
1120      EVT PtrTy = getPointerTy();
1121      int Size = Flags.getByValSize();
1122      unsigned NumRegs = (Size + 7) / 8;
1123
1124      unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
1125                                                 VA.getLocMemOffset(),
1126                                                 false);
1127      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
1128      InVals.push_back(FrameIdxN);
1129
1130      continue;
1131    } else if (VA.isRegLoc()) {
1132      MVT RegVT = VA.getLocVT();
1133      const TargetRegisterClass *RC = getRegClassFor(RegVT);
1134      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1135
1136      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1137    } else { // VA.isMemLoc()
1138      assert(VA.isMemLoc());
1139
1140      int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
1141                                      VA.getLocMemOffset(), true);
1142
1143      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1144      ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
1145                             MachinePointerInfo::getFixedStack(FI),
1146                             false, false, false, 0);
1147
1148
1149    }
1150
1151    switch (VA.getLocInfo()) {
1152    default: llvm_unreachable("Unknown loc info!");
1153    case CCValAssign::Full: break;
1154    case CCValAssign::BCvt:
1155      ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue);
1156      break;
1157    case CCValAssign::SExt:
1158    case CCValAssign::ZExt:
1159    case CCValAssign::AExt: {
1160      unsigned DestSize = VA.getValVT().getSizeInBits();
1161      unsigned DestSubReg;
1162
1163      switch (DestSize) {
1164      case 8: DestSubReg = AArch64::sub_8; break;
1165      case 16: DestSubReg = AArch64::sub_16; break;
1166      case 32: DestSubReg = AArch64::sub_32; break;
1167      case 64: DestSubReg = AArch64::sub_64; break;
1168      default: llvm_unreachable("Unexpected argument promotion");
1169      }
1170
1171      ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
1172                                   VA.getValVT(), ArgValue,
1173                                   DAG.getTargetConstant(DestSubReg, MVT::i32)),
1174                         0);
1175      break;
1176    }
1177    }
1178
1179    InVals.push_back(ArgValue);
1180  }
1181
1182  if (isVarArg)
1183    SaveVarArgRegisters(CCInfo, DAG, dl, Chain);
1184
1185  unsigned StackArgSize = CCInfo.getNextStackOffset();
1186  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
1187    // This is a non-standard ABI so by fiat I say we're allowed to make full
1188    // use of the stack area to be popped, which must be aligned to 16 bytes in
1189    // any case:
1190    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
1191
1192    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
1193    // a multiple of 16.
1194    FuncInfo->setArgumentStackToRestore(StackArgSize);
1195
1196    // This realignment carries over to the available bytes below. Our own
1197    // callers will guarantee the space is free by giving an aligned value to
1198    // CALLSEQ_START.
1199  }
1200  // Even if we're not expected to free up the space, it's useful to know how
1201  // much is there while considering tail calls (because we can reuse it).
1202  FuncInfo->setBytesInStackArgArea(StackArgSize);
1203
1204  return Chain;
1205}
1206
1207SDValue
1208AArch64TargetLowering::LowerReturn(SDValue Chain,
1209                                   CallingConv::ID CallConv, bool isVarArg,
1210                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
1211                                   const SmallVectorImpl<SDValue> &OutVals,
1212                                   SDLoc dl, SelectionDAG &DAG) const {
1213  // CCValAssign - represent the assignment of the return value to a location.
1214  SmallVector<CCValAssign, 16> RVLocs;
1215
1216  // CCState - Info about the registers and stack slots.
1217  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1218                 getTargetMachine(), RVLocs, *DAG.getContext());
1219
1220  // Analyze outgoing return values.
1221  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));
1222
1223  SDValue Flag;
1224  SmallVector<SDValue, 4> RetOps(1, Chain);
1225
1226  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
1227    // PCS: "If the type, T, of the result of a function is such that
1228    // void func(T arg) would require that arg be passed as a value in a
1229    // register (or set of registers) according to the rules in 5.4, then the
1230    // result is returned in the same registers as would be used for such an
1231    // argument.
1232    //
1233    // Otherwise, the caller shall reserve a block of memory of sufficient
1234    // size and alignment to hold the result. The address of the memory block
1235    // shall be passed as an additional argument to the function in x8."
1236    //
1237    // This is implemented in two places. The register-return values are dealt
1238    // with here, more complex returns are passed as an sret parameter, which
1239    // means we don't have to worry about it during actual return.
1240    CCValAssign &VA = RVLocs[i];
1241    assert(VA.isRegLoc() && "Only register-returns should be created by PCS");
1242
1243
1244    SDValue Arg = OutVals[i];
1245
1246    // There's no convenient note in the ABI about this as there is for normal
1247    // arguments, but it says return values are passed in the same registers as
1248    // an argument would be. I believe that includes the comments about
1249    // unspecified higher bits, putting the burden of widening on the *caller*
1250    // for return values.
1251    switch (VA.getLocInfo()) {
1252    default: llvm_unreachable("Unknown loc info");
1253    case CCValAssign::Full: break;
1254    case CCValAssign::SExt:
1255    case CCValAssign::ZExt:
1256    case CCValAssign::AExt:
1257      // Floating-point values should only be extended when they're going into
1258      // memory, which can't happen here so an integer extend is acceptable.
1259      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1260      break;
1261    case CCValAssign::BCvt:
1262      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1263      break;
1264    }
1265
1266    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
1267    Flag = Chain.getValue(1);
1268    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1269  }
1270
1271  RetOps[0] = Chain;  // Update chain.
1272
1273  // Add the flag if we have it.
1274  if (Flag.getNode())
1275    RetOps.push_back(Flag);
1276
1277  return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
1278                     &RetOps[0], RetOps.size());
1279}
1280
1281SDValue
1282AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
1283                                 SmallVectorImpl<SDValue> &InVals) const {
1284  SelectionDAG &DAG                     = CLI.DAG;
1285  SDLoc &dl                             = CLI.DL;
1286  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1287  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1288  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1289  SDValue Chain                         = CLI.Chain;
1290  SDValue Callee                        = CLI.Callee;
1291  bool &IsTailCall                      = CLI.IsTailCall;
1292  CallingConv::ID CallConv              = CLI.CallConv;
1293  bool IsVarArg                         = CLI.IsVarArg;
1294
1295  MachineFunction &MF = DAG.getMachineFunction();
1296  AArch64MachineFunctionInfo *FuncInfo
1297    = MF.getInfo<AArch64MachineFunctionInfo>();
1298  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
1299  bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
1300  bool IsSibCall = false;
1301
1302  if (IsTailCall) {
1303    IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1304                    IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1305                                                   Outs, OutVals, Ins, DAG);
1306
1307    // A sibling call is one where we're under the usual C ABI and not planning
1308    // to change that but can still do a tail call:
1309    if (!TailCallOpt && IsTailCall)
1310      IsSibCall = true;
1311  }
1312
1313  SmallVector<CCValAssign, 16> ArgLocs;
1314  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
1315                 getTargetMachine(), ArgLocs, *DAG.getContext());
1316  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1317
1318  // On AArch64 (and all other architectures I'm aware of) the most this has to
1319  // do is adjust the stack pointer.
1320  unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16);
1321  if (IsSibCall) {
1322    // Since we're not changing the ABI to make this a tail call, the memory
1323    // operands are already available in the caller's incoming argument space.
1324    NumBytes = 0;
1325  }
1326
1327  // FPDiff is the byte offset of the call's argument area from the callee's.
1328  // Stores to callee stack arguments will be placed in FixedStackSlots offset
1329  // by this amount for a tail call. In a sibling call it must be 0 because the
1330  // caller will deallocate the entire stack and the callee still expects its
1331  // arguments to begin at SP+0. Completely unused for non-tail calls.
1332  int FPDiff = 0;
1333
1334  if (IsTailCall && !IsSibCall) {
1335    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1336
1337    // FPDiff will be negative if this tail call requires more space than we
1338    // would automatically have in our incoming argument space. Positive if we
1339    // can actually shrink the stack.
1340    FPDiff = NumReusableBytes - NumBytes;
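    // Illustrative example (made-up numbers): with 32 bytes of reusable
    // incoming argument space and a tail call needing 48 bytes of outgoing
    // arguments, FPDiff would be 32 - 48 = -16.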
1341
1342    // The stack pointer must be 16-byte aligned at all times it's used for a
1343    // memory operation, which in practice means at *all* times and in
1344    // particular across call boundaries. Therefore our own arguments started at
1345    // a 16-byte aligned SP and the delta applied for the tail call should
1346    // satisfy the same constraint.
1347    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
1348  }
1349
1350  if (!IsSibCall)
1351    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
1352                                 dl);
1353
1354  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
1355                                        getPointerTy());
1356
1357  SmallVector<SDValue, 8> MemOpChains;
1358  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1359
1360  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1361    CCValAssign &VA = ArgLocs[i];
1362    ISD::ArgFlagsTy Flags = Outs[i].Flags;
1363    SDValue Arg = OutVals[i];
1364
1365    // Callee does the actual widening, so all extensions just use an implicit
1366    // definition of the rest of the Loc. Aesthetically, this would be nicer as
1367    // an ANY_EXTEND, but that isn't valid for floating-point types and this
1368    // alternative works on integer types too.
1369    switch (VA.getLocInfo()) {
1370    default: llvm_unreachable("Unknown loc info!");
1371    case CCValAssign::Full: break;
1372    case CCValAssign::SExt:
1373    case CCValAssign::ZExt:
1374    case CCValAssign::AExt: {
1375      unsigned SrcSize = VA.getValVT().getSizeInBits();
1376      unsigned SrcSubReg;
1377
1378      switch (SrcSize) {
1379      case 8: SrcSubReg = AArch64::sub_8; break;
1380      case 16: SrcSubReg = AArch64::sub_16; break;
1381      case 32: SrcSubReg = AArch64::sub_32; break;
1382      case 64: SrcSubReg = AArch64::sub_64; break;
1383      default: llvm_unreachable("Unexpected argument promotion");
1384      }
1385
1386      Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
1387                                    VA.getLocVT(),
1388                                    DAG.getUNDEF(VA.getLocVT()),
1389                                    Arg,
1390                                    DAG.getTargetConstant(SrcSubReg, MVT::i32)),
1391                    0);
1392
1393      break;
1394    }
1395    case CCValAssign::BCvt:
1396      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1397      break;
1398    }
1399
1400    if (VA.isRegLoc()) {
1401      // A normal register (sub-) argument. For now we just note it down because
1402      // we want to copy things into registers as late as possible to avoid
1403      // register-pressure (and possibly worse).
1404      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1405      continue;
1406    }
1407
1408    assert(VA.isMemLoc() && "unexpected argument location");
1409
1410    SDValue DstAddr;
1411    MachinePointerInfo DstInfo;
1412    if (IsTailCall) {
1413      uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() :
1414                                          VA.getLocVT().getSizeInBits();
1415      OpSize = (OpSize + 7) / 8;
1416      int32_t Offset = VA.getLocMemOffset() + FPDiff;
1417      int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
1418
1419      DstAddr = DAG.getFrameIndex(FI, getPointerTy());
1420      DstInfo = MachinePointerInfo::getFixedStack(FI);
1421
1422      // Make sure any stack arguments overlapping with where we're storing are
1423      // loaded before this eventual operation. Otherwise they'll be clobbered.
1424      Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
1425    } else {
1426      SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset());
1427
1428      DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1429      DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
1430    }
1431
1432    if (Flags.isByVal()) {
1433      SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64);
1434      SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode,
1435                                  Flags.getByValAlign(),
1436                                  /*isVolatile = */ false,
1437                                  /*alwaysInline = */ false,
1438                                  DstInfo, MachinePointerInfo(0));
1439      MemOpChains.push_back(Cpy);
1440    } else {
1441      // Normal stack argument, put it where it's needed.
1442      SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo,
1443                                   false, false, 0);
1444      MemOpChains.push_back(Store);
1445    }
1446  }
1447
1448  // The loads and stores generated above shouldn't clash with each
1449  // other. Combining them with this TokenFactor notes that fact for the rest of
1450  // the backend.
1451  if (!MemOpChains.empty())
1452    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1453                        &MemOpChains[0], MemOpChains.size());
1454
1455  // Most of the rest of the instructions need to be glued together; we don't
1456  // want assignments to actual registers used by a call to be rearranged by a
1457  // well-meaning scheduler.
1458  SDValue InFlag;
1459
1460  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1461    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1462                             RegsToPass[i].second, InFlag);
1463    InFlag = Chain.getValue(1);
1464  }
1465
1466  // The linker is responsible for inserting veneers when necessary to put a
1467  // function call destination in range, so we don't need to bother with a
1468  // wrapper here.
1469  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1470    const GlobalValue *GV = G->getGlobal();
1471    Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
1472  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1473    const char *Sym = S->getSymbol();
1474    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
1475  }
1476
1477  // We don't usually want to end the call-sequence here because we would tidy
1478  // the frame up *after* the call, however in the ABI-changing tail-call case
1479  // we've carefully laid out the parameters so that when sp is reset they'll be
1480  // in the correct location.
1481  if (IsTailCall && !IsSibCall) {
1482    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1483                               DAG.getIntPtrConstant(0, true), InFlag, dl);
1484    InFlag = Chain.getValue(1);
1485  }
1486
  // We produce the following DAG scheme for the actual call instruction:
  //     (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag?)
  //
  // Most of the operands don't generate code themselves; they just keep the
  // values live as far as LLVM is concerned. The node is expected to be
  // selected as simply "bl callee" (for a direct, non-tail call).
1493  std::vector<SDValue> Ops;
1494  Ops.push_back(Chain);
1495  Ops.push_back(Callee);
1496
1497  if (IsTailCall) {
1498    // Each tail call may have to adjust the stack by a different amount, so
1499    // this information must travel along with the operation for eventual
1500    // consumption by emitEpilogue.
1501    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
1502  }
1503
1504  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1505    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1506                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  // This is used later in codegen to constrain register allocation.
1511  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
1512  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
1513  assert(Mask && "Missing call preserved mask for calling convention");
1514  Ops.push_back(DAG.getRegisterMask(Mask));
1515
1516  // If we needed glue, put it in as the last argument.
1517  if (InFlag.getNode())
1518    Ops.push_back(InFlag);
1519
1520  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1521
1522  if (IsTailCall) {
1523    return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
1524  }
1525
1526  Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size());
1527  InFlag = Chain.getValue(1);
1528
  // Now we can reclaim the stack; we may as well do it before working out
  // where our return value is.
1531  if (!IsSibCall) {
1532    uint64_t CalleePopBytes
1533      = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0;
1534
1535    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1536                               DAG.getIntPtrConstant(CalleePopBytes, true),
1537                               InFlag, dl);
1538    InFlag = Chain.getValue(1);
1539  }
1540
1541  return LowerCallResult(Chain, InFlag, CallConv,
1542                         IsVarArg, Ins, dl, DAG, InVals);
1543}
1544
1545SDValue
1546AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1547                                      CallingConv::ID CallConv, bool IsVarArg,
1548                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1549                                      SDLoc dl, SelectionDAG &DAG,
1550                                      SmallVectorImpl<SDValue> &InVals) const {
1551  // Assign locations to each value returned by this call.
1552  SmallVector<CCValAssign, 16> RVLocs;
1553  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
1554                 getTargetMachine(), RVLocs, *DAG.getContext());
1555  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv));
1556
1557  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1558    CCValAssign VA = RVLocs[i];
1559
1560    // Return values that are too big to fit into registers should use an sret
1561    // pointer, so this can be a lot simpler than the main argument code.
1562    assert(VA.isRegLoc() && "Memory locations not expected for call return");
1563
1564    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1565                                     InFlag);
1566    Chain = Val.getValue(1);
1567    InFlag = Val.getValue(2);
1568
1569    switch (VA.getLocInfo()) {
1570    default: llvm_unreachable("Unknown loc info!");
1571    case CCValAssign::Full: break;
1572    case CCValAssign::BCvt:
1573      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1574      break;
1575    case CCValAssign::ZExt:
1576    case CCValAssign::SExt:
1577    case CCValAssign::AExt:
      // Floating-point arguments only get extended/truncated if they're going
      // into memory, so using the integer operation is acceptable here.
1580      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1581      break;
1582    }
1583
1584    InVals.push_back(Val);
1585  }
1586
1587  return Chain;
1588}
1589
1590bool
1591AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
1592                                    CallingConv::ID CalleeCC,
1593                                    bool IsVarArg,
1594                                    bool IsCalleeStructRet,
1595                                    bool IsCallerStructRet,
1596                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
1597                                    const SmallVectorImpl<SDValue> &OutVals,
1598                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1599                                    SelectionDAG& DAG) const {
1600
1601  // For CallingConv::C this function knows whether the ABI needs
1602  // changing. That's not true for other conventions so they will have to opt in
1603  // manually.
1604  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1605    return false;
1606
1607  const MachineFunction &MF = DAG.getMachineFunction();
1608  const Function *CallerF = MF.getFunction();
1609  CallingConv::ID CallerCC = CallerF->getCallingConv();
1610  bool CCMatch = CallerCC == CalleeCC;
1611
1612  // Byval parameters hand the function a pointer directly into the stack area
1613  // we want to reuse during a tail call. Working around this *is* possible (see
1614  // X86) but less efficient and uglier in LowerCall.
1615  for (Function::const_arg_iterator i = CallerF->arg_begin(),
1616         e = CallerF->arg_end(); i != e; ++i)
1617    if (i->hasByValAttr())
1618      return false;
1619
1620  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
1621    if (IsTailCallConvention(CalleeCC) && CCMatch)
1622      return true;
1623    return false;
1624  }
1625
1626  // Now we search for cases where we can use a tail call without changing the
1627  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
1628  // concept.
1629
1630  // I want anyone implementing a new calling convention to think long and hard
1631  // about this assert.
1632  assert((!IsVarArg || CalleeCC == CallingConv::C)
1633         && "Unexpected variadic calling convention");
1634
1635  if (IsVarArg && !Outs.empty()) {
1636    // At least two cases here: if caller is fastcc then we can't have any
1637    // memory arguments (we'd be expected to clean up the stack afterwards). If
1638    // caller is C then we could potentially use its argument area.
1639
1640    // FIXME: for now we take the most conservative of these in both cases:
1641    // disallow all variadic memory operands.
1642    SmallVector<CCValAssign, 16> ArgLocs;
1643    CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
1644                   getTargetMachine(), ArgLocs, *DAG.getContext());
1645
1646    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
1647    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
1648      if (!ArgLocs[i].isRegLoc())
1649        return false;
1650  }
1651
1652  // If the calling conventions do not match, then we'd better make sure the
1653  // results are returned in the same way as what the caller expects.
1654  if (!CCMatch) {
1655    SmallVector<CCValAssign, 16> RVLocs1;
1656    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
1657                    getTargetMachine(), RVLocs1, *DAG.getContext());
1658    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC));
1659
1660    SmallVector<CCValAssign, 16> RVLocs2;
1661    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
1662                    getTargetMachine(), RVLocs2, *DAG.getContext());
1663    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC));
1664
1665    if (RVLocs1.size() != RVLocs2.size())
1666      return false;
1667    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
1668      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
1669        return false;
1670      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
1671        return false;
1672      if (RVLocs1[i].isRegLoc()) {
1673        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
1674          return false;
1675      } else {
1676        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
1677          return false;
1678      }
1679    }
1680  }
1681
1682  // Nothing more to check if the callee is taking no arguments
1683  if (Outs.empty())
1684    return true;
1685
1686  SmallVector<CCValAssign, 16> ArgLocs;
1687  CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
1688                 getTargetMachine(), ArgLocs, *DAG.getContext());
1689
1690  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
1691
1692  const AArch64MachineFunctionInfo *FuncInfo
1693    = MF.getInfo<AArch64MachineFunctionInfo>();
1694
1695  // If the stack arguments for this call would fit into our own save area then
1696  // the call can be made tail.
1697  return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
1698}
1699
1700bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
1701                                                   bool TailCallOpt) const {
1702  return CallCC == CallingConv::Fast && TailCallOpt;
1703}
1704
1705bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
1706  return CallCC == CallingConv::Fast;
1707}
1708
1709SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
1710                                                   SelectionDAG &DAG,
1711                                                   MachineFrameInfo *MFI,
1712                                                   int ClobberedFI) const {
1713  SmallVector<SDValue, 8> ArgChains;
1714  int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
1715  int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
1716
  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, it helps legalization find the
  // CALLSEQ_START node.
1720  ArgChains.push_back(Chain);
1721
  // Add a chain value for each incoming stack argument whose slot overlaps
  // the one we are about to clobber.
1723  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1724         UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U)
1725    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
1726      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
1727        if (FI->getIndex() < 0) {
1728          int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
1729          int64_t InLastByte = InFirstByte;
1730          InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
1731
1732          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1733              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1734            ArgChains.push_back(SDValue(L, 1));
1735        }
1736
  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
                     &ArgChains[0], ArgChains.size());
1740}
1741
1742static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) {
1743  switch (CC) {
1744  case ISD::SETEQ:  return A64CC::EQ;
1745  case ISD::SETGT:  return A64CC::GT;
1746  case ISD::SETGE:  return A64CC::GE;
1747  case ISD::SETLT:  return A64CC::LT;
1748  case ISD::SETLE:  return A64CC::LE;
1749  case ISD::SETNE:  return A64CC::NE;
1750  case ISD::SETUGT: return A64CC::HI;
1751  case ISD::SETUGE: return A64CC::HS;
1752  case ISD::SETULT: return A64CC::LO;
1753  case ISD::SETULE: return A64CC::LS;
1754  default: llvm_unreachable("Unexpected condition code");
1755  }
1756}
1757
1758bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const {
1759  // icmp is implemented using adds/subs immediate, which take an unsigned
1760  // 12-bit immediate, optionally shifted left by 12 bits.
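  //
  // For example (illustrative values): 0xabc and 0xabc000 are legal
  // immediates, while 0xabc00 is not, since it has set bits both below bit 12
  // and at or above bit 12.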
1761
  // The range is symmetric about zero because we can use either adds or subs.
1763  if (Val < 0)
1764    Val = -Val;
1765
1766  return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0;
1767}
1768
1769SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS,
1770                                        ISD::CondCode CC, SDValue &A64cc,
1771                                        SelectionDAG &DAG, SDLoc &dl) const {
1772  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1773    int64_t C = 0;
1774    EVT VT = RHSC->getValueType(0);
1775    bool knownInvalid = false;
1776
1777    // I'm not convinced the rest of LLVM handles these edge cases properly, but
1778    // we can at least get it right.
1779    if (isSignedIntSetCC(CC)) {
1780      C = RHSC->getSExtValue();
1781    } else if (RHSC->getZExtValue() > INT64_MAX) {
1782      // A 64-bit constant not representable by a signed 64-bit integer is far
1783      // too big to fit into a SUBS immediate anyway.
1784      knownInvalid = true;
1785    } else {
1786      C = RHSC->getZExtValue();
1787    }
1788
1789    if (!knownInvalid && !isLegalICmpImmediate(C)) {
      // The constant doesn't fit; try adjusting it by one and changing the
      // condition to compensate.
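      // Illustrative example: "x < 0x1001" (SETLT with 0x1001, not encodable)
      // can become "x <= 0x1000" (SETLE with 0x1000, which is encodable).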
1791      switch (CC) {
1792      default: break;
1793      case ISD::SETLT:
1794      case ISD::SETGE:
1795        if (isLegalICmpImmediate(C-1)) {
1796          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1797          RHS = DAG.getConstant(C-1, VT);
1798        }
1799        break;
1800      case ISD::SETULT:
1801      case ISD::SETUGE:
1802        if (isLegalICmpImmediate(C-1)) {
1803          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1804          RHS = DAG.getConstant(C-1, VT);
1805        }
1806        break;
1807      case ISD::SETLE:
1808      case ISD::SETGT:
1809        if (isLegalICmpImmediate(C+1)) {
1810          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1811          RHS = DAG.getConstant(C+1, VT);
1812        }
1813        break;
1814      case ISD::SETULE:
1815      case ISD::SETUGT:
1816        if (isLegalICmpImmediate(C+1)) {
1817          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1818          RHS = DAG.getConstant(C+1, VT);
1819        }
1820        break;
1821      }
1822    }
1823  }
1824
1825  A64CC::CondCodes CondCode = IntCCToA64CC(CC);
1826  A64cc = DAG.getConstant(CondCode, MVT::i32);
1827  return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
1828                     DAG.getCondCode(CC));
1829}
1830
1831static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC,
1832                                    A64CC::CondCodes &Alternative) {
1833  A64CC::CondCodes CondCode = A64CC::Invalid;
1834  Alternative = A64CC::Invalid;
1835
1836  switch (CC) {
1837  default: llvm_unreachable("Unknown FP condition!");
1838  case ISD::SETEQ:
1839  case ISD::SETOEQ: CondCode = A64CC::EQ; break;
1840  case ISD::SETGT:
1841  case ISD::SETOGT: CondCode = A64CC::GT; break;
1842  case ISD::SETGE:
1843  case ISD::SETOGE: CondCode = A64CC::GE; break;
1844  case ISD::SETOLT: CondCode = A64CC::MI; break;
1845  case ISD::SETOLE: CondCode = A64CC::LS; break;
1846  case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break;
1847  case ISD::SETO:   CondCode = A64CC::VC; break;
1848  case ISD::SETUO:  CondCode = A64CC::VS; break;
1849  case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break;
1850  case ISD::SETUGT: CondCode = A64CC::HI; break;
1851  case ISD::SETUGE: CondCode = A64CC::PL; break;
1852  case ISD::SETLT:
1853  case ISD::SETULT: CondCode = A64CC::LT; break;
1854  case ISD::SETLE:
1855  case ISD::SETULE: CondCode = A64CC::LE; break;
1856  case ISD::SETNE:
1857  case ISD::SETUNE: CondCode = A64CC::NE; break;
1858  }
1859  return CondCode;
1860}
1861
1862SDValue
1863AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
1864  SDLoc DL(Op);
1865  EVT PtrVT = getPointerTy();
1866  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
1867
1868  switch(getTargetMachine().getCodeModel()) {
1869  case CodeModel::Small:
1870    // The most efficient code is PC-relative anyway for the small memory model,
1871    // so we don't need to worry about relocation model.
1872    return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
1873                       DAG.getTargetBlockAddress(BA, PtrVT, 0,
1874                                                 AArch64II::MO_NO_FLAG),
1875                       DAG.getTargetBlockAddress(BA, PtrVT, 0,
1876                                                 AArch64II::MO_LO12),
1877                       DAG.getConstant(/*Alignment=*/ 4, MVT::i32));
1878  case CodeModel::Large:
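    // These four operands are expected to select to a MOVZ followed by three
    // MOVKs, materialising the 64-bit absolute address 16 bits at a time
    // (a sketch of the intended code; nothing here depends on that form).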
1879    return DAG.getNode(
1880      AArch64ISD::WrapperLarge, DL, PtrVT,
1881      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3),
1882      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
1883      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
1884      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
1885  default:
1886    llvm_unreachable("Only small and large code models supported now");
1887  }
1888}
1889
1890
1891// (BRCOND chain, val, dest)
1892SDValue
1893AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1894  SDLoc dl(Op);
1895  SDValue Chain = Op.getOperand(0);
1896  SDValue TheBit = Op.getOperand(1);
1897  SDValue DestBB = Op.getOperand(2);
1898
1899  // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
1900  // that as the consumer we are responsible for ignoring rubbish in higher
1901  // bits.
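  //
  // Conceptually this lowers "brcond %cond, dest" to: mask off bit 0 of the
  // condition, compare it against zero and branch on NE (the AND, SETCC and
  // BR_CC nodes below).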
1902  TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
1903                       DAG.getConstant(1, MVT::i32));
1904
1905  SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
1906                               DAG.getConstant(0, TheBit.getValueType()),
1907                               DAG.getCondCode(ISD::SETNE));
1908
1909  return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain,
1910                     A64CMP, DAG.getConstant(A64CC::NE, MVT::i32),
1911                     DestBB);
1912}
1913
1914// (BR_CC chain, condcode, lhs, rhs, dest)
1915SDValue
1916AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
1917  SDLoc dl(Op);
1918  SDValue Chain = Op.getOperand(0);
1919  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
1920  SDValue LHS = Op.getOperand(2);
1921  SDValue RHS = Op.getOperand(3);
1922  SDValue DestBB = Op.getOperand(4);
1923
1924  if (LHS.getValueType() == MVT::f128) {
1925    // f128 comparisons are lowered to runtime calls by a routine which sets
1926    // LHS, RHS and CC appropriately for the rest of this function to continue.
1927    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
1928
1929    // If softenSetCCOperands returned a scalar, we need to compare the result
1930    // against zero to select between true and false values.
1931    if (RHS.getNode() == 0) {
1932      RHS = DAG.getConstant(0, LHS.getValueType());
1933      CC = ISD::SETNE;
1934    }
1935  }
1936
1937  if (LHS.getValueType().isInteger()) {
1938    SDValue A64cc;
1939
1940    // Integers are handled in a separate function because the combinations of
1941    // immediates and tests can get hairy and we may want to fiddle things.
1942    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
1943
1944    return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
1945                       Chain, CmpOp, A64cc, DestBB);
1946  }
1947
1948  // Note that some LLVM floating-point CondCodes can't be lowered to a single
1949  // conditional branch, hence FPCCToA64CC can set a second test, where either
1950  // passing is sufficient.
1951  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
1952  CondCode = FPCCToA64CC(CC, Alternative);
1953  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
1954  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
1955                              DAG.getCondCode(CC));
1956  SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
1957                                 Chain, SetCC, A64cc, DestBB);
1958
1959  if (Alternative != A64CC::Invalid) {
1960    A64cc = DAG.getConstant(Alternative, MVT::i32);
1961    A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
1962                           A64BR_CC, SetCC, A64cc, DestBB);
1963
1964  }
1965
1966  return A64BR_CC;
1967}
1968
1969SDValue
1970AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
1971                                       RTLIB::Libcall Call) const {
1972  ArgListTy Args;
1973  ArgListEntry Entry;
1974  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
1975    EVT ArgVT = Op.getOperand(i).getValueType();
1976    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
1977    Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy;
1978    Entry.isSExt = false;
1979    Entry.isZExt = false;
1980    Args.push_back(Entry);
1981  }
1982  SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy());
1983
1984  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
1985
1986  // By default, the input chain to this libcall is the entry node of the
1987  // function. If the libcall is going to be emitted as a tail call then
1988  // isUsedByReturnOnly will change it to the right chain if the return
1989  // node which is being folded has a non-entry input chain.
1990  SDValue InChain = DAG.getEntryNode();
1991
1992  // isTailCall may be true since the callee does not reference caller stack
1993  // frame. Check if it's in the right position.
1994  SDValue TCChain = InChain;
1995  bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain);
1996  if (isTailCall)
1997    InChain = TCChain;
1998
1999  TargetLowering::
2000  CallLoweringInfo CLI(InChain, RetTy, false, false, false, false,
2001                    0, getLibcallCallingConv(Call), isTailCall,
2002                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
2003                    Callee, Args, DAG, SDLoc(Op));
2004  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
2005
2006  if (!CallInfo.second.getNode())
2007    // It's a tailcall, return the chain (which is the DAG root).
2008    return DAG.getRoot();
2009
2010  return CallInfo.first;
2011}
2012
2013SDValue
2014AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
2015  if (Op.getOperand(0).getValueType() != MVT::f128) {
2016    // It's legal except when f128 is involved
2017    return Op;
2018  }
2019
2020  RTLIB::Libcall LC;
2021  LC  = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2022
2023  SDValue SrcVal = Op.getOperand(0);
2024  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
2025                     /*isSigned*/ false, SDLoc(Op)).first;
2026}
2027
2028SDValue
2029AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
2030  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2031
2032  RTLIB::Libcall LC;
2033  LC  = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2034
2035  return LowerF128ToCall(Op, DAG, LC);
2036}
2037
2038SDValue
2039AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2040                                      bool IsSigned) const {
2041  if (Op.getOperand(0).getValueType() != MVT::f128) {
2042    // It's legal except when f128 is involved
2043    return Op;
2044  }
2045
2046  RTLIB::Libcall LC;
2047  if (IsSigned)
2048    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2049  else
2050    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2051
2052  return LowerF128ToCall(Op, DAG, LC);
2053}
2054
SDValue
AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
2056  MachineFunction &MF = DAG.getMachineFunction();
2057  MachineFrameInfo *MFI = MF.getFrameInfo();
2058  MFI->setReturnAddressIsTaken(true);
2059
2060  EVT VT = Op.getValueType();
2061  SDLoc dl(Op);
2062  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2063  if (Depth) {
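    // We assume the standard AArch64 frame record layout here: [FP] holds the
    // caller's frame pointer and [FP + 8] holds the saved LR, i.e. that
    // frame's return address.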
2064    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
2065    SDValue Offset = DAG.getConstant(8, MVT::i64);
2066    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
2067                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
2068                       MachinePointerInfo(), false, false, false, 0);
2069  }
2070
2071  // Return X30, which contains the return address. Mark it an implicit live-in.
2072  unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64));
2073  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64);
2074}
2075
2076
2077SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG)
2078                                              const {
2079  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
2080  MFI->setFrameAddressIsTaken(true);
2081
2082  EVT VT = Op.getValueType();
2083  SDLoc dl(Op);
2084  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2085  unsigned FrameReg = AArch64::X29;
2086  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
2087  while (Depth--)
2088    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
2089                            MachinePointerInfo(),
2090                            false, false, false, 0);
2091  return FrameAddr;
2092}
2093
2094SDValue
2095AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
2096                                                  SelectionDAG &DAG) const {
2097  assert(getTargetMachine().getCodeModel() == CodeModel::Large);
2098  assert(getTargetMachine().getRelocationModel() == Reloc::Static);
2099
2100  EVT PtrVT = getPointerTy();
2101  SDLoc dl(Op);
2102  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
2103  const GlobalValue *GV = GN->getGlobal();
2104
2105  SDValue GlobalAddr = DAG.getNode(
2106      AArch64ISD::WrapperLarge, dl, PtrVT,
2107      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3),
2108      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
2109      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
2110      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
2111
2112  if (GN->getOffset() != 0)
2113    return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
2114                       DAG.getConstant(GN->getOffset(), PtrVT));
2115
2116  return GlobalAddr;
2117}
2118
2119SDValue
2120AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op,
2121                                                  SelectionDAG &DAG) const {
2122  assert(getTargetMachine().getCodeModel() == CodeModel::Small);
2123
2124  EVT PtrVT = getPointerTy();
2125  SDLoc dl(Op);
2126  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
2127  const GlobalValue *GV = GN->getGlobal();
2128  unsigned Alignment = GV->getAlignment();
2129  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2130  if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) {
    // Weak undefined symbols can't use an ADRP/ADD pair since they should
    // evaluate to zero when they remain undefined. In PIC mode the GOT can take
    // care of this, but in absolute mode we use a constant pool load.
2134    SDValue PoolAddr;
2135    PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2136                           DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
2137                                                     AArch64II::MO_NO_FLAG),
2138                           DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
2139                                                     AArch64II::MO_LO12),
2140                           DAG.getConstant(8, MVT::i32));
2141    SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
2142                                     MachinePointerInfo::getConstantPool(),
2143                                     /*isVolatile=*/ false,
2144                                     /*isNonTemporal=*/ true,
2145                                     /*isInvariant=*/ true, 8);
2146    if (GN->getOffset() != 0)
2147      return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
2148                         DAG.getConstant(GN->getOffset(), PtrVT));
2149
2150    return GlobalAddr;
2151  }
2152
2153  if (Alignment == 0) {
2154    const PointerType *GVPtrTy = cast<PointerType>(GV->getType());
2155    if (GVPtrTy->getElementType()->isSized()) {
2156      Alignment
2157        = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType());
2158    } else {
2159      // Be conservative if we can't guess, not that it really matters:
2160      // functions and labels aren't valid for loads, and the methods used to
2161      // actually calculate an address work with any alignment.
2162      Alignment = 1;
2163    }
2164  }
2165
2166  unsigned char HiFixup, LoFixup;
2167  bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM);
2168
2169  if (UseGOT) {
2170    HiFixup = AArch64II::MO_GOT;
2171    LoFixup = AArch64II::MO_GOT_LO12;
2172    Alignment = 8;
2173  } else {
2174    HiFixup = AArch64II::MO_NO_FLAG;
2175    LoFixup = AArch64II::MO_LO12;
2176  }
2177
2178  // AArch64's small model demands the following sequence:
2179  // ADRP x0, somewhere
2180  // ADD x0, x0, #:lo12:somewhere ; (or LDR directly).
2181  SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2182                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
2183                                                             HiFixup),
2184                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
2185                                                             LoFixup),
2186                                  DAG.getConstant(Alignment, MVT::i32));
2187
2188  if (UseGOT) {
2189    GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(),
2190                            GlobalRef);
2191  }
2192
2193  if (GN->getOffset() != 0)
2194    return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef,
2195                       DAG.getConstant(GN->getOffset(), PtrVT));
2196
2197  return GlobalRef;
2198}
2199
2200SDValue
2201AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
2202                                             SelectionDAG &DAG) const {
2203  // TableGen doesn't have easy access to the CodeModel or RelocationModel, so
2204  // we make those distinctions here.
2205
2206  switch (getTargetMachine().getCodeModel()) {
2207  case CodeModel::Small:
2208    return LowerGlobalAddressELFSmall(Op, DAG);
2209  case CodeModel::Large:
2210    return LowerGlobalAddressELFLarge(Op, DAG);
2211  default:
2212    llvm_unreachable("Only small and large code models supported now");
2213  }
2214}
2215
2216SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
2217                                                SDValue DescAddr,
2218                                                SDLoc DL,
2219                                                SelectionDAG &DAG) const {
2220  EVT PtrVT = getPointerTy();
2221
  // The function we need to call is simply the first entry in the GOT for this
  // descriptor; load it in preparation.
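  //
  // For reference, the small-model TLS descriptor sequence is expected to end
  // up roughly as (a sketch; exact registers and scheduling may differ):
  //   adrp  x0, :tlsdesc:var
  //   ldr   x1, [x0, #:tlsdesc_lo12:var]
  //   add   x0, x0, #:tlsdesc_lo12:var
  //   .tlsdesccall var
  //   blr   x1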
2224  SDValue Func, Chain;
2225  Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
2226                     DescAddr);
2227
2228  // The function takes only one argument: the address of the descriptor itself
2229  // in X0.
2230  SDValue Glue;
2231  Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
2232  Glue = Chain.getValue(1);
2233
  // Finally, there's a special calling convention, which means that the lookup
  // must preserve all registers (except X0, obviously).
2236  const TargetRegisterInfo *TRI  = getTargetMachine().getRegisterInfo();
2237  const AArch64RegisterInfo *A64RI
2238    = static_cast<const AArch64RegisterInfo *>(TRI);
2239  const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask();
2240
2241  // We're now ready to populate the argument list, as with a normal call:
2242  std::vector<SDValue> Ops;
2243  Ops.push_back(Chain);
2244  Ops.push_back(Func);
2245  Ops.push_back(SymAddr);
2246  Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
2247  Ops.push_back(DAG.getRegisterMask(Mask));
2248  Ops.push_back(Glue);
2249
2250  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2251  Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0],
2252                      Ops.size());
2253  Glue = Chain.getValue(1);
2254
2255  // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it
2256  // back to the generic handling code.
2257  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
2258}
2259
2260SDValue
2261AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
2262                                             SelectionDAG &DAG) const {
2263  assert(getSubtarget()->isTargetELF() &&
2264         "TLS not implemented for non-ELF targets");
2265  assert(getTargetMachine().getCodeModel() == CodeModel::Small
2266         && "TLS only supported in small memory model");
2267  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2268
2269  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
2270
2271  SDValue TPOff;
2272  EVT PtrVT = getPointerTy();
2273  SDLoc DL(Op);
2274  const GlobalValue *GV = GA->getGlobal();
2275
2276  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
2277
2278  if (Model == TLSModel::InitialExec) {
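    // Initial-exec: the variable's offset from the thread pointer is loaded
    // from a GOT entry, roughly "adrp xN, :gottprel:var" followed by
    // "ldr xN, [xN, #:gottprel_lo12:var]" (a sketch of the expected selection).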
2279    TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2280                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2281                                                   AArch64II::MO_GOTTPREL),
2282                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2283                                                   AArch64II::MO_GOTTPREL_LO12),
2284                        DAG.getConstant(8, MVT::i32));
2285    TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
2286                        TPOff);
2287  } else if (Model == TLSModel::LocalExec) {
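    // Local-exec: the TP-relative offset is a link-time constant. It is
    // materialised by the MOVZ/MOVK pair built below, using the :tprel_g1:
    // and :tprel_g0_nc: operands.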
2288    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2289                                               AArch64II::MO_TPREL_G1);
2290    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2291                                               AArch64II::MO_TPREL_G0_NC);
2292
2293    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
2294                                       DAG.getTargetConstant(1, MVT::i32)), 0);
2295    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
2296                                       TPOff, LoVar,
2297                                       DAG.getTargetConstant(0, MVT::i32)), 0);
2298  } else if (Model == TLSModel::GeneralDynamic) {
2299    // Accesses used in this sequence go via the TLS descriptor which lives in
2300    // the GOT. Prepare an address we can use to handle this.
2301    SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2302                                                AArch64II::MO_TLSDESC);
2303    SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2304                                                AArch64II::MO_TLSDESC_LO12);
2305    SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2306                                   HiDesc, LoDesc,
2307                                   DAG.getConstant(8, MVT::i32));
2308    SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0);
2309
2310    TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
2311  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases: first a general-dynamic
    // TLS descriptor call against the special symbol _TLS_MODULE_BASE_
    // calculates the beginning of the module's TLS region, then a DTPREL
    // offset is added to reach the variable itself.
2316
2317    // These accesses will need deduplicating if there's more than one.
2318    AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction()
2319      .getInfo<AArch64MachineFunctionInfo>();
2320    MFI->incNumLocalDynamicTLSAccesses();

    // Get the location of _TLS_MODULE_BASE_:
2324    SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
2325                                                AArch64II::MO_TLSDESC);
2326    SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
2327                                                AArch64II::MO_TLSDESC_LO12);
2328    SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2329                                   HiDesc, LoDesc,
2330                                   DAG.getConstant(8, MVT::i32));
2331    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT);
2332
2333    ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
2334
2335    // Get the variable's offset from _TLS_MODULE_BASE_
2336    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2337                                               AArch64II::MO_DTPREL_G1);
2338    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2339                                               AArch64II::MO_DTPREL_G0_NC);
2340
2341    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
2342                                       DAG.getTargetConstant(0, MVT::i32)), 0);
2343    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
2344                                       TPOff, LoVar,
2345                                       DAG.getTargetConstant(0, MVT::i32)), 0);
  } else
    llvm_unreachable("Unsupported TLS access model");

2350  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
2351}
2352
2353SDValue
2354AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2355                                      bool IsSigned) const {
2356  if (Op.getValueType() != MVT::f128) {
2357    // Legal for everything except f128.
2358    return Op;
2359  }
2360
2361  RTLIB::Libcall LC;
2362  if (IsSigned)
2363    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2364  else
2365    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2366
2367  return LowerF128ToCall(Op, DAG, LC);
2368}
2369
2370
2371SDValue
2372AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
2373  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
2374  SDLoc dl(JT);
2375  EVT PtrVT = getPointerTy();
2376
  // When compiling PIC, jump tables get put in the code section, so a static
  // relocation style is acceptable for both cases.
2379  switch (getTargetMachine().getCodeModel()) {
2380  case CodeModel::Small:
2381    return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2382                       DAG.getTargetJumpTable(JT->getIndex(), PtrVT),
2383                       DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
2384                                              AArch64II::MO_LO12),
2385                       DAG.getConstant(1, MVT::i32));
2386  case CodeModel::Large:
2387    return DAG.getNode(
2388      AArch64ISD::WrapperLarge, dl, PtrVT,
2389      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3),
2390      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC),
2391      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC),
2392      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC));
2393  default:
2394    llvm_unreachable("Only small and large code models supported now");
2395  }
2396}
2397
2398// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
2399SDValue
2400AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
2401  SDLoc dl(Op);
2402  SDValue LHS = Op.getOperand(0);
2403  SDValue RHS = Op.getOperand(1);
2404  SDValue IfTrue = Op.getOperand(2);
2405  SDValue IfFalse = Op.getOperand(3);
2406  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
2407
2408  if (LHS.getValueType() == MVT::f128) {
2409    // f128 comparisons are lowered to libcalls, but slot in nicely here
2410    // afterwards.
2411    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
2412
2413    // If softenSetCCOperands returned a scalar, we need to compare the result
2414    // against zero to select between true and false values.
2415    if (RHS.getNode() == 0) {
2416      RHS = DAG.getConstant(0, LHS.getValueType());
2417      CC = ISD::SETNE;
2418    }
2419  }
2420
2421  if (LHS.getValueType().isInteger()) {
2422    SDValue A64cc;
2423
2424    // Integers are handled in a separate function because the combinations of
2425    // immediates and tests can get hairy and we may want to fiddle things.
2426    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
2427
2428    return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2429                       CmpOp, IfTrue, IfFalse, A64cc);
2430  }
2431
2432  // Note that some LLVM floating-point CondCodes can't be lowered to a single
2433  // conditional branch, hence FPCCToA64CC can set a second test, where either
2434  // passing is sufficient.
2435  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
2436  CondCode = FPCCToA64CC(CC, Alternative);
2437  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
2438  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
2439                              DAG.getCondCode(CC));
2440  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
2441                                     Op.getValueType(),
2442                                     SetCC, IfTrue, IfFalse, A64cc);
2443
2444  if (Alternative != A64CC::Invalid) {
2445    A64cc = DAG.getConstant(Alternative, MVT::i32);
2446    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2447                               SetCC, IfTrue, A64SELECT_CC, A64cc);
2448
2449  }
2450
2451  return A64SELECT_CC;
2452}
2453
2454// (SELECT testbit, iftrue, iffalse)
2455SDValue
2456AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2457  SDLoc dl(Op);
2458  SDValue TheBit = Op.getOperand(0);
2459  SDValue IfTrue = Op.getOperand(1);
2460  SDValue IfFalse = Op.getOperand(2);
2461
2462  // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
2463  // that as the consumer we are responsible for ignoring rubbish in higher
2464  // bits.
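  //
  // Conceptually: test bit 0 of the condition and select between the two
  // values on NE. After selection this is expected to become a compare plus
  // CSEL (or FCSEL for floating-point results); that's a sketch, nothing
  // below relies on the exact form.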
2465  TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
2466                       DAG.getConstant(1, MVT::i32));
2467  SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
2468                               DAG.getConstant(0, TheBit.getValueType()),
2469                               DAG.getCondCode(ISD::SETNE));
2470
2471  return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2472                     A64CMP, IfTrue, IfFalse,
2473                     DAG.getConstant(A64CC::NE, MVT::i32));
2474}
2475
2476static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) {
2477  SDLoc DL(Op);
2478  SDValue LHS = Op.getOperand(0);
2479  SDValue RHS = Op.getOperand(1);
2480  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2481  EVT VT = Op.getValueType();
2482  bool Invert = false;
2483  SDValue Op0, Op1;
2484  unsigned Opcode;
2485
2486  if (LHS.getValueType().isInteger()) {
2487
2488    // Attempt to use Vector Integer Compare Mask Test instruction.
2489    // TST = icmp ne (and (op0, op1), zero).
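    // e.g. (setne (and x, y), 0) becomes NEON_TST x, y, which is expected to
    // select to the vector CMTST instruction (illustrative, not load-bearing).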
2490    if (CC == ISD::SETNE) {
2491      if (((LHS.getOpcode() == ISD::AND) &&
2492           ISD::isBuildVectorAllZeros(RHS.getNode())) ||
2493          ((RHS.getOpcode() == ISD::AND) &&
2494           ISD::isBuildVectorAllZeros(LHS.getNode()))) {
2495
2496        SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS;
2497        SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0));
2498        SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1));
2499        return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS);
2500      }
2501    }
2502
2503    // Attempt to use Vector Integer Compare Mask against Zero instr (Signed).
2504    // Note: Compare against Zero does not support unsigned predicates.
2505    if ((ISD::isBuildVectorAllZeros(RHS.getNode()) ||
2506         ISD::isBuildVectorAllZeros(LHS.getNode())) &&
2507        !isUnsignedIntSetCC(CC)) {
2508
2509      // If LHS is the zero value, swap operands and CondCode.
2510      if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
2511        CC = getSetCCSwappedOperands(CC);
2512        Op0 = RHS;
2513      } else
2514        Op0 = LHS;
2515
2516      // Ensure valid CondCode for Compare Mask against Zero instruction:
2517      // EQ, GE, GT, LE, LT.
2518      if (ISD::SETNE == CC) {
2519        Invert = true;
2520        CC = ISD::SETEQ;
2521      }
2522
2523      // Using constant type to differentiate integer and FP compares with zero.
2524      Op1 = DAG.getConstant(0, MVT::i32);
2525      Opcode = AArch64ISD::NEON_CMPZ;
2526
2527    } else {
2528      // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned).
2529      // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT.
2530      bool Swap = false;
2531      switch (CC) {
2532      default:
2533        llvm_unreachable("Illegal integer comparison.");
2534      case ISD::SETEQ:
2535      case ISD::SETGT:
2536      case ISD::SETGE:
2537      case ISD::SETUGT:
2538      case ISD::SETUGE:
2539        break;
2540      case ISD::SETNE:
2541        Invert = true;
2542        CC = ISD::SETEQ;
2543        break;
2544      case ISD::SETULT:
2545      case ISD::SETULE:
2546      case ISD::SETLT:
2547      case ISD::SETLE:
2548        Swap = true;
2549        CC = getSetCCSwappedOperands(CC);
2550      }
2551
2552      if (Swap)
2553        std::swap(LHS, RHS);
2554
2555      Opcode = AArch64ISD::NEON_CMP;
2556      Op0 = LHS;
2557      Op1 = RHS;
2558    }
2559
2560    // Generate Compare Mask instr or Compare Mask against Zero instr.
2561    SDValue NeonCmp =
2562        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
2563
2564    if (Invert)
2565      NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
2566
2567    return NeonCmp;
2568  }
2569
2570  // Now handle Floating Point cases.
2571  // Attempt to use Vector Floating Point Compare Mask against Zero instruction.
2572  if (ISD::isBuildVectorAllZeros(RHS.getNode()) ||
2573      ISD::isBuildVectorAllZeros(LHS.getNode())) {
2574
2575    // If LHS is the zero value, swap operands and CondCode.
2576    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
2577      CC = getSetCCSwappedOperands(CC);
2578      Op0 = RHS;
2579    } else
2580      Op0 = LHS;
2581
2582    // Using constant type to differentiate integer and FP compares with zero.
2583    Op1 = DAG.getConstantFP(0, MVT::f32);
2584    Opcode = AArch64ISD::NEON_CMPZ;
2585  } else {
2586    // Attempt to use Vector Floating Point Compare Mask instruction.
2587    Op0 = LHS;
2588    Op1 = RHS;
2589    Opcode = AArch64ISD::NEON_CMP;
2590  }
2591
2592  SDValue NeonCmpAlt;
2593  // Some register compares have to be implemented with swapped CC and operands,
2594  // e.g.: OLT implemented as OGT with swapped operands.
2595  bool SwapIfRegArgs = false;
2596
2597  // Ensure valid CondCode for FP Compare Mask against Zero instruction:
2598  // EQ, GE, GT, LE, LT.
2599  // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT.
2600  switch (CC) {
2601  default:
2602    llvm_unreachable("Illegal FP comparison");
2603  case ISD::SETUNE:
2604  case ISD::SETNE:
2605    Invert = true; // Fallthrough
2606  case ISD::SETOEQ:
2607  case ISD::SETEQ:
2608    CC = ISD::SETEQ;
2609    break;
2610  case ISD::SETOLT:
2611  case ISD::SETLT:
2612    CC = ISD::SETLT;
2613    SwapIfRegArgs = true;
2614    break;
2615  case ISD::SETOGT:
2616  case ISD::SETGT:
2617    CC = ISD::SETGT;
2618    break;
2619  case ISD::SETOLE:
2620  case ISD::SETLE:
2621    CC = ISD::SETLE;
2622    SwapIfRegArgs = true;
2623    break;
2624  case ISD::SETOGE:
2625  case ISD::SETGE:
2626    CC = ISD::SETGE;
2627    break;
2628  case ISD::SETUGE:
2629    Invert = true;
2630    CC = ISD::SETLT;
2631    SwapIfRegArgs = true;
2632    break;
2633  case ISD::SETULE:
2634    Invert = true;
2635    CC = ISD::SETGT;
2636    break;
2637  case ISD::SETUGT:
2638    Invert = true;
2639    CC = ISD::SETLE;
2640    SwapIfRegArgs = true;
2641    break;
2642  case ISD::SETULT:
2643    Invert = true;
2644    CC = ISD::SETGE;
2645    break;
2646  case ISD::SETUEQ:
2647    Invert = true; // Fallthrough
2648  case ISD::SETONE:
    // Expand this to (OGT | OLT).
2650    NeonCmpAlt =
2651        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT));
2652    CC = ISD::SETLT;
2653    SwapIfRegArgs = true;
2654    break;
2655  case ISD::SETUO:
2656    Invert = true; // Fallthrough
2657  case ISD::SETO:
2658    // Expand this to (OGE | OLT).
2659    NeonCmpAlt =
2660        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE));
2661    CC = ISD::SETLT;
2662    SwapIfRegArgs = true;
2663    break;
2664  }
2665
2666  if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) {
2667    CC = getSetCCSwappedOperands(CC);
2668    std::swap(Op0, Op1);
2669  }
2670
2671  // Generate FP Compare Mask instr or FP Compare Mask against Zero instr
2672  SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
2673
2674  if (NeonCmpAlt.getNode())
2675    NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt);
2676
2677  if (Invert)
2678    NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
2679
2680  return NeonCmp;
2681}
2682
2683// (SETCC lhs, rhs, condcode)
2684SDValue
2685AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
2686  SDLoc dl(Op);
2687  SDValue LHS = Op.getOperand(0);
2688  SDValue RHS = Op.getOperand(1);
2689  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2690  EVT VT = Op.getValueType();
2691
2692  if (VT.isVector())
2693    return LowerVectorSETCC(Op, DAG);
2694
2695  if (LHS.getValueType() == MVT::f128) {
2696    // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
2697    // for the rest of the function (some i32 or i64 values).
2698    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
2699
2700    // If softenSetCCOperands returned a scalar, use it.
2701    if (RHS.getNode() == 0) {
2702      assert(LHS.getValueType() == Op.getValueType() &&
2703             "Unexpected setcc expansion!");
2704      return LHS;
2705    }
2706  }
2707
2708  if (LHS.getValueType().isInteger()) {
2709    SDValue A64cc;
2710
2711    // Integers are handled in a separate function because the combinations of
2712    // immediates and tests can get hairy and we may want to fiddle things.
2713    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
2714
2715    return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
2716                       CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT),
2717                       A64cc);
2718  }
2719
2720  // Note that some LLVM floating-point CondCodes can't be lowered to a single
2721  // conditional branch, hence FPCCToA64CC can set a second test, where either
2722  // passing is sufficient.
2723  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
2724  CondCode = FPCCToA64CC(CC, Alternative);
2725  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
2726  SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
2727                              DAG.getCondCode(CC));
2728  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
2729                                     CmpOp, DAG.getConstant(1, VT),
2730                                     DAG.getConstant(0, VT), A64cc);
2731
2732  if (Alternative != A64CC::Invalid) {
2733    A64cc = DAG.getConstant(Alternative, MVT::i32);
2734    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
2735                               DAG.getConstant(1, VT), A64SELECT_CC, A64cc);
2736  }
2737
2738  return A64SELECT_CC;
2739}
2740
2741SDValue
2742AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
2743  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
2744  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
2745
2746  // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
2747  // rather than just 8.
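  // For reference, those 32 bytes are the five va_list fields that
  // LowerVASTART below writes out:
  //   void *__stack;   // offset 0
  //   void *__gr_top;  // offset 8
  //   void *__vr_top;  // offset 16
  //   int   __gr_offs; // offset 24
  //   int   __vr_offs; // offset 28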
2748  return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op),
2749                       Op.getOperand(1), Op.getOperand(2),
2750                       DAG.getConstant(32, MVT::i32), 8, false, false,
2751                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
2752}
2753
2754SDValue
2755AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2756  // The layout of the va_list struct is specified in the AArch64 Procedure Call
2757  // Standard, section B.3.
2758  MachineFunction &MF = DAG.getMachineFunction();
2759  AArch64MachineFunctionInfo *FuncInfo
2760    = MF.getInfo<AArch64MachineFunctionInfo>();
2761  SDLoc DL(Op);
2762
2763  SDValue Chain = Op.getOperand(0);
2764  SDValue VAList = Op.getOperand(1);
2765  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2766  SmallVector<SDValue, 4> MemOps;
2767
2768  // void *__stack at offset 0
2769  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(),
2770                                    getPointerTy());
2771  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
2772                                MachinePointerInfo(SV), false, false, 0));
2773
2774  // void *__gr_top at offset 8
2775  int GPRSize = FuncInfo->getVariadicGPRSize();
2776  if (GPRSize > 0) {
2777    SDValue GRTop, GRTopAddr;
2778
2779    GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2780                            DAG.getConstant(8, getPointerTy()));
2781
2782    GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy());
2783    GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
2784                        DAG.getConstant(GPRSize, getPointerTy()));
2785
2786    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
2787                                  MachinePointerInfo(SV, 8),
2788                                  false, false, 0));
2789  }
2790
2791  // void *__vr_top at offset 16
2792  int FPRSize = FuncInfo->getVariadicFPRSize();
2793  if (FPRSize > 0) {
2794    SDValue VRTop, VRTopAddr;
2795    VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2796                            DAG.getConstant(16, getPointerTy()));
2797
2798    VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy());
2799    VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
2800                        DAG.getConstant(FPRSize, getPointerTy()));
2801
2802    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
2803                                  MachinePointerInfo(SV, 16),
2804                                  false, false, 0));
2805  }
2806
2807  // int __gr_offs at offset 24
2808  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2809                                   DAG.getConstant(24, getPointerTy()));
2810  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
2811                                GROffsAddr, MachinePointerInfo(SV, 24),
2812                                false, false, 0));
2813
2814  // int __vr_offs at offset 28
2815  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2816                                   DAG.getConstant(28, getPointerTy()));
2817  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
2818                                VROffsAddr, MachinePointerInfo(SV, 28),
2819                                false, false, 0));
2820
2821  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
2822                     MemOps.size());
2823}
2824
2825SDValue
2826AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2827  switch (Op.getOpcode()) {
2828  default: llvm_unreachable("Don't know how to custom lower this!");
2829  case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128);
2830  case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128);
2831  case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128);
2832  case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128);
2833  case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true);
2834  case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false);
2835  case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true);
2836  case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
2837  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
2838  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
2839  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
2840  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
2841
2842  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
2843  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
2844  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
2845  case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG);
2846  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
2847  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
2848  case ISD::SELECT: return LowerSELECT(Op, DAG);
2849  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
2850  case ISD::SETCC: return LowerSETCC(Op, DAG);
2851  case ISD::VACOPY: return LowerVACOPY(Op, DAG);
2852  case ISD::VASTART: return LowerVASTART(Op, DAG);
2853  case ISD::BUILD_VECTOR:
2854    return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
2855  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
2856  }
2857
2858  return SDValue();
2859}
2860
2861/// Check if the specified splat value corresponds to a valid vector constant
2862/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI).  If
2863/// so, return the encoded 8-bit immediate and the OpCmode instruction field
2864/// values.
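/// For example, a 32-bit splat of 0x0000ab00 is representable as 0xab LSL #8,
/// so this returns Imm = 0xab and OpCmode = 0x2 (the "Op=x, Cmode=001x" form
/// noted in the 32-bit case below).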
2865static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
2866                              unsigned SplatBitSize, SelectionDAG &DAG,
2867                              bool is128Bits, NeonModImmType type, EVT &VT,
2868                              unsigned &Imm, unsigned &OpCmode) {
2869  switch (SplatBitSize) {
2870  default:
2871    llvm_unreachable("unexpected size for isNeonModifiedImm");
2872  case 8: {
2873    if (type != Neon_Mov_Imm)
2874      return false;
2875    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
2876    // Neon movi per byte: Op=0, Cmode=1110.
2877    OpCmode = 0xe;
2878    Imm = SplatBits;
2879    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
2880    break;
2881  }
2882  case 16: {
2883    // Neon move inst per halfword
2884    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
2885    if ((SplatBits & ~0xff) == 0) {
2886      // Value = 0x00nn is 0x00nn LSL 0
2887      // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000
2888      // bic:  Op=1, Cmode=1001;  orr:  Op=0, Cmode=1001
2889      // Op=x, Cmode=100y
2890      Imm = SplatBits;
2891      OpCmode = 0x8;
2892      break;
2893    }
2894    if ((SplatBits & ~0xff00) == 0) {
2895      // Value = 0xnn00 is 0x00nn LSL 8
2896      // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010
2897      // bic:  Op=1, Cmode=1011;  orr:  Op=0, Cmode=1011
2898      // Op=x, Cmode=101x
2899      Imm = SplatBits >> 8;
2900      OpCmode = 0xa;
2901      break;
2902    }
2903    // can't handle any other
2904    return false;
2905  }
2906
2907  case 32: {
2908    // First the LSL variants (some interested instructions cannot use MSL).
2909
2910    // Neon move instr per word, shift zeros
2911    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
2912    if ((SplatBits & ~0xff) == 0) {
2913      // Value = 0x000000nn is 0x000000nn LSL 0
2914      // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000
2915      // bic:  Op=1, Cmode= 0001; orr:  Op=0, Cmode= 0001
2916      // Op=x, Cmode=000x
2917      Imm = SplatBits;
2918      OpCmode = 0;
2919      break;
2920    }
2921    if ((SplatBits & ~0xff00) == 0) {
2922      // Value = 0x0000nn00 is 0x000000nn LSL 8
2923      // movi: Op=0, Cmode= 0010;  mvni: Op=1, Cmode= 0010
2924      // bic:  Op=1, Cmode= 0011;  orr : Op=0, Cmode= 0011
2925      // Op=x, Cmode=001x
2926      Imm = SplatBits >> 8;
2927      OpCmode = 0x2;
2928      break;
2929    }
2930    if ((SplatBits & ~0xff0000) == 0) {
2931      // Value = 0x00nn0000 is 0x000000nn LSL 16
2932      // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100
2933      // bic:  Op=1, Cmode= 0101; orr:  Op=0, Cmode= 0101
2934      // Op=x, Cmode=010x
2935      Imm = SplatBits >> 16;
2936      OpCmode = 0x4;
2937      break;
2938    }
2939    if ((SplatBits & ~0xff000000) == 0) {
2940      // Value = 0xnn000000 is 0x000000nn LSL 24
2941      // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110
2942      // bic:  Op=1, Cmode= 0111; orr:  Op=0, Cmode= 0111
2943      // Op=x, Cmode=011x
2944      Imm = SplatBits >> 24;
2945      OpCmode = 0x6;
2946      break;
2947    }
2948
2949    // Now the MSL immediates.
2950
2951    // Neon move instr per word, shift ones
2952    if ((SplatBits & ~0xffff) == 0 &&
2953        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
2954      // Value = 0x0000nnff is 0x000000nn MSL 8
2955      // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100
2956      // Op=x, Cmode=1100
2957      Imm = SplatBits >> 8;
2958      OpCmode = 0xc;
2959      break;
2960    }
2961    if ((SplatBits & ~0xffffff) == 0 &&
2962        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
2963      // Value = 0x00nnffff is 0x000000nn MSL 16
2964      // movi: Op=0, Cmode= 1101; mvni: Op=1, Cmode= 1101
2965      // Op=x, Cmode=1101
2966      Imm = SplatBits >> 16;
2967      OpCmode = 0xd;
2968      break;
2969    }
2970    // can't handle any other
2971    return false;
2972  }
2973
2974  case 64: {
2975    if (type != Neon_Mov_Imm)
2976      return false;
2977    // Neon move instr bytemask, where each byte is either 0x00 or 0xff.
2978    // movi Op=1, Cmode=1110.
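    // For example, a splat of 0x00ff00ff00ff00ff sets bytes 0, 2, 4 and 6, so
    // the loop below produces Imm = 0b01010101 (0x55).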
2979    OpCmode = 0x1e;
2980    uint64_t BitMask = 0xff;
2981    uint64_t Val = 0;
2982    unsigned ImmMask = 1;
2983    Imm = 0;
2984    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
2985      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
2986        Val |= BitMask;
2987        Imm |= ImmMask;
2988      } else if ((SplatBits & BitMask) != 0) {
2989        return false;
2990      }
2991      BitMask <<= 8;
2992      ImmMask <<= 1;
2993    }
2994    SplatBits = Val;
2995    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
2996    break;
2997  }
2998  }
2999
3000  return true;
3001}
3002
3003static SDValue PerformANDCombine(SDNode *N,
3004                                 TargetLowering::DAGCombinerInfo &DCI) {
3005
3006  SelectionDAG &DAG = DCI.DAG;
3007  SDLoc DL(N);
3008  EVT VT = N->getValueType(0);
3009
3010  // We're looking for an AND/SRL pair which forms a UBFX.
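  // For example, (and (srl X, #4), 0xff) on i32 has LSB = 4 and Width = 8, so
  // it becomes UBFX(X, #4, #11), i.e. an unsigned extract of bits [11:4].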
3011
3012  if (VT != MVT::i32 && VT != MVT::i64)
3013    return SDValue();
3014
3015  if (!isa<ConstantSDNode>(N->getOperand(1)))
3016    return SDValue();
3017
3018  uint64_t TruncMask = N->getConstantOperandVal(1);
3019  if (!isMask_64(TruncMask))
3020    return SDValue();
3021
3022  uint64_t Width = CountPopulation_64(TruncMask);
3023  SDValue Shift = N->getOperand(0);
3024
3025  if (Shift.getOpcode() != ISD::SRL)
3026    return SDValue();
3027
3028  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
3029    return SDValue();
3030  uint64_t LSB = Shift->getConstantOperandVal(1);
3031
3032  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
3033    return SDValue();
3034
3035  return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0),
3036                     DAG.getConstant(LSB, MVT::i64),
3037                     DAG.getConstant(LSB + Width - 1, MVT::i64));
3038}
3039
3040/// For a true bitfield insert, the bits getting into that contiguous mask
3041/// should come from the low part of an existing value: they must be formed from
3042/// a compatible SHL operation (unless they're already low). This function
3043/// checks that condition and returns the least-significant bit that's
3044/// intended. If the operation is not a field preparation, -1 is returned.
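/// For example, with Mask = 0x0000ff00 and MaskedVal = (shl X, #8), the SHL
/// exactly cancels the implicit right shift, so MaskedVal becomes X and the
/// returned LSB is 8.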
3045static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT,
3046                            SDValue &MaskedVal, uint64_t Mask) {
3047  if (!isShiftedMask_64(Mask))
3048    return -1;
3049
3050  // Now we need to alter MaskedVal so that it is an appropriate input for a BFI
3051  // instruction. BFI will do a left-shift by LSB before applying the mask we've
3052  // spotted, so in general we should pre-emptively "undo" that by making sure
3053  // the incoming bits have had a right-shift applied to them.
3054  //
3055  // This right shift, however, will combine with existing left/right shifts. In
3056  // the simplest case of a completely straight bitfield operation, it will be
3057  // expected to completely cancel out with an existing SHL. More complicated
3058  // cases (e.g. bitfield to bitfield copy) may still need a real shift before
3059  // the BFI.
3060
3061  uint64_t LSB = countTrailingZeros(Mask);
3062  int64_t ShiftRightRequired = LSB;
3063  if (MaskedVal.getOpcode() == ISD::SHL &&
3064      isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
3065    ShiftRightRequired -= MaskedVal.getConstantOperandVal(1);
3066    MaskedVal = MaskedVal.getOperand(0);
3067  } else if (MaskedVal.getOpcode() == ISD::SRL &&
3068             isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
3069    ShiftRightRequired += MaskedVal.getConstantOperandVal(1);
3070    MaskedVal = MaskedVal.getOperand(0);
3071  }
3072
3073  if (ShiftRightRequired > 0)
3074    MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal,
3075                            DAG.getConstant(ShiftRightRequired, MVT::i64));
3076  else if (ShiftRightRequired < 0) {
3077    // We could actually end up with a residual left shift, for example with
3078    // "struc.bitfield = val << 1".
3079    MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal,
3080                            DAG.getConstant(-ShiftRightRequired, MVT::i64));
3081  }
3082
3083  return LSB;
3084}
3085
3086/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by
3087/// a mask and an extension. Returns true if a BFI was found and provides
3088/// information on its surroundings.
3089static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask,
3090                          bool &Extended) {
3091  Extended = false;
3092  if (N.getOpcode() == ISD::ZERO_EXTEND) {
3093    Extended = true;
3094    N = N.getOperand(0);
3095  }
3096
3097  if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
3098    Mask = N->getConstantOperandVal(1);
3099    N = N.getOperand(0);
3100  } else {
3101    // Mask is the whole width.
3102    Mask = -1ULL >> (64 - N.getValueType().getSizeInBits());
3103  }
3104
3105  if (N.getOpcode() == AArch64ISD::BFI) {
3106    BFI = N;
3107    return true;
3108  }
3109
3110  return false;
3111}
3112
3113/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which
3114/// is roughly equivalent to (and (BFI ...), mask). This form is used because it
3115/// can often be further combined with a larger mask. Ultimately, we want mask
3116/// to be 2^32-1 or 2^64-1 so the AND can be skipped.
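/// For example, on i32, (or (and X, 0xff00ff00), (and (shl Y, #4), 0xf0))
/// becomes (and (BFI X, Y, #4, #4), 0xff00fff0): Y supplies a 4-bit field at
/// bit 4 of X, and the residual AND keeps only the bits the two masks cover.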
3117static SDValue tryCombineToBFI(SDNode *N,
3118                               TargetLowering::DAGCombinerInfo &DCI,
3119                               const AArch64Subtarget *Subtarget) {
3120  SelectionDAG &DAG = DCI.DAG;
3121  SDLoc DL(N);
3122  EVT VT = N->getValueType(0);
3123
3124  assert(N->getOpcode() == ISD::OR && "Unexpected root");
3125
3126  // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or
3127  // abandon the effort.
3128  SDValue LHS = N->getOperand(0);
3129  if (LHS.getOpcode() != ISD::AND)
3130    return SDValue();
3131
3132  uint64_t LHSMask;
3133  if (isa<ConstantSDNode>(LHS.getOperand(1)))
3134    LHSMask = LHS->getConstantOperandVal(1);
3135  else
3136    return SDValue();
3137
3138  // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask
3139  // is or abandon the effort.
3140  SDValue RHS = N->getOperand(1);
3141  if (RHS.getOpcode() != ISD::AND)
3142    return SDValue();
3143
3144  uint64_t RHSMask;
3145  if (isa<ConstantSDNode>(RHS.getOperand(1)))
3146    RHSMask = RHS->getConstantOperandVal(1);
3147  else
3148    return SDValue();
3149
3150  // Can't do anything if the masks are incompatible.
3151  if (LHSMask & RHSMask)
3152    return SDValue();
3153
3154  // Now we need one of the masks to be a contiguous field. Without loss of
3155  // generality that should be the RHS one.
3156  SDValue Bitfield = LHS.getOperand(0);
3157  if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) {
3158    // We know that LHS is a candidate new value, and RHS isn't already a better
3159    // one.
3160    std::swap(LHS, RHS);
3161    std::swap(LHSMask, RHSMask);
3162  }
3163
3164  // We've done our best to put the right operands in the right places, all we
3165  // can do now is check whether a BFI exists.
3166  Bitfield = RHS.getOperand(0);
3167  int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask);
3168  if (LSB == -1)
3169    return SDValue();
3170
3171  uint32_t Width = CountPopulation_64(RHSMask);
3172  assert(Width && "Expected non-zero bitfield width");
3173
3174  SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
3175                            LHS.getOperand(0), Bitfield,
3176                            DAG.getConstant(LSB, MVT::i64),
3177                            DAG.getConstant(Width, MVT::i64));
3178
3179  // Mask is trivial
3180  if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits())))
3181    return BFI;
3182
3183  return DAG.getNode(ISD::AND, DL, VT, BFI,
3184                     DAG.getConstant(LHSMask | RHSMask, VT));
3185}
3186
3187/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its
3188/// original input. This is surprisingly common because SROA splits things up
3189/// into i8 chunks, so the originally detected MaskedBFI may actually only act
3190/// on the low (say) byte of a word. This is then ORed into the rest of the
3191/// word afterwards.
3192///
3193/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)).
3194///
3195/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the
3196/// MaskedBFI. We can also deal with a certain amount of extend/truncate being
3197/// involved.
3198static SDValue tryCombineToLargerBFI(SDNode *N,
3199                                     TargetLowering::DAGCombinerInfo &DCI,
3200                                     const AArch64Subtarget *Subtarget) {
3201  SelectionDAG &DAG = DCI.DAG;
3202  SDLoc DL(N);
3203  EVT VT = N->getValueType(0);
3204
3205  // First job is to hunt for a MaskedBFI on either the left or right. Swap
3206  // operands if it's actually on the right.
3207  SDValue BFI;
3208  SDValue PossExtraMask;
3209  uint64_t ExistingMask = 0;
3210  bool Extended = false;
3211  if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended))
3212    PossExtraMask = N->getOperand(1);
3213  else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended))
3214    PossExtraMask = N->getOperand(0);
3215  else
3216    return SDValue();
3217
3218  // We can only combine a BFI with another compatible mask.
3219  if (PossExtraMask.getOpcode() != ISD::AND ||
3220      !isa<ConstantSDNode>(PossExtraMask.getOperand(1)))
3221    return SDValue();
3222
3223  uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1);
3224
3225  // Masks must be compatible.
3226  if (ExtraMask & ExistingMask)
3227    return SDValue();
3228
3229  SDValue OldBFIVal = BFI.getOperand(0);
3230  SDValue NewBFIVal = BFI.getOperand(1);
3231  if (Extended) {
3232    // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be
3233    // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments
3234    // need to be made compatible.
3235    assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32
3236           && "Invalid types for BFI");
3237    OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal);
3238    NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal);
3239  }
3240
3241  // We need the MaskedBFI to be combined with a mask of the *same* value.
3242  if (PossExtraMask.getOperand(0) != OldBFIVal)
3243    return SDValue();
3244
3245  BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
3246                    OldBFIVal, NewBFIVal,
3247                    BFI.getOperand(2), BFI.getOperand(3));
3248
3249  // If the masking is trivial, we don't need to create it.
3250  if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits())))
3251    return BFI;
3252
3253  return DAG.getNode(ISD::AND, DL, VT, BFI,
3254                     DAG.getConstant(ExtraMask | ExistingMask, VT));
3255}
3256
3257/// An EXTR instruction is made up of two shifts, ORed together. This helper
3258/// searches for and classifies those shifts.
3259static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
3260                         bool &FromHi) {
3261  if (N.getOpcode() == ISD::SHL)
3262    FromHi = false;
3263  else if (N.getOpcode() == ISD::SRL)
3264    FromHi = true;
3265  else
3266    return false;
3267
3268  if (!isa<ConstantSDNode>(N.getOperand(1)))
3269    return false;
3270
3271  ShiftAmount = N->getConstantOperandVal(1);
3272  Src = N->getOperand(0);
3273  return true;
3274}
3275
3276/// EXTR instruction extracts a contiguous chunk of bits from two existing
3277/// registers viewed as a high/low pair. This function looks for the pattern:
3278/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
3279/// EXTR. Can't quite be done in TableGen because the two immediates aren't
3280/// independent.
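/// For example, the i32 rotate (or (shl X, #24), (srl X, #8)) becomes
/// (EXTR X, X, #8), i.e. ROR X, #8.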
3281static SDValue tryCombineToEXTR(SDNode *N,
3282                                TargetLowering::DAGCombinerInfo &DCI) {
3283  SelectionDAG &DAG = DCI.DAG;
3284  SDLoc DL(N);
3285  EVT VT = N->getValueType(0);
3286
3287  assert(N->getOpcode() == ISD::OR && "Unexpected root");
3288
3289  if (VT != MVT::i32 && VT != MVT::i64)
3290    return SDValue();
3291
3292  SDValue LHS;
3293  uint32_t ShiftLHS = 0;
3294  bool LHSFromHi = false;
3295  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
3296    return SDValue();
3297
3298  SDValue RHS;
3299  uint32_t ShiftRHS = 0;
3300  bool RHSFromHi = false;
3301  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
3302    return SDValue();
3303
3304  // If they're both trying to come from the high part of the register, they're
3305  // not really an EXTR.
3306  if (LHSFromHi == RHSFromHi)
3307    return SDValue();
3308
3309  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
3310    return SDValue();
3311
3312  if (LHSFromHi) {
3313    std::swap(LHS, RHS);
3314    std::swap(ShiftLHS, ShiftRHS);
3315  }
3316
3317  return DAG.getNode(AArch64ISD::EXTR, DL, VT,
3318                     LHS, RHS,
3319                     DAG.getConstant(ShiftRHS, MVT::i64));
3320}
3321
3322/// Target-specific dag combine xforms for ISD::OR
3323static SDValue PerformORCombine(SDNode *N,
3324                                TargetLowering::DAGCombinerInfo &DCI,
3325                                const AArch64Subtarget *Subtarget) {
3326
3327  SelectionDAG &DAG = DCI.DAG;
3328  SDLoc DL(N);
3329  EVT VT = N->getValueType(0);
3330
3331  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3332    return SDValue();
3333
3334  // Attempt to recognise bitfield-insert operations.
3335  SDValue Res = tryCombineToBFI(N, DCI, Subtarget);
3336  if (Res.getNode())
3337    return Res;
3338
3339  // Attempt to combine an existing MaskedBFI operation into one with a larger
3340  // mask.
3341  Res = tryCombineToLargerBFI(N, DCI, Subtarget);
3342  if (Res.getNode())
3343    return Res;
3344
3345  Res = tryCombineToEXTR(N, DCI);
3346  if (Res.getNode())
3347    return Res;
3348
3349  if (!Subtarget->hasNEON())
3350    return SDValue();
3351
3352  // Attempt to use vector immediate-form BSL
3353  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
3354
3355  SDValue N0 = N->getOperand(0);
3356  if (N0.getOpcode() != ISD::AND)
3357    return SDValue();
3358
3359  SDValue N1 = N->getOperand(1);
3360  if (N1.getOpcode() != ISD::AND)
3361    return SDValue();
3362
3363  if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
3364    APInt SplatUndef;
3365    unsigned SplatBitSize;
3366    bool HasAnyUndefs;
3367    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
3368    APInt SplatBits0;
3369    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
3370                                      HasAnyUndefs) &&
3371        !HasAnyUndefs) {
3372      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
3373      APInt SplatBits1;
3374      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
3375                                        HasAnyUndefs) &&
3376          !HasAnyUndefs && SplatBits0 == ~SplatBits1) {
3377        // Canonicalize the vector type to make instruction selection simpler.
3378        EVT CanonicalVT = VT.is128BitVector() ? MVT::v16i8 : MVT::v8i8;
3379        SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT,
3380                                     N0->getOperand(1), N0->getOperand(0),
3381                                     N1->getOperand(0));
3382        return DAG.getNode(ISD::BITCAST, DL, VT, Result);
3383      }
3384    }
3385  }
3386
3387  return SDValue();
3388}
3389
3390/// Target-specific dag combine xforms for ISD::SRA
3391static SDValue PerformSRACombine(SDNode *N,
3392                                 TargetLowering::DAGCombinerInfo &DCI) {
3393
3394  SelectionDAG &DAG = DCI.DAG;
3395  SDLoc DL(N);
3396  EVT VT = N->getValueType(0);
3397
3398  // We're looking for an SRA/SHL pair which forms an SBFX.
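  // For example, ((i32)(X << 24)) >> 28 (arithmetic) reaches here as
  // (sra (shl X, #24), #28); Width = 32 - 28 = 4 and LSB = 32 - 4 - 24 = 4,
  // so it becomes SBFX(X, #4, #7): a signed extract of bits [7:4].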
3399
3400  if (VT != MVT::i32 && VT != MVT::i64)
3401    return SDValue();
3402
3403  if (!isa<ConstantSDNode>(N->getOperand(1)))
3404    return SDValue();
3405
3406  uint64_t ExtraSignBits = N->getConstantOperandVal(1);
3407  SDValue Shift = N->getOperand(0);
3408
3409  if (Shift.getOpcode() != ISD::SHL)
3410    return SDValue();
3411
3412  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
3413    return SDValue();
3414
3415  uint64_t BitsOnLeft = Shift->getConstantOperandVal(1);
3416  uint64_t Width = VT.getSizeInBits() - ExtraSignBits;
3417  uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft;
3418
3419  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
3420    return SDValue();
3421
3422  return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0),
3423                     DAG.getConstant(LSB, MVT::i64),
3424                     DAG.getConstant(LSB + Width - 1, MVT::i64));
3425}
3426
3427/// Check if this is a valid build_vector for the immediate operand of
3428/// a vector shift operation, where all the elements of the build_vector
3429/// must have the same constant integer value.
3430static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
3431  // Ignore bit_converts.
3432  while (Op.getOpcode() == ISD::BITCAST)
3433    Op = Op.getOperand(0);
3434  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
3435  APInt SplatBits, SplatUndef;
3436  unsigned SplatBitSize;
3437  bool HasAnyUndefs;
3438  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
3439                                      HasAnyUndefs, ElementBits) ||
3440      SplatBitSize > ElementBits)
3441    return false;
3442  Cnt = SplatBits.getSExtValue();
3443  return true;
3444}
3445
3446/// Check if this is a valid build_vector for the immediate operand of
3447/// a vector shift left operation.  That value must be in the range:
3448/// 0 <= Value < ElementBits
3449static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) {
3450  assert(VT.isVector() && "vector shift count is not a vector type");
3451  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
3452  if (!getVShiftImm(Op, ElementBits, Cnt))
3453    return false;
3454  return (Cnt >= 0 && Cnt < ElementBits);
3455}
3456
3457/// Check if this is a valid build_vector for the immediate operand of a
3458/// vector shift right operation. The value must be in the range:
3459///   1 <= Value <= ElementBits
3460static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
3461  assert(VT.isVector() && "vector shift count is not a vector type");
3462  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
3463  if (!getVShiftImm(Op, ElementBits, Cnt))
3464    return false;
3465  return (Cnt >= 1 && Cnt <= ElementBits);
3466}
3467
3468/// Checks for immediate versions of vector shifts and lowers them.
3469static SDValue PerformShiftCombine(SDNode *N,
3470                                   TargetLowering::DAGCombinerInfo &DCI,
3471                                   const AArch64Subtarget *ST) {
3472  SelectionDAG &DAG = DCI.DAG;
3473  EVT VT = N->getValueType(0);
3474  if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
3475    return PerformSRACombine(N, DCI);
3476
3477  // Nothing to be done for scalar shifts.
3478  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3479  if (!VT.isVector() || !TLI.isTypeLegal(VT))
3480    return SDValue();
3481
3482  assert(ST->hasNEON() && "unexpected vector shift");
3483  int64_t Cnt;
3484
3485  switch (N->getOpcode()) {
3486  default:
3487    llvm_unreachable("unexpected shift opcode");
3488
3489  case ISD::SHL:
3490    if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
3491      SDValue RHS =
3492          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
3493                      DAG.getConstant(Cnt, MVT::i32));
3494      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
3495    }
3496    break;
3497
3498  case ISD::SRA:
3499  case ISD::SRL:
3500    if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
3501      SDValue RHS =
3502          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
3503                      DAG.getConstant(Cnt, MVT::i32));
3504      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
3505    }
3506    break;
3507  }
3508
3509  return SDValue();
3510}
3511
3512/// AArch64-specific DAG combining for intrinsics.
3513static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
3514  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3515
3516  switch (IntNo) {
3517  default:
3518    // Don't do anything for most intrinsics.
3519    break;
3520
3521  case Intrinsic::arm_neon_vqshifts:
3522  case Intrinsic::arm_neon_vqshiftu:
3523    EVT VT = N->getOperand(1).getValueType();
3524    int64_t Cnt;
3525    if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
3526      break;
3527    unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
3528                             ? AArch64ISD::NEON_QSHLs
3529                             : AArch64ISD::NEON_QSHLu;
3530    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
3531                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
3532  }
3533
3534  return SDValue();
3535}
3536
3537/// Target-specific DAG combine function for NEON load/store intrinsics
3538/// to merge base address updates.
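/// For example, an arm_neon_vld1 of a v4i32 whose address is also used by an
/// (add addr, 16) is rewritten as a single NEON_LD1_UPD node that both loads
/// the vector and produces the post-incremented address.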
3539static SDValue CombineBaseUpdate(SDNode *N,
3540                                 TargetLowering::DAGCombinerInfo &DCI) {
3541  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
3542    return SDValue();
3543
3544  SelectionDAG &DAG = DCI.DAG;
3545  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
3546                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
3547  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
3548  SDValue Addr = N->getOperand(AddrOpIdx);
3549
3550  // Search for a use of the address operand that is an increment.
3551  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
3552       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
3553    SDNode *User = *UI;
3554    if (User->getOpcode() != ISD::ADD ||
3555        UI.getUse().getResNo() != Addr.getResNo())
3556      continue;
3557
3558    // Check that the add is independent of the load/store.  Otherwise, folding
3559    // it would create a cycle.
3560    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
3561      continue;
3562
3563    // Find the new opcode for the updating load/store.
3564    bool isLoad = true;
3565    bool isLaneOp = false;
3566    unsigned NewOpc = 0;
3567    unsigned NumVecs = 0;
3568    if (isIntrinsic) {
3569      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
3570      switch (IntNo) {
3571      default: llvm_unreachable("unexpected intrinsic for Neon base update");
3572      case Intrinsic::arm_neon_vld1:       NewOpc = AArch64ISD::NEON_LD1_UPD;
3573        NumVecs = 1; break;
3574      case Intrinsic::arm_neon_vld2:       NewOpc = AArch64ISD::NEON_LD2_UPD;
3575        NumVecs = 2; break;
3576      case Intrinsic::arm_neon_vld3:       NewOpc = AArch64ISD::NEON_LD3_UPD;
3577        NumVecs = 3; break;
3578      case Intrinsic::arm_neon_vld4:       NewOpc = AArch64ISD::NEON_LD4_UPD;
3579        NumVecs = 4; break;
3580      case Intrinsic::arm_neon_vst1:       NewOpc = AArch64ISD::NEON_ST1_UPD;
3581        NumVecs = 1; isLoad = false; break;
3582      case Intrinsic::arm_neon_vst2:       NewOpc = AArch64ISD::NEON_ST2_UPD;
3583        NumVecs = 2; isLoad = false; break;
3584      case Intrinsic::arm_neon_vst3:       NewOpc = AArch64ISD::NEON_ST3_UPD;
3585        NumVecs = 3; isLoad = false; break;
3586      case Intrinsic::arm_neon_vst4:       NewOpc = AArch64ISD::NEON_ST4_UPD;
3587        NumVecs = 4; isLoad = false; break;
3588      case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
3589        NumVecs = 2; break;
3590      case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
3591        NumVecs = 3; break;
3592      case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
3593        NumVecs = 4; break;
3594      case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
3595        NumVecs = 2; isLoad = false; break;
3596      case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
3597        NumVecs = 3; isLoad = false; break;
3598      case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
3599        NumVecs = 4; isLoad = false; break;
3600      case Intrinsic::arm_neon_vld2lane:   NewOpc = AArch64ISD::NEON_LD2LN_UPD;
3601        NumVecs = 2; isLaneOp = true; break;
3602      case Intrinsic::arm_neon_vld3lane:   NewOpc = AArch64ISD::NEON_LD3LN_UPD;
3603        NumVecs = 3; isLaneOp = true; break;
3604      case Intrinsic::arm_neon_vld4lane:   NewOpc = AArch64ISD::NEON_LD4LN_UPD;
3605        NumVecs = 4; isLaneOp = true; break;
3606      case Intrinsic::arm_neon_vst2lane:   NewOpc = AArch64ISD::NEON_ST2LN_UPD;
3607        NumVecs = 2; isLoad = false; isLaneOp = true; break;
3608      case Intrinsic::arm_neon_vst3lane:   NewOpc = AArch64ISD::NEON_ST3LN_UPD;
3609        NumVecs = 3; isLoad = false; isLaneOp = true; break;
3610      case Intrinsic::arm_neon_vst4lane:   NewOpc = AArch64ISD::NEON_ST4LN_UPD;
3611        NumVecs = 4; isLoad = false; isLaneOp = true; break;
3612      }
3613    } else {
3614      isLaneOp = true;
3615      switch (N->getOpcode()) {
3616      default: llvm_unreachable("unexpected opcode for Neon base update");
3617      case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
3618        NumVecs = 2; break;
3619      case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
3620        NumVecs = 3; break;
3621      case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
3622        NumVecs = 4; break;
3623      }
3624    }
3625
3626    // Find the size of memory referenced by the load/store.
3627    EVT VecTy;
3628    if (isLoad)
3629      VecTy = N->getValueType(0);
3630    else
3631      VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
3632    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
3633    if (isLaneOp)
3634      NumBytes /= VecTy.getVectorNumElements();
3635
3636    // If the increment is a constant, it must match the memory ref size.
3637    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
3638    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
3639      uint32_t IncVal = CInc->getZExtValue();
3640      if (IncVal != NumBytes)
3641        continue;
3642      Inc = DAG.getTargetConstant(IncVal, MVT::i32);
3643    }
3644
3645    // Create the new updating load/store node.
3646    EVT Tys[6];
3647    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
3648    unsigned n;
3649    for (n = 0; n < NumResultVecs; ++n)
3650      Tys[n] = VecTy;
3651    Tys[n++] = MVT::i64;
3652    Tys[n] = MVT::Other;
3653    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2);
3654    SmallVector<SDValue, 8> Ops;
3655    Ops.push_back(N->getOperand(0)); // incoming chain
3656    Ops.push_back(N->getOperand(AddrOpIdx));
3657    Ops.push_back(Inc);
3658    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
3659      Ops.push_back(N->getOperand(i));
3660    }
3661    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
3662    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
3663                                           Ops.data(), Ops.size(),
3664                                           MemInt->getMemoryVT(),
3665                                           MemInt->getMemOperand());
3666
3667    // Update the uses.
3668    std::vector<SDValue> NewResults;
3669    for (unsigned i = 0; i < NumResultVecs; ++i) {
3670      NewResults.push_back(SDValue(UpdN.getNode(), i));
3671    }
3672    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
3673    DCI.CombineTo(N, NewResults);
3674    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
3675
3676    break;
3677  }
3678  return SDValue();
3679}
3680
3681/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
3682/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
3683/// If so, combine them to a vldN-dup operation and return true.
3684static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
3685  SelectionDAG &DAG = DCI.DAG;
3686  EVT VT = N->getValueType(0);
3687
3688  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
3689  SDNode *VLD = N->getOperand(0).getNode();
3690  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
3691    return SDValue();
3692  unsigned NumVecs = 0;
3693  unsigned NewOpc = 0;
3694  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
3695  if (IntNo == Intrinsic::arm_neon_vld2lane) {
3696    NumVecs = 2;
3697    NewOpc = AArch64ISD::NEON_LD2DUP;
3698  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
3699    NumVecs = 3;
3700    NewOpc = AArch64ISD::NEON_LD3DUP;
3701  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
3702    NumVecs = 4;
3703    NewOpc = AArch64ISD::NEON_LD4DUP;
3704  } else {
3705    return SDValue();
3706  }
3707
3708  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
3709  // numbers match the load.
3710  unsigned VLDLaneNo =
3711      cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
3712  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
3713       UI != UE; ++UI) {
3714    // Ignore uses of the chain result.
3715    if (UI.getUse().getResNo() == NumVecs)
3716      continue;
3717    SDNode *User = *UI;
3718    if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
3719        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
3720      return SDValue();
3721  }
3722
3723  // Create the vldN-dup node.
3724  EVT Tys[5];
3725  unsigned n;
3726  for (n = 0; n < NumVecs; ++n)
3727    Tys[n] = VT;
3728  Tys[n] = MVT::Other;
3729  SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
3730  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
3731  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
3732  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
3733                                           VLDMemInt->getMemoryVT(),
3734                                           VLDMemInt->getMemOperand());
3735
3736  // Update the uses.
3737  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
3738       UI != UE; ++UI) {
3739    unsigned ResNo = UI.getUse().getResNo();
3740    // Ignore uses of the chain result.
3741    if (ResNo == NumVecs)
3742      continue;
3743    SDNode *User = *UI;
3744    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
3745  }
3746
3747  // Now the vldN-lane intrinsic is dead except for its chain result.
3748  // Update uses of the chain.
3749  std::vector<SDValue> VLDDupResults;
3750  for (unsigned n = 0; n < NumVecs; ++n)
3751    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
3752  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
3753  DCI.CombineTo(VLD, VLDDupResults);
3754
3755  return SDValue(N, 0);
3756}
3757
3758SDValue
3759AArch64TargetLowering::PerformDAGCombine(SDNode *N,
3760                                         DAGCombinerInfo &DCI) const {
3761  switch (N->getOpcode()) {
3762  default: break;
3763  case ISD::AND: return PerformANDCombine(N, DCI);
3764  case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
3765  case ISD::SHL:
3766  case ISD::SRA:
3767  case ISD::SRL:
3768    return PerformShiftCombine(N, DCI, getSubtarget());
3769  case ISD::INTRINSIC_WO_CHAIN:
3770    return PerformIntrinsicCombine(N, DCI.DAG);
3771  case AArch64ISD::NEON_VDUPLANE:
3772    return CombineVLDDUP(N, DCI);
3773  case AArch64ISD::NEON_LD2DUP:
3774  case AArch64ISD::NEON_LD3DUP:
3775  case AArch64ISD::NEON_LD4DUP:
3776    return CombineBaseUpdate(N, DCI);
3777  case ISD::INTRINSIC_VOID:
3778  case ISD::INTRINSIC_W_CHAIN:
3779    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
3780    case Intrinsic::arm_neon_vld1:
3781    case Intrinsic::arm_neon_vld2:
3782    case Intrinsic::arm_neon_vld3:
3783    case Intrinsic::arm_neon_vld4:
3784    case Intrinsic::arm_neon_vst1:
3785    case Intrinsic::arm_neon_vst2:
3786    case Intrinsic::arm_neon_vst3:
3787    case Intrinsic::arm_neon_vst4:
3788    case Intrinsic::arm_neon_vld2lane:
3789    case Intrinsic::arm_neon_vld3lane:
3790    case Intrinsic::arm_neon_vld4lane:
3791    case Intrinsic::aarch64_neon_vld1x2:
3792    case Intrinsic::aarch64_neon_vld1x3:
3793    case Intrinsic::aarch64_neon_vld1x4:
3794    case Intrinsic::aarch64_neon_vst1x2:
3795    case Intrinsic::aarch64_neon_vst1x3:
3796    case Intrinsic::aarch64_neon_vst1x4:
3797    case Intrinsic::arm_neon_vst2lane:
3798    case Intrinsic::arm_neon_vst3lane:
3799    case Intrinsic::arm_neon_vst4lane:
3800      return CombineBaseUpdate(N, DCI);
3801    default:
3802      break;
3803    }
3804  }
3805  return SDValue();
3806}
3807
3808bool
3809AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3810  VT = VT.getScalarType();
3811
3812  if (!VT.isSimple())
3813    return false;
3814
3815  switch (VT.getSimpleVT().SimpleTy) {
3816  case MVT::f16:
3817  case MVT::f32:
3818  case MVT::f64:
3819    return true;
3820  case MVT::f128:
3821    return false;
3822  default:
3823    break;
3824  }
3825
3826  return false;
3827}
3828
3829// If this is a case we can't handle, return null and let the default
3830// expansion code take care of it.
3831SDValue
3832AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
3833                                         const AArch64Subtarget *ST) const {
3834
3835  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
3836  SDLoc DL(Op);
3837  EVT VT = Op.getValueType();
3838
3839  APInt SplatBits, SplatUndef;
3840  unsigned SplatBitSize;
3841  bool HasAnyUndefs;
3842
3843  bool UseNeonMov = VT.getSizeInBits() >= 64;
3844
3845  // Note we favor lowering MOVI over MVNI.
3846  // This has implications on the definition of patterns in TableGen to select
3847  // BIC immediate instructions but not ORR immediate instructions.
3848  // If this lowering order is changed, TableGen patterns for BIC immediate and
3849  // ORR immediate instructions have to be updated.
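  // For example, a v4i32 splat of 0xffffff00 has no MOVI encoding, but its
  // complement 0x000000ff does, so the MVNI attempt below succeeds and the
  // splat is materialized as a NEON_MVNIMM (MVNI Vd.4S, #0xff).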
3850  if (UseNeonMov &&
3851      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
3852    if (SplatBitSize <= 64) {
3853      // First attempt to use vector immediate-form MOVI
3854      EVT NeonMovVT;
3855      unsigned Imm = 0;
3856      unsigned OpCmode = 0;
3857
3858      if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
3859                            SplatBitSize, DAG, VT.is128BitVector(),
3860                            Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) {
3861        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
3862        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
3863
3864        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
3865          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT,
3866                                        ImmVal, OpCmodeVal);
3867          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
3868        }
3869      }
3870
3871      // Then attempt to use vector immediate-form MVNI
3872      uint64_t NegatedImm = (~SplatBits).getZExtValue();
3873      if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
3874                            DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT,
3875                            Imm, OpCmode)) {
3876        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
3877        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
3878        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
3879          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT,
3880                                        ImmVal, OpCmodeVal);
3881          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
3882        }
3883      }
3884
3885      // Attempt to use vector immediate-form FMOV
3886      if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) ||
3887          (VT == MVT::v2f64 && SplatBitSize == 64)) {
3888        APFloat RealVal(
3889            SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble,
3890            SplatBits);
3891        uint32_t ImmVal;
3892        if (A64Imms::isFPImm(RealVal, ImmVal)) {
3893          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
3894          return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val);
3895        }
3896      }
3897    }
3898  }
3899
3900  unsigned NumElts = VT.getVectorNumElements();
3901  bool isOnlyLowElement = true;
3902  bool usesOnlyOneValue = true;
3903  bool hasDominantValue = false;
3904  bool isConstant = true;
3905
3906  // Map of the number of times a particular SDValue appears in the
3907  // element list.
3908  DenseMap<SDValue, unsigned> ValueCounts;
3909  SDValue Value;
3910  for (unsigned i = 0; i < NumElts; ++i) {
3911    SDValue V = Op.getOperand(i);
3912    if (V.getOpcode() == ISD::UNDEF)
3913      continue;
3914    if (i > 0)
3915      isOnlyLowElement = false;
3916    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
3917      isConstant = false;
3918
3919    ValueCounts.insert(std::make_pair(V, 0));
3920    unsigned &Count = ValueCounts[V];
3921
3922    // Is this value dominant? (takes up more than half of the lanes)
3923    if (++Count > (NumElts / 2)) {
3924      hasDominantValue = true;
3925      Value = V;
3926    }
3927  }
3928  if (ValueCounts.size() != 1)
3929    usesOnlyOneValue = false;
3930  if (!Value.getNode() && ValueCounts.size() > 0)
3931    Value = ValueCounts.begin()->first;
3932
3933  if (ValueCounts.size() == 0)
3934    return DAG.getUNDEF(VT);
3935
3936  // Loads are better lowered with insert_vector_elt.
3937  // Keep going if we are hitting this case.
3938  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
3939    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
3940
3941  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
3942  // Use VDUP for non-constant splats.
3943  if (hasDominantValue && EltSize <= 64) {
3944    if (!isConstant) {
3945      SDValue N;
3946
3947      // If we are DUPing a value that comes directly from a vector, we could
3948      // just use DUPLANE. We can only do this if the lane being extracted
3949      // is at a constant index, as the DUP from lane instructions only have
3950      // constant-index forms.
3951      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
3952          isa<ConstantSDNode>(Value->getOperand(1))) {
3953        N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
3954                        Value->getOperand(0), Value->getOperand(1));
3955      } else
3956        N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
3957
3958      if (!usesOnlyOneValue) {
3959        // The dominant value was splatted as 'N', but we now have to insert
3960        // all differing elements.
3961        for (unsigned I = 0; I < NumElts; ++I) {
3962          if (Op.getOperand(I) == Value)
3963            continue;
3964          SmallVector<SDValue, 3> Ops;
3965          Ops.push_back(N);
3966          Ops.push_back(Op.getOperand(I));
3967          Ops.push_back(DAG.getConstant(I, MVT::i32));
3968          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
3969        }
3970      }
3971      return N;
3972    }
3973    if (usesOnlyOneValue && isConstant) {
3974      return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
3975    }
3976  }
3977  // If all elements are constants and the case above didn't get hit, fall back
3978  // to the default expansion, which will generate a load from the constant
3979  // pool.
3980  if (isConstant)
3981    return SDValue();
3982
3983  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
3984  // know the default expansion would otherwise fall back on something even
3985  // worse. For a vector with one or two non-undef values, that's
3986  // scalar_to_vector for the elements followed by a shuffle (provided the
3987  // shuffle is valid for the target) and materialization element by element
3988  // on the stack followed by a load for everything else.
3989  if (!isConstant && !usesOnlyOneValue) {
3990    SDValue Vec = DAG.getUNDEF(VT);
3991    for (unsigned i = 0 ; i < NumElts; ++i) {
3992      SDValue V = Op.getOperand(i);
3993      if (V.getOpcode() == ISD::UNDEF)
3994        continue;
3995      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
3996      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
3997    }
3998    return Vec;
3999  }
4000  return SDValue();
4001}
4002
4003/// isREVMask - Check if a vector shuffle corresponds to a REV
4004/// instruction with the specified blocksize.  (The order of the elements
4005/// within each block of the vector is reversed.)
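/// For example, on v8i16 the mask <1, 0, 3, 2, 5, 4, 7, 6> swaps adjacent
/// halfwords within each 32-bit block, so it matches REV32 (BlockSize == 32).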
4006static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
4007  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
4008         "Only possible block sizes for REV are: 16, 32, 64");
4009
4010  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4011  if (EltSz == 64)
4012    return false;
4013
4014  unsigned NumElts = VT.getVectorNumElements();
4015  unsigned BlockElts = M[0] + 1;
4016  // If the first shuffle index is UNDEF, be optimistic.
4017  if (M[0] < 0)
4018    BlockElts = BlockSize / EltSz;
4019
4020  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
4021    return false;
4022
4023  for (unsigned i = 0; i < NumElts; ++i) {
4024    if (M[i] < 0)
4025      continue; // ignore UNDEF indices
4026    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
4027      return false;
4028  }
4029
4030  return true;
4031}
4032
4033SDValue
4034AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
4035                                           SelectionDAG &DAG) const {
4036  SDValue V1 = Op.getOperand(0);
4037  SDValue V2 = Op.getOperand(1);
4038  SDLoc dl(Op);
4039  EVT VT = Op.getValueType();
4040  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
4041
4042  // Convert shuffles that are directly supported on NEON to target-specific
4043  // DAG nodes, instead of keeping them as shuffles and matching them again
4044  // during code selection.  This is more efficient and avoids the possibility
4045  // of inconsistencies between legalization and selection.
4046  ArrayRef<int> ShuffleMask = SVN->getMask();
4047
4048  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4049  if (EltSize > 64)
4050    return SDValue();
4051
4052  if (isREVMask(ShuffleMask, VT, 64))
4053    return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1);
4054  if (isREVMask(ShuffleMask, VT, 32))
4055    return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1);
4056  if (isREVMask(ShuffleMask, VT, 16))
4057    return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
4058
4059  // If the elements of the shuffle mask are all the same constant, we can
4060  // transform it into either NEON_VDUP or NEON_VDUPLANE.
4061  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
4062    int Lane = SVN->getSplatIndex();
4063    // If this is undef splat, generate it via "just" vdup, if possible.
4064    if (Lane == -1) Lane = 0;
4065
4066    // Test if V1 is a SCALAR_TO_VECTOR.
4067    if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4068      return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
4069    }
4070    // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
4071    if (V1.getOpcode() == ISD::BUILD_VECTOR) {
4072      bool IsScalarToVector = true;
4073      for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
4074        if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
4075            i != (unsigned)Lane) {
4076          IsScalarToVector = false;
4077          break;
4078        }
4079      if (IsScalarToVector)
4080        return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
4081                           V1.getOperand(Lane));
4082    }
4083    return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
4084                       DAG.getConstant(Lane, MVT::i64));
4085  }
4086
4087  int Length = ShuffleMask.size();
4088  int V1EltNum = V1.getValueType().getVectorNumElements();
4089
4090  // If the number of V1 elements is the same as the number of shuffle mask
4091  // elements and the mask values are sequential, we can transform the shuffle
4092  // into NEON_VEXTRACT.
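      // For example, shuffling two v4i16 vectors with the mask <1, 2, 3, 4>
      // becomes a NEON_VEXTRACT with a byte offset of 2.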
4093  if (V1EltNum == Length) {
4094    // Check if the shuffle mask is sequential.
4095    bool IsSequential = true;
4096    int CurMask = ShuffleMask[0];
4097    for (int I = 0; I < Length; ++I) {
4098      if (ShuffleMask[I] != CurMask) {
4099        IsSequential = false;
4100        break;
4101      }
4102      CurMask++;
4103    }
4104    if (IsSequential) {
4105      assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
4106      unsigned VecSize = EltSize * V1EltNum;
4107      unsigned Index = (EltSize/8) * ShuffleMask[0];
4108      if (VecSize == 64 || VecSize == 128)
4109        return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
4110                           DAG.getConstant(Index, MVT::i64));
4111    }
4112  }
4113
4114  // For a shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate the
4115  // result by inserting elements from V2 into V1.
4116  // If the shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 is the
4117  // better insertion target than V1 because fewer inserts are needed, so we
4118  // count the elements that would have to be inserted into each of V1 and V2
4119  // and pick whichever needs fewer.
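      // For the second mask above, only lanes 0 and 1 differ from a plain copy
      // of V2, while six lanes differ from V1, so V2 is chosen and just two
      // elements are extracted from V1 and inserted into it.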
4120
4121  // Collect the elements that need to be inserted and their indices.
4122  SmallVector<int, 8> NV1Elt;
4123  SmallVector<int, 8> N1Index;
4124  SmallVector<int, 8> NV2Elt;
4125  SmallVector<int, 8> N2Index;
4126  for (int I = 0; I != Length; ++I) {
4127    if (ShuffleMask[I] != I) {
4128      NV1Elt.push_back(ShuffleMask[I]);
4129      N1Index.push_back(I);
4130    }
4131  }
4132  for (int I = 0; I != Length; ++I) {
4133    if (ShuffleMask[I] != (I + V1EltNum)) {
4134      NV2Elt.push_back(ShuffleMask[I]);
4135      N2Index.push_back(I);
4136    }
4137  }
4138
4139  // Decide which vector to use as the insertion base. If every lane mismatches
4140  // both V1 and V2, neither is worth keeping, so insert into UNDEF instead.
4141  SDValue InsV = V1;
4142  SmallVector<int, 8> InsMasks = NV1Elt;
4143  SmallVector<int, 8> InsIndex = N1Index;
4144  if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
4145    if (NV1Elt.size() > NV2Elt.size()) {
4146      InsV = V2;
4147      InsMasks = NV2Elt;
4148      InsIndex = N2Index;
4149    }
4150  } else {
4151    InsV = DAG.getNode(ISD::UNDEF, dl, VT);
4152  }
4153
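      // Build the result lane by lane: extract each required element from its
      // source vector (V1 or V2) and insert it into the base at the target index.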
4154  for (int I = 0, E = InsMasks.size(); I != E; ++I) {
4155    SDValue ExtV = V1;
4156    int Mask = InsMasks[I];
4157    if (Mask >= V1EltNum) {
4158      ExtV = V2;
4159      Mask -= V1EltNum;
4160    }
4161    // Any value type smaller than i32 is illegal on AArch64, and this lowering
4162    // function is called after the legalize pass, so we have to use a legal
4163    // element type for the extracted value here.
4164    EVT EltVT;
4165    if (VT.getVectorElementType().isFloatingPoint())
4166      EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
4167    else
4168      EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
4169
4170    ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
4171                        DAG.getConstant(Mask, MVT::i64));
4172    InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
4173                       DAG.getConstant(InsIndex[I], MVT::i64));
4174  }
4175  return InsV;
4176}
4177
4178AArch64TargetLowering::ConstraintType
4179AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
4180  if (Constraint.size() == 1) {
4181    switch (Constraint[0]) {
4182    default: break;
4183    case 'w': // An FP/SIMD vector register
4184      return C_RegisterClass;
4185    case 'I': // Constant that can be used with an ADD instruction
4186    case 'J': // Constant that can be used with a SUB instruction
4187    case 'K': // Constant that can be used with a 32-bit logical instruction
4188    case 'L': // Constant that can be used with a 64-bit logical instruction
4189    case 'M': // Constant that can be used as a 32-bit MOV immediate
4190    case 'N': // Constant that can be used as a 64-bit MOV immediate
4191    case 'Y': // Floating point constant zero
4192    case 'Z': // Integer constant zero
4193      return C_Other;
4194    case 'Q': // A memory reference with base register and no offset
4195      return C_Memory;
4196    case 'S': // A symbolic address
4197      return C_Other;
4198    }
4199  }
4200
4201  // FIXME: Ump, Utf, Usa, Ush
4202  // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes,
4203  //      whatever they may be
4204  // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be
4205  // Usa: An absolute symbolic address
4206  // Ush: The high part (bits 32:12) of a pc-relative symbolic address
4207  assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa"
4208         && Constraint != "Ush" && "Unimplemented constraints");
4209
4210  return TargetLowering::getConstraintType(Constraint);
4211}
4212
4213TargetLowering::ConstraintWeight
4214AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info,
4215                                                const char *Constraint) const {
4216
4217  llvm_unreachable("Constraint weight unimplemented");
4218}
4219
4220void
4221AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
4222                                                    std::string &Constraint,
4223                                                    std::vector<SDValue> &Ops,
4224                                                    SelectionDAG &DAG) const {
4225  SDValue Result(0, 0);
4226
4227  // Only length 1 constraints are C_Other.
4228  if (Constraint.size() != 1) return;
4229
4230  // Only C_Other constraints get lowered like this. That means constants for us,
4231  // so return early if there's no hope the constraint can be lowered.
4232
4233  switch(Constraint[0]) {
4234  default: break;
4235  case 'I': case 'J': case 'K': case 'L':
4236  case 'M': case 'N': case 'Z': {
4237    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
4238    if (!C)
4239      return;
4240
4241    uint64_t CVal = C->getZExtValue();
4242    uint32_t Bits;
4243
4244    switch (Constraint[0]) {
4245    default:
4246      // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J'
4247      // is a peculiarly useless SUB constraint.
4248      llvm_unreachable("Unimplemented C_Other constraint");
4249    case 'I':
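          // 'I' is a constant usable with an ADD instruction; only the unshifted
          // 12-bit range (0..0xfff) is accepted here.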
4250      if (CVal <= 0xfff)
4251        break;
4252      return;
4253    case 'K':
4254      if (A64Imms::isLogicalImm(32, CVal, Bits))
4255        break;
4256      return;
4257    case 'L':
4258      if (A64Imms::isLogicalImm(64, CVal, Bits))
4259        break;
4260      return;
4261    case 'Z':
4262      if (CVal == 0)
4263        break;
4264      return;
4265    }
4266
4267    Result = DAG.getTargetConstant(CVal, Op.getValueType());
4268    break;
4269  }
4270  case 'S': {
4271    // An absolute symbolic address or label reference.
4272    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
4273      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
4274                                          GA->getValueType(0));
4275    } else if (const BlockAddressSDNode *BA
4276                 = dyn_cast<BlockAddressSDNode>(Op)) {
4277      Result = DAG.getTargetBlockAddress(BA->getBlockAddress(),
4278                                         BA->getValueType(0));
4279    } else if (const ExternalSymbolSDNode *ES
4280                 = dyn_cast<ExternalSymbolSDNode>(Op)) {
4281      Result = DAG.getTargetExternalSymbol(ES->getSymbol(),
4282                                           ES->getValueType(0));
4283    } else
4284      return;
4285    break;
4286  }
4287  case 'Y':
4288    if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
4289      if (CFP->isExactlyValue(0.0)) {
4290        Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0));
4291        break;
4292      }
4293    }
4294    return;
4295  }
4296
4297  if (Result.getNode()) {
4298    Ops.push_back(Result);
4299    return;
4300  }
4301
4302  // It's an unknown constraint for us. Let generic code have a go.
4303  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
4304}
4305
4306std::pair<unsigned, const TargetRegisterClass*>
4307AArch64TargetLowering::getRegForInlineAsmConstraint(
4308                                                  const std::string &Constraint,
4309                                                  MVT VT) const {
4310  if (Constraint.size() == 1) {
4311    switch (Constraint[0]) {
4312    case 'r':
4313      if (VT.getSizeInBits() <= 32)
4314        return std::make_pair(0U, &AArch64::GPR32RegClass);
4315      else if (VT == MVT::i64)
4316        return std::make_pair(0U, &AArch64::GPR64RegClass);
4317      break;
4318    case 'w':
4319      if (VT == MVT::f16)
4320        return std::make_pair(0U, &AArch64::FPR16RegClass);
4321      else if (VT == MVT::f32)
4322        return std::make_pair(0U, &AArch64::FPR32RegClass);
4323      else if (VT.getSizeInBits() == 64)
4324        return std::make_pair(0U, &AArch64::FPR64RegClass);
4325      else if (VT.getSizeInBits() == 128)
4326        return std::make_pair(0U, &AArch64::FPR128RegClass);
4327      break;
4328    }
4329  }
4330
4331  // Use the default implementation in TargetLowering to convert the register
4332  // constraint into a member of a register class.
4333  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
4334}
4335
4336/// Represent NEON load and store intrinsics as MemIntrinsicNodes.
4337/// The associated MachineMemOperands record the alignment specified
4338/// in the intrinsic calls.
4339bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4340                                               const CallInst &I,
4341                                               unsigned Intrinsic) const {
4342  switch (Intrinsic) {
4343  case Intrinsic::arm_neon_vld1:
4344  case Intrinsic::arm_neon_vld2:
4345  case Intrinsic::arm_neon_vld3:
4346  case Intrinsic::arm_neon_vld4:
4347  case Intrinsic::aarch64_neon_vld1x2:
4348  case Intrinsic::aarch64_neon_vld1x3:
4349  case Intrinsic::aarch64_neon_vld1x4:
4350  case Intrinsic::arm_neon_vld2lane:
4351  case Intrinsic::arm_neon_vld3lane:
4352  case Intrinsic::arm_neon_vld4lane: {
4353    Info.opc = ISD::INTRINSIC_W_CHAIN;
4354    // Conservatively set memVT to the entire set of vectors loaded.
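        // The allocation size is in bytes, so dividing by 8 gives the number of
        // 64-bit lanes needed to cover the whole result.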
4355    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
4356    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
4357    Info.ptrVal = I.getArgOperand(0);
4358    Info.offset = 0;
4359    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
4360    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
4361    Info.vol = false; // volatile loads with NEON intrinsics not supported
4362    Info.readMem = true;
4363    Info.writeMem = false;
4364    return true;
4365  }
4366  case Intrinsic::arm_neon_vst1:
4367  case Intrinsic::arm_neon_vst2:
4368  case Intrinsic::arm_neon_vst3:
4369  case Intrinsic::arm_neon_vst4:
4370  case Intrinsic::aarch64_neon_vst1x2:
4371  case Intrinsic::aarch64_neon_vst1x3:
4372  case Intrinsic::aarch64_neon_vst1x4:
4373  case Intrinsic::arm_neon_vst2lane:
4374  case Intrinsic::arm_neon_vst3lane:
4375  case Intrinsic::arm_neon_vst4lane: {
4376    Info.opc = ISD::INTRINSIC_VOID;
4377    // Conservatively set memVT to the entire set of vectors stored.
4378    unsigned NumElts = 0;
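        // Operand 0 is the address and the last operand is the alignment, so only
        // the vector data operands in between contribute to the total size.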
4379    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
4380      Type *ArgTy = I.getArgOperand(ArgI)->getType();
4381      if (!ArgTy->isVectorTy())
4382        break;
4383      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
4384    }
4385    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
4386    Info.ptrVal = I.getArgOperand(0);
4387    Info.offset = 0;
4388    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
4389    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
4390    Info.vol = false; // volatile stores with NEON intrinsics not supported
4391    Info.readMem = false;
4392    Info.writeMem = true;
4393    return true;
4394  }
4395  default:
4396    break;
4397  }
4398
4399  return false;
4400}
4401