AArch64ISelLowering.cpp revision 9584d3222fa54f7419d008c41d49b4b44331c51c
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that AArch64 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "aarch64-isel"
16#include "AArch64.h"
17#include "AArch64ISelLowering.h"
18#include "AArch64MachineFunctionInfo.h"
19#include "AArch64TargetMachine.h"
20#include "AArch64TargetObjectFile.h"
21#include "Utils/AArch64BaseInfo.h"
22#include "llvm/CodeGen/Analysis.h"
23#include "llvm/CodeGen/CallingConvLower.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/CodeGen/MachineInstrBuilder.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
28#include "llvm/IR/CallingConv.h"
29
30using namespace llvm;
31
32static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
33  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
34
35  if (Subtarget->isTargetLinux())
36    return new AArch64LinuxTargetObjectFile();
37  if (Subtarget->isTargetELF())
38    return new TargetLoweringObjectFileELF();
39  llvm_unreachable("unknown subtarget type");
40}
41
42AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
43  : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
44
45  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
46
47  // SIMD compares set the entire lane's bits to 1
48  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
49
50  // Scalar register <-> type mapping
51  addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
52  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
53
54  if (Subtarget->hasFPARMv8()) {
55    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
56    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
57    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
58    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
59  }
60
61  if (Subtarget->hasNEON()) {
62    // And the vectors
63    addRegisterClass(MVT::v1i8,  &AArch64::FPR8RegClass);
64    addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
65    addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
66    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
67    addRegisterClass(MVT::v1f32, &AArch64::FPR32RegClass);
68    addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
69    addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass);
70    addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
71    addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
73    addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
74    addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
75    addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
76    addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
77    addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
78    addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
79    addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
80  }
81
82  computeRegisterProperties();
83
84  // We combine OR nodes for bitfield and NEON BSL operations.
85  setTargetDAGCombine(ISD::OR);
86
87  setTargetDAGCombine(ISD::AND);
88  setTargetDAGCombine(ISD::SRA);
89  setTargetDAGCombine(ISD::SRL);
90  setTargetDAGCombine(ISD::SHL);
91
92  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
93  setTargetDAGCombine(ISD::INTRINSIC_VOID);
94  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
95
96  // AArch64 does not have i1 loads, or much of anything for i1 really.
97  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
98  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
99  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
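  // (Roughly speaking, an i1 load is therefore performed as at least an i8
  // load, with the boolean value recovered from the low bit afterwards.)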
100
101  setStackPointerRegisterToSaveRestore(AArch64::XSP);
102  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
103  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
104  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
105
106  // We'll lower globals to wrappers for selection.
107  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
108  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
109
110  // A64 instructions have the comparison predicate attached to the user of the
111  // result, but having a separate comparison is valuable for matching.
112  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
113  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
114  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
115  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
116
117  setOperationAction(ISD::SELECT, MVT::i32, Custom);
118  setOperationAction(ISD::SELECT, MVT::i64, Custom);
119  setOperationAction(ISD::SELECT, MVT::f32, Custom);
120  setOperationAction(ISD::SELECT, MVT::f64, Custom);
121
122  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
123  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
124  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
125  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
126
127  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
128
129  setOperationAction(ISD::SETCC, MVT::i32, Custom);
130  setOperationAction(ISD::SETCC, MVT::i64, Custom);
131  setOperationAction(ISD::SETCC, MVT::f32, Custom);
132  setOperationAction(ISD::SETCC, MVT::f64, Custom);
133
134  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
135  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
136  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
137
138  setOperationAction(ISD::VASTART, MVT::Other, Custom);
139  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
140  setOperationAction(ISD::VAEND, MVT::Other, Expand);
141  setOperationAction(ISD::VAARG, MVT::Other, Expand);
142
143  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
144
145  setOperationAction(ISD::ROTL, MVT::i32, Expand);
146  setOperationAction(ISD::ROTL, MVT::i64, Expand);
147
148  setOperationAction(ISD::UREM, MVT::i32, Expand);
149  setOperationAction(ISD::UREM, MVT::i64, Expand);
150  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
151  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
152
153  setOperationAction(ISD::SREM, MVT::i32, Expand);
154  setOperationAction(ISD::SREM, MVT::i64, Expand);
155  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
156  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
157
158  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
159  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
160
161  // Legal floating-point operations.
162  setOperationAction(ISD::FABS, MVT::f32, Legal);
163  setOperationAction(ISD::FABS, MVT::f64, Legal);
164
165  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
166  setOperationAction(ISD::FCEIL, MVT::f64, Legal);
167
168  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
169  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
170
171  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
172  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
173
174  setOperationAction(ISD::FNEG, MVT::f32, Legal);
175  setOperationAction(ISD::FNEG, MVT::f64, Legal);
176
177  setOperationAction(ISD::FRINT, MVT::f32, Legal);
178  setOperationAction(ISD::FRINT, MVT::f64, Legal);
179
180  setOperationAction(ISD::FSQRT, MVT::f32, Legal);
181  setOperationAction(ISD::FSQRT, MVT::f64, Legal);
182
183  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
184  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
185
186  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
187  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
188  setOperationAction(ISD::ConstantFP, MVT::f128, Legal);
189
190  // Illegal floating-point operations.
191  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
192  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
193
194  setOperationAction(ISD::FCOS, MVT::f32, Expand);
195  setOperationAction(ISD::FCOS, MVT::f64, Expand);
196
197  setOperationAction(ISD::FEXP, MVT::f32, Expand);
198  setOperationAction(ISD::FEXP, MVT::f64, Expand);
199
200  setOperationAction(ISD::FEXP2, MVT::f32, Expand);
201  setOperationAction(ISD::FEXP2, MVT::f64, Expand);
202
203  setOperationAction(ISD::FLOG, MVT::f32, Expand);
204  setOperationAction(ISD::FLOG, MVT::f64, Expand);
205
206  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
207  setOperationAction(ISD::FLOG2, MVT::f64, Expand);
208
209  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
210  setOperationAction(ISD::FLOG10, MVT::f64, Expand);
211
212  setOperationAction(ISD::FPOW, MVT::f32, Expand);
213  setOperationAction(ISD::FPOW, MVT::f64, Expand);
214
215  setOperationAction(ISD::FPOWI, MVT::f32, Expand);
216  setOperationAction(ISD::FPOWI, MVT::f64, Expand);
217
218  setOperationAction(ISD::FREM, MVT::f32, Expand);
219  setOperationAction(ISD::FREM, MVT::f64, Expand);
220
221  setOperationAction(ISD::FSIN, MVT::f32, Expand);
222  setOperationAction(ISD::FSIN, MVT::f64, Expand);
223
224  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
225  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
226
227  // Virtually no operation on f128 is legal, but LLVM can't expand them when
228  // there's a valid register class, so we need custom operations in most cases.
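  // (Most of the Custom entries below end up as soft-float library calls,
  // e.g. an f128 FADD typically becomes a call to __addtf3.)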
229  setOperationAction(ISD::FABS,       MVT::f128, Expand);
230  setOperationAction(ISD::FADD,       MVT::f128, Custom);
231  setOperationAction(ISD::FCOPYSIGN,  MVT::f128, Expand);
232  setOperationAction(ISD::FCOS,       MVT::f128, Expand);
233  setOperationAction(ISD::FDIV,       MVT::f128, Custom);
234  setOperationAction(ISD::FMA,        MVT::f128, Expand);
235  setOperationAction(ISD::FMUL,       MVT::f128, Custom);
236  setOperationAction(ISD::FNEG,       MVT::f128, Expand);
237  setOperationAction(ISD::FP_EXTEND,  MVT::f128, Expand);
238  setOperationAction(ISD::FP_ROUND,   MVT::f128, Expand);
239  setOperationAction(ISD::FPOW,       MVT::f128, Expand);
240  setOperationAction(ISD::FREM,       MVT::f128, Expand);
241  setOperationAction(ISD::FRINT,      MVT::f128, Expand);
242  setOperationAction(ISD::FSIN,       MVT::f128, Expand);
243  setOperationAction(ISD::FSINCOS,    MVT::f128, Expand);
244  setOperationAction(ISD::FSQRT,      MVT::f128, Expand);
245  setOperationAction(ISD::FSUB,       MVT::f128, Custom);
246  setOperationAction(ISD::FTRUNC,     MVT::f128, Expand);
247  setOperationAction(ISD::SETCC,      MVT::f128, Custom);
248  setOperationAction(ISD::BR_CC,      MVT::f128, Custom);
249  setOperationAction(ISD::SELECT,     MVT::f128, Expand);
250  setOperationAction(ISD::SELECT_CC,  MVT::f128, Custom);
251  setOperationAction(ISD::FP_EXTEND,  MVT::f128, Custom);
252
253  // Lowering for many of the conversions is actually specified by the non-f128
254  // type. The LowerXXX function will be trivial when f128 isn't involved.
255  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
256  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
257  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
258  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
259  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
260  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
261  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
262  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
263  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
264  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
265  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
266  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
267  setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
268  setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
269
270  // This prevents LLVM trying to compress double constants into a floating
271  // constant-pool entry and trying to load from there. It's of doubtful benefit
272  // for A64: we'd need LDR followed by FCVT, I believe.
273  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
274  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
275  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
276
277  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
278  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
279  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
280  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
281  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
282  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
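  // (An expanded truncating FP store becomes an explicit FP_ROUND followed by
  // a plain store of the narrower type.)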
283
284  setExceptionPointerRegister(AArch64::X0);
285  setExceptionSelectorRegister(AArch64::X1);
286
287  if (Subtarget->hasNEON()) {
288    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
289    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
290    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
291    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom);
292    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
293    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
294    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom);
295    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
296    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
297    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
298    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
299    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f32, Custom);
300    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
301    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
302    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
303    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
304
305    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
306    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
307    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
308    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
309    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
310    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
311    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
312    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
313    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
314    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
315    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
316    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
317
318    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
319    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
320    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
321    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
325    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
326    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
327
328    setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
329    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
330    setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
331    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
332    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
333    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
334    setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
335    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
336    setOperationAction(ISD::SETCC, MVT::v1f32, Custom);
337    setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
338    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
339    setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
340    setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
341
342    setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
343    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
344    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
345
346    setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
347    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
348    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
349
350    setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
351    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
352    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
353
354    setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
355    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
356    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
357
358    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal);
359    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
360    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
361
362    setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
363    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
364    setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
365  }
366}
367
368EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
369  // It's reasonably important that this value matches the "natural" legal
370  // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
371  // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
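  // For example, a scalar f64 comparison produces an i32 result, whereas a
  // comparison of v4f32 operands produces a v4i32 lane mask, matching the
  // ZeroOrNegativeOneBooleanContent setting in the constructor.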
372  if (!VT.isVector()) return MVT::i32;
373  return VT.changeVectorElementTypeToInteger();
374}
375
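// A sketch of the mapping implemented below: a 4-byte access with seq_cst (or
// acquire/acq_rel) ordering selects the acquire/release exclusive pair
// LDAXR_word / STLXR_word, while a monotonic access of the same size uses the
// bare LDXR_word / STXR_word pair; the table index is simply Log2_32(Size).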
376static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
377                                  unsigned &LdrOpc,
378                                  unsigned &StrOpc) {
379  static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword,
380                                       AArch64::LDXR_word, AArch64::LDXR_dword};
381  static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword,
382                                     AArch64::LDAXR_word, AArch64::LDAXR_dword};
383  static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword,
384                                       AArch64::STXR_word, AArch64::STXR_dword};
385  static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword,
386                                     AArch64::STLXR_word, AArch64::STLXR_dword};
387
388  const unsigned *LoadOps, *StoreOps;
389  if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
390    LoadOps = LoadAcqs;
391  else
392    LoadOps = LoadBares;
393
394  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
395    StoreOps = StoreRels;
396  else
397    StoreOps = StoreBares;
398
399  assert(isPowerOf2_32(Size) && Size <= 8 &&
400         "unsupported size for atomic binary op!");
401
402  LdrOpc = LoadOps[Log2_32(Size)];
403  StrOpc = StoreOps[Log2_32(Size)];
404}
405
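// emitAtomicBinary expands the ATOMIC_LOAD_<op> and ATOMIC_SWAP pseudos into a
// load-exclusive / store-exclusive retry loop. Illustratively, IR such as
//   %old = atomicrmw add i32* %ptr, i32 %incr seq_cst
// becomes LDAXR; ADD; STLXR; CBNZ on the store-exclusive status register, with
// the acquire/release variants chosen by getExclusiveOperation above.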
406MachineBasicBlock *
407AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
408                                        unsigned Size,
409                                        unsigned BinOpcode) const {
410  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
411  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
412
413  const BasicBlock *LLVM_BB = BB->getBasicBlock();
414  MachineFunction *MF = BB->getParent();
415  MachineFunction::iterator It = BB;
416  ++It;
417
418  unsigned dest = MI->getOperand(0).getReg();
419  unsigned ptr = MI->getOperand(1).getReg();
420  unsigned incr = MI->getOperand(2).getReg();
421  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
422  DebugLoc dl = MI->getDebugLoc();
423
424  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
425
426  unsigned ldrOpc, strOpc;
427  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
428
429  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
430  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
431  MF->insert(It, loopMBB);
432  MF->insert(It, exitMBB);
433
434  // Transfer the remainder of BB and its successor edges to exitMBB.
435  exitMBB->splice(exitMBB->begin(), BB,
436                  llvm::next(MachineBasicBlock::iterator(MI)),
437                  BB->end());
438  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
439
440  const TargetRegisterClass *TRC
441    = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
442  unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
443
444  //  thisMBB:
445  //   ...
446  //   fallthrough --> loopMBB
447  BB->addSuccessor(loopMBB);
448
449  //  loopMBB:
450  //   ldxr dest, ptr
451  //   <binop> scratch, dest, incr
452  //   stxr stxr_status, scratch, ptr
453  //   cbnz stxr_status, loopMBB
454  //   fallthrough --> exitMBB
455  BB = loopMBB;
456  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
457  if (BinOpcode) {
458    // All arithmetic operations we'll be creating are designed to take an extra
459    // shift or extend operand, which we can conveniently set to zero.
460
461    // Operand order needs to go the other way for NAND.
462    if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl)
463      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
464        .addReg(incr).addReg(dest).addImm(0);
465    else
466      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
467        .addReg(dest).addReg(incr).addImm(0);
468  }
469
470  // From the stxr, the register is GPR32; from the cmp it's GPR32wsp
471  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
472  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
473
474  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
475  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
476    .addReg(stxr_status).addMBB(loopMBB);
477
478  BB->addSuccessor(loopMBB);
479  BB->addSuccessor(exitMBB);
480
481  //  exitMBB:
482  //   ...
483  BB = exitMBB;
484
485  MI->eraseFromParent();   // The instruction is gone now.
486
487  return BB;
488}
489
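// Min/max atomics have no single-instruction A64 form, so they are expanded to
// a compare plus CSEL inside the exclusive loop. For example,
// ATOMIC_LOAD_MIN_I32 uses CMPww_lsl with the GT condition, so the CSEL keeps
// the smaller of the loaded value and the operand as the value to store back.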
490MachineBasicBlock *
491AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
492                                              MachineBasicBlock *BB,
493                                              unsigned Size,
494                                              unsigned CmpOp,
495                                              A64CC::CondCodes Cond) const {
496  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
497
498  const BasicBlock *LLVM_BB = BB->getBasicBlock();
499  MachineFunction *MF = BB->getParent();
500  MachineFunction::iterator It = BB;
501  ++It;
502
503  unsigned dest = MI->getOperand(0).getReg();
504  unsigned ptr = MI->getOperand(1).getReg();
505  unsigned incr = MI->getOperand(2).getReg();
506  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
507
508  unsigned oldval = dest;
509  DebugLoc dl = MI->getDebugLoc();
510
511  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
512  const TargetRegisterClass *TRC, *TRCsp;
513  if (Size == 8) {
514    TRC = &AArch64::GPR64RegClass;
515    TRCsp = &AArch64::GPR64xspRegClass;
516  } else {
517    TRC = &AArch64::GPR32RegClass;
518    TRCsp = &AArch64::GPR32wspRegClass;
519  }
520
521  unsigned ldrOpc, strOpc;
522  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
523
524  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
525  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
526  MF->insert(It, loopMBB);
527  MF->insert(It, exitMBB);
528
529  // Transfer the remainder of BB and its successor edges to exitMBB.
530  exitMBB->splice(exitMBB->begin(), BB,
531                  llvm::next(MachineBasicBlock::iterator(MI)),
532                  BB->end());
533  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
534
535  unsigned scratch = MRI.createVirtualRegister(TRC);
536  MRI.constrainRegClass(scratch, TRCsp);
537
538  //  thisMBB:
539  //   ...
540  //   fallthrough --> loopMBB
541  BB->addSuccessor(loopMBB);
542
543  //  loopMBB:
544  //   ldxr dest, ptr
545  //   cmp incr, dest (, sign extend if necessary)
546  //   csel scratch, dest, incr, cond
547  //   stxr stxr_status, scratch, ptr
548  //   cbnz stxr_status, loopMBB
549  //   fallthrough --> exitMBB
550  BB = loopMBB;
551  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
552
553  // Build compare and cmov instructions.
554  MRI.constrainRegClass(incr, TRCsp);
555  BuildMI(BB, dl, TII->get(CmpOp))
556    .addReg(incr).addReg(oldval).addImm(0);
557
558  BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc),
559          scratch)
560    .addReg(oldval).addReg(incr).addImm(Cond);
561
562  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
563  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
564
565  BuildMI(BB, dl, TII->get(strOpc), stxr_status)
566    .addReg(scratch).addReg(ptr);
567  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
568    .addReg(stxr_status).addMBB(loopMBB);
569
570  BB->addSuccessor(loopMBB);
571  BB->addSuccessor(exitMBB);
572
573  //  exitMBB:
574  //   ...
575  BB = exitMBB;
576
577  MI->eraseFromParent();   // The instruction is gone now.
578
579  return BB;
580}
581
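// Compare-and-swap is expanded into two loop blocks: loop1 performs the
// exclusive load and compares it against the expected value, branching
// straight to the exit block on a mismatch; loop2 attempts the exclusive store
// of the new value and retries from loop1 if the status register reports that
// the store-exclusive failed.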
582MachineBasicBlock *
583AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
584                                         MachineBasicBlock *BB,
585                                         unsigned Size) const {
586  unsigned dest    = MI->getOperand(0).getReg();
587  unsigned ptr     = MI->getOperand(1).getReg();
588  unsigned oldval  = MI->getOperand(2).getReg();
589  unsigned newval  = MI->getOperand(3).getReg();
590  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
591  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
592  DebugLoc dl = MI->getDebugLoc();
593
594  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
595  const TargetRegisterClass *TRCsp;
596  TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;
597
598  unsigned ldrOpc, strOpc;
599  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
600
601  MachineFunction *MF = BB->getParent();
602  const BasicBlock *LLVM_BB = BB->getBasicBlock();
603  MachineFunction::iterator It = BB;
604  ++It; // insert the new blocks after the current block
605
606  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
607  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
608  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
609  MF->insert(It, loop1MBB);
610  MF->insert(It, loop2MBB);
611  MF->insert(It, exitMBB);
612
613  // Transfer the remainder of BB and its successor edges to exitMBB.
614  exitMBB->splice(exitMBB->begin(), BB,
615                  llvm::next(MachineBasicBlock::iterator(MI)),
616                  BB->end());
617  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
618
619  //  thisMBB:
620  //   ...
621  //   fallthrough --> loop1MBB
622  BB->addSuccessor(loop1MBB);
623
624  // loop1MBB:
625  //   ldxr dest, [ptr]
626  //   cmp dest, oldval
627  //   b.ne exitMBB
628  BB = loop1MBB;
629  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
630
631  unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
632  MRI.constrainRegClass(dest, TRCsp);
633  BuildMI(BB, dl, TII->get(CmpOp))
634    .addReg(dest).addReg(oldval).addImm(0);
635  BuildMI(BB, dl, TII->get(AArch64::Bcc))
636    .addImm(A64CC::NE).addMBB(exitMBB);
637  BB->addSuccessor(loop2MBB);
638  BB->addSuccessor(exitMBB);
639
640  // loop2MBB:
641  //   strex stxr_status, newval, [ptr]
642  //   cbnz stxr_status, loop1MBB
643  BB = loop2MBB;
644  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
645  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
646
647  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
648  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
649    .addReg(stxr_status).addMBB(loop1MBB);
650  BB->addSuccessor(loop1MBB);
651  BB->addSuccessor(exitMBB);
652
653  //  exitMBB:
654  //   ...
655  BB = exitMBB;
656
657  MI->eraseFromParent();   // The instruction is gone now.
658
659  return BB;
660}
661
662MachineBasicBlock *
663AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
664                                    MachineBasicBlock *MBB) const {
665  // We materialise the F128CSEL pseudo-instruction using conditional branches
666  // and loads, giving an instruction sequence like:
667  //     str q0, [sp]
668  //     b.ne IfTrue
669  //     b Finish
670  // IfTrue:
671  //     str q1, [sp]
672  // Finish:
673  //     ldr q0, [sp]
674  //
675  // Using virtual registers would probably not be beneficial since COPY
676  // instructions are expensive for f128 (there's no actual instruction to
677  // implement them).
678  //
679  // An alternative would be to do an integer-CSEL on some address. E.g.:
680  //     mov x0, sp
681  //     add x1, sp, #16
682  //     str q0, [x0]
683  //     str q1, [x1]
684  //     csel x0, x0, x1, ne
685  //     ldr q0, [x0]
686  //
687  // It's unclear which approach is actually optimal.
688  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
689  MachineFunction *MF = MBB->getParent();
690  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
691  DebugLoc DL = MI->getDebugLoc();
692  MachineFunction::iterator It = MBB;
693  ++It;
694
695  unsigned DestReg = MI->getOperand(0).getReg();
696  unsigned IfTrueReg = MI->getOperand(1).getReg();
697  unsigned IfFalseReg = MI->getOperand(2).getReg();
698  unsigned CondCode = MI->getOperand(3).getImm();
699  bool NZCVKilled = MI->getOperand(4).isKill();
700
701  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
702  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
703  MF->insert(It, TrueBB);
704  MF->insert(It, EndBB);
705
706  // Transfer rest of current basic-block to EndBB
707  EndBB->splice(EndBB->begin(), MBB,
708                llvm::next(MachineBasicBlock::iterator(MI)),
709                MBB->end());
710  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
711
712  // We need somewhere to store the f128 value needed.
713  int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16);
714
715  //     [... start of incoming MBB ...]
716  //     str qIFFALSE, [sp]
717  //     b.cc IfTrue
718  //     b Done
719  BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR))
720    .addReg(IfFalseReg)
721    .addFrameIndex(ScratchFI)
722    .addImm(0);
723  BuildMI(MBB, DL, TII->get(AArch64::Bcc))
724    .addImm(CondCode)
725    .addMBB(TrueBB);
726  BuildMI(MBB, DL, TII->get(AArch64::Bimm))
727    .addMBB(EndBB);
728  MBB->addSuccessor(TrueBB);
729  MBB->addSuccessor(EndBB);
730
731  if (!NZCVKilled) {
732    // NZCV is live-through TrueBB.
733    TrueBB->addLiveIn(AArch64::NZCV);
734    EndBB->addLiveIn(AArch64::NZCV);
735  }
736
737  // IfTrue:
738  //     str qIFTRUE, [sp]
739  BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
740    .addReg(IfTrueReg)
741    .addFrameIndex(ScratchFI)
742    .addImm(0);
743
744  // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the
745  // blocks.
746  TrueBB->addSuccessor(EndBB);
747
748  // Done:
749  //     ldr qDEST, [sp]
750  //     [... rest of incoming MBB ...]
751  MachineInstr *StartOfEnd = EndBB->begin();
752  BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
753    .addFrameIndex(ScratchFI)
754    .addImm(0);
755
756  MI->eraseFromParent();
757  return EndBB;
758}
759
760MachineBasicBlock *
761AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
762                                                 MachineBasicBlock *MBB) const {
763  switch (MI->getOpcode()) {
764  default: llvm_unreachable("Unhandled instruction with custom inserter");
765  case AArch64::F128CSEL:
766    return EmitF128CSEL(MI, MBB);
767  case AArch64::ATOMIC_LOAD_ADD_I8:
768    return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl);
769  case AArch64::ATOMIC_LOAD_ADD_I16:
770    return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl);
771  case AArch64::ATOMIC_LOAD_ADD_I32:
772    return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl);
773  case AArch64::ATOMIC_LOAD_ADD_I64:
774    return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl);
775
776  case AArch64::ATOMIC_LOAD_SUB_I8:
777    return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl);
778  case AArch64::ATOMIC_LOAD_SUB_I16:
779    return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl);
780  case AArch64::ATOMIC_LOAD_SUB_I32:
781    return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl);
782  case AArch64::ATOMIC_LOAD_SUB_I64:
783    return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl);
784
785  case AArch64::ATOMIC_LOAD_AND_I8:
786    return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl);
787  case AArch64::ATOMIC_LOAD_AND_I16:
788    return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl);
789  case AArch64::ATOMIC_LOAD_AND_I32:
790    return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl);
791  case AArch64::ATOMIC_LOAD_AND_I64:
792    return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl);
793
794  case AArch64::ATOMIC_LOAD_OR_I8:
795    return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl);
796  case AArch64::ATOMIC_LOAD_OR_I16:
797    return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl);
798  case AArch64::ATOMIC_LOAD_OR_I32:
799    return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl);
800  case AArch64::ATOMIC_LOAD_OR_I64:
801    return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl);
802
803  case AArch64::ATOMIC_LOAD_XOR_I8:
804    return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl);
805  case AArch64::ATOMIC_LOAD_XOR_I16:
806    return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl);
807  case AArch64::ATOMIC_LOAD_XOR_I32:
808    return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl);
809  case AArch64::ATOMIC_LOAD_XOR_I64:
810    return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl);
811
812  case AArch64::ATOMIC_LOAD_NAND_I8:
813    return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl);
814  case AArch64::ATOMIC_LOAD_NAND_I16:
815    return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl);
816  case AArch64::ATOMIC_LOAD_NAND_I32:
817    return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl);
818  case AArch64::ATOMIC_LOAD_NAND_I64:
819    return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl);
820
821  case AArch64::ATOMIC_LOAD_MIN_I8:
822    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT);
823  case AArch64::ATOMIC_LOAD_MIN_I16:
824    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT);
825  case AArch64::ATOMIC_LOAD_MIN_I32:
826    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT);
827  case AArch64::ATOMIC_LOAD_MIN_I64:
828    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT);
829
830  case AArch64::ATOMIC_LOAD_MAX_I8:
831    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT);
832  case AArch64::ATOMIC_LOAD_MAX_I16:
833    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT);
834  case AArch64::ATOMIC_LOAD_MAX_I32:
835    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT);
836  case AArch64::ATOMIC_LOAD_MAX_I64:
837    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT);
838
839  case AArch64::ATOMIC_LOAD_UMIN_I8:
840    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI);
841  case AArch64::ATOMIC_LOAD_UMIN_I16:
842    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI);
843  case AArch64::ATOMIC_LOAD_UMIN_I32:
844    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI);
845  case AArch64::ATOMIC_LOAD_UMIN_I64:
846    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI);
847
848  case AArch64::ATOMIC_LOAD_UMAX_I8:
849    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO);
850  case AArch64::ATOMIC_LOAD_UMAX_I16:
851    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO);
852  case AArch64::ATOMIC_LOAD_UMAX_I32:
853    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO);
854  case AArch64::ATOMIC_LOAD_UMAX_I64:
855    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO);
856
857  case AArch64::ATOMIC_SWAP_I8:
858    return emitAtomicBinary(MI, MBB, 1, 0);
859  case AArch64::ATOMIC_SWAP_I16:
860    return emitAtomicBinary(MI, MBB, 2, 0);
861  case AArch64::ATOMIC_SWAP_I32:
862    return emitAtomicBinary(MI, MBB, 4, 0);
863  case AArch64::ATOMIC_SWAP_I64:
864    return emitAtomicBinary(MI, MBB, 8, 0);
865
866  case AArch64::ATOMIC_CMP_SWAP_I8:
867    return emitAtomicCmpSwap(MI, MBB, 1);
868  case AArch64::ATOMIC_CMP_SWAP_I16:
869    return emitAtomicCmpSwap(MI, MBB, 2);
870  case AArch64::ATOMIC_CMP_SWAP_I32:
871    return emitAtomicCmpSwap(MI, MBB, 4);
872  case AArch64::ATOMIC_CMP_SWAP_I64:
873    return emitAtomicCmpSwap(MI, MBB, 8);
874  }
875}
876
877
878const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
879  switch (Opcode) {
880  case AArch64ISD::BR_CC:          return "AArch64ISD::BR_CC";
881  case AArch64ISD::Call:           return "AArch64ISD::Call";
882  case AArch64ISD::FPMOV:          return "AArch64ISD::FPMOV";
883  case AArch64ISD::GOTLoad:        return "AArch64ISD::GOTLoad";
884  case AArch64ISD::BFI:            return "AArch64ISD::BFI";
885  case AArch64ISD::EXTR:           return "AArch64ISD::EXTR";
886  case AArch64ISD::Ret:            return "AArch64ISD::Ret";
887  case AArch64ISD::SBFX:           return "AArch64ISD::SBFX";
888  case AArch64ISD::SELECT_CC:      return "AArch64ISD::SELECT_CC";
889  case AArch64ISD::SETCC:          return "AArch64ISD::SETCC";
890  case AArch64ISD::TC_RETURN:      return "AArch64ISD::TC_RETURN";
891  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
892  case AArch64ISD::TLSDESCCALL:    return "AArch64ISD::TLSDESCCALL";
893  case AArch64ISD::WrapperLarge:   return "AArch64ISD::WrapperLarge";
894  case AArch64ISD::WrapperSmall:   return "AArch64ISD::WrapperSmall";
895
896  case AArch64ISD::NEON_BSL:
897    return "AArch64ISD::NEON_BSL";
898  case AArch64ISD::NEON_MOVIMM:
899    return "AArch64ISD::NEON_MOVIMM";
900  case AArch64ISD::NEON_MVNIMM:
901    return "AArch64ISD::NEON_MVNIMM";
902  case AArch64ISD::NEON_FMOVIMM:
903    return "AArch64ISD::NEON_FMOVIMM";
904  case AArch64ISD::NEON_CMP:
905    return "AArch64ISD::NEON_CMP";
906  case AArch64ISD::NEON_CMPZ:
907    return "AArch64ISD::NEON_CMPZ";
908  case AArch64ISD::NEON_TST:
909    return "AArch64ISD::NEON_TST";
910  case AArch64ISD::NEON_QSHLs:
911    return "AArch64ISD::NEON_QSHLs";
912  case AArch64ISD::NEON_QSHLu:
913    return "AArch64ISD::NEON_QSHLu";
914  case AArch64ISD::NEON_VDUP:
915    return "AArch64ISD::NEON_VDUP";
916  case AArch64ISD::NEON_VDUPLANE:
917    return "AArch64ISD::NEON_VDUPLANE";
918  case AArch64ISD::NEON_REV16:
919    return "AArch64ISD::NEON_REV16";
920  case AArch64ISD::NEON_REV32:
921    return "AArch64ISD::NEON_REV32";
922  case AArch64ISD::NEON_REV64:
923    return "AArch64ISD::NEON_REV64";
924  case AArch64ISD::NEON_UZP1:
925    return "AArch64ISD::NEON_UZP1";
926  case AArch64ISD::NEON_UZP2:
927    return "AArch64ISD::NEON_UZP2";
928  case AArch64ISD::NEON_ZIP1:
929    return "AArch64ISD::NEON_ZIP1";
930  case AArch64ISD::NEON_ZIP2:
931    return "AArch64ISD::NEON_ZIP2";
932  case AArch64ISD::NEON_TRN1:
933    return "AArch64ISD::NEON_TRN1";
934  case AArch64ISD::NEON_TRN2:
935    return "AArch64ISD::NEON_TRN2";
936  case AArch64ISD::NEON_LD1_UPD:
937    return "AArch64ISD::NEON_LD1_UPD";
938  case AArch64ISD::NEON_LD2_UPD:
939    return "AArch64ISD::NEON_LD2_UPD";
940  case AArch64ISD::NEON_LD3_UPD:
941    return "AArch64ISD::NEON_LD3_UPD";
942  case AArch64ISD::NEON_LD4_UPD:
943    return "AArch64ISD::NEON_LD4_UPD";
944  case AArch64ISD::NEON_ST1_UPD:
945    return "AArch64ISD::NEON_ST1_UPD";
946  case AArch64ISD::NEON_ST2_UPD:
947    return "AArch64ISD::NEON_ST2_UPD";
948  case AArch64ISD::NEON_ST3_UPD:
949    return "AArch64ISD::NEON_ST3_UPD";
950  case AArch64ISD::NEON_ST4_UPD:
951    return "AArch64ISD::NEON_ST4_UPD";
952  case AArch64ISD::NEON_LD1x2_UPD:
953    return "AArch64ISD::NEON_LD1x2_UPD";
954  case AArch64ISD::NEON_LD1x3_UPD:
955    return "AArch64ISD::NEON_LD1x3_UPD";
956  case AArch64ISD::NEON_LD1x4_UPD:
957    return "AArch64ISD::NEON_LD1x4_UPD";
958  case AArch64ISD::NEON_ST1x2_UPD:
959    return "AArch64ISD::NEON_ST1x2_UPD";
960  case AArch64ISD::NEON_ST1x3_UPD:
961    return "AArch64ISD::NEON_ST1x3_UPD";
962  case AArch64ISD::NEON_ST1x4_UPD:
963    return "AArch64ISD::NEON_ST1x4_UPD";
964  case AArch64ISD::NEON_LD2DUP:
965    return "AArch64ISD::NEON_LD2DUP";
966  case AArch64ISD::NEON_LD3DUP:
967    return "AArch64ISD::NEON_LD3DUP";
968  case AArch64ISD::NEON_LD4DUP:
969    return "AArch64ISD::NEON_LD4DUP";
970  case AArch64ISD::NEON_LD2DUP_UPD:
971    return "AArch64ISD::NEON_LD2DUP_UPD";
972  case AArch64ISD::NEON_LD3DUP_UPD:
973    return "AArch64ISD::NEON_LD3DUP_UPD";
974  case AArch64ISD::NEON_LD4DUP_UPD:
975    return "AArch64ISD::NEON_LD4DUP_UPD";
976  case AArch64ISD::NEON_LD2LN_UPD:
977    return "AArch64ISD::NEON_LD2LN_UPD";
978  case AArch64ISD::NEON_LD3LN_UPD:
979    return "AArch64ISD::NEON_LD3LN_UPD";
980  case AArch64ISD::NEON_LD4LN_UPD:
981    return "AArch64ISD::NEON_LD4LN_UPD";
982  case AArch64ISD::NEON_ST2LN_UPD:
983    return "AArch64ISD::NEON_ST2LN_UPD";
984  case AArch64ISD::NEON_ST3LN_UPD:
985    return "AArch64ISD::NEON_ST3LN_UPD";
986  case AArch64ISD::NEON_ST4LN_UPD:
987    return "AArch64ISD::NEON_ST4LN_UPD";
988  case AArch64ISD::NEON_VEXTRACT:
989    return "AArch64ISD::NEON_VEXTRACT";
990  default:
991    return NULL;
992  }
993}
994
995static const uint16_t AArch64FPRArgRegs[] = {
996  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
997  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7
998};
999static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs);
1000
1001static const uint16_t AArch64ArgRegs[] = {
1002  AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
1003  AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7
1004};
1005static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs);
1006
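// Intended to be used as a custom allocation hook by the generated calling
// convention included below: once an argument has gone on the stack, all
// remaining GPR argument registers are marked as used. Returning false means
// the value itself still needs to be assigned by the subsequent rules.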
1007static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
1008                                 CCValAssign::LocInfo LocInfo,
1009                                 ISD::ArgFlagsTy ArgFlags, CCState &State) {
1010  // Mark all remaining general purpose registers as allocated. We don't
1011  // backtrack: if (for example) an i128 gets put on the stack, no subsequent
1012  // i64 will go in registers (C.11).
1013  for (unsigned i = 0; i < NumArgRegs; ++i)
1014    State.AllocateReg(AArch64ArgRegs[i]);
1015
1016  return false;
1017}
1018
1019#include "AArch64GenCallingConv.inc"
1020
1021CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1022
1023  switch (CC) {
1024  default: llvm_unreachable("Unsupported calling convention");
1025  case CallingConv::Fast:
1026  case CallingConv::C:
1027    return CC_A64_APCS;
1028  }
1029}
1030
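// For variadic callees the PCS requires the still-unallocated argument
// registers to be dumped to the stack so that va_arg can walk them. As a
// sketch: if x0-x2 carry named arguments, then x3-x7 (5 x 8 bytes) are stored
// to the GPR save area, and (when FP/SIMD is available) any unallocated q
// registers are stored, 16 bytes apiece, to the FPR save area; the frame
// indices recorded at the end are what the va_start lowering is expected to
// consume.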
1031void
1032AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
1033                                           SDLoc DL, SDValue &Chain) const {
1034  MachineFunction &MF = DAG.getMachineFunction();
1035  MachineFrameInfo *MFI = MF.getFrameInfo();
1036  AArch64MachineFunctionInfo *FuncInfo
1037    = MF.getInfo<AArch64MachineFunctionInfo>();
1038
1039  SmallVector<SDValue, 8> MemOps;
1040
1041  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs,
1042                                                         NumArgRegs);
1043  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs,
1044                                                         NumFPRArgRegs);
1045
1046  unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR);
1047  int GPRIdx = 0;
1048  if (GPRSaveSize != 0) {
1049    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
1050
1051    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
1052
1053    for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) {
1054      unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass);
1055      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
1056      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
1057                                   MachinePointerInfo::getStack(i * 8),
1058                                   false, false, 0);
1059      MemOps.push_back(Store);
1060      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1061                        DAG.getConstant(8, getPointerTy()));
1062    }
1063  }
1064
1065  if (getSubtarget()->hasFPARMv8()) {
1066    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
1067    int FPRIdx = 0;
1068    // According to the AArch64 Procedure Call Standard, section B.1/B.3, we
1069    // can omit a register save area if we know we'll never use registers of
1070    // that class.
1071    if (FPRSaveSize != 0) {
1072      FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
1073
1074      SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
1075
1076      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
1077        unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
1078            &AArch64::FPR128RegClass);
1079        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
1080        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
1081            MachinePointerInfo::getStack(i * 16),
1082            false, false, 0);
1083        MemOps.push_back(Store);
1084        FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
1085            DAG.getConstant(16, getPointerTy()));
1086      }
1087    }
1088    FuncInfo->setVariadicFPRIdx(FPRIdx);
1089    FuncInfo->setVariadicFPRSize(FPRSaveSize);
1090  }
1091
1092  int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);
1093
1094  FuncInfo->setVariadicStackIdx(StackIdx);
1095  FuncInfo->setVariadicGPRIdx(GPRIdx);
1096  FuncInfo->setVariadicGPRSize(GPRSaveSize);
1097
1098  if (!MemOps.empty()) {
1099    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
1100                        MemOps.size());
1101  }
1102}
1103
1104
1105SDValue
1106AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
1107                                      CallingConv::ID CallConv, bool isVarArg,
1108                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1109                                      SDLoc dl, SelectionDAG &DAG,
1110                                      SmallVectorImpl<SDValue> &InVals) const {
1111  MachineFunction &MF = DAG.getMachineFunction();
1112  AArch64MachineFunctionInfo *FuncInfo
1113    = MF.getInfo<AArch64MachineFunctionInfo>();
1114  MachineFrameInfo *MFI = MF.getFrameInfo();
1115  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
1116
1117  SmallVector<CCValAssign, 16> ArgLocs;
1118  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1119                 getTargetMachine(), ArgLocs, *DAG.getContext());
1120  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1121
1122  SmallVector<SDValue, 16> ArgValues;
1123
1124  SDValue ArgValue;
1125  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1126    CCValAssign &VA = ArgLocs[i];
1127    ISD::ArgFlagsTy Flags = Ins[i].Flags;
1128
1129    if (Flags.isByVal()) {
1130      // Byval is used for small structs and HFAs in the PCS, but the system
1131      // should work in a non-compliant manner for larger structs.
1132      EVT PtrTy = getPointerTy();
1133      int Size = Flags.getByValSize();
1134      unsigned NumRegs = (Size + 7) / 8;
1135
1136      unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
1137                                                 VA.getLocMemOffset(),
1138                                                 false);
1139      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
1140      InVals.push_back(FrameIdxN);
1141
1142      continue;
1143    } else if (VA.isRegLoc()) {
1144      MVT RegVT = VA.getLocVT();
1145      const TargetRegisterClass *RC = getRegClassFor(RegVT);
1146      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1147
1148      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1149    } else { // !VA.isRegLoc(): the argument is passed in memory.
1150      assert(VA.isMemLoc());
1151
1152      int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
1153                                      VA.getLocMemOffset(), true);
1154
1155      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1156      ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
1157                             MachinePointerInfo::getFixedStack(FI),
1158                             false, false, false, 0);
1159
1160
1161    }
1162
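    // Promoted integer arguments (e.g. an i8 that arrived in w0) have
    // unspecified upper bits, so the original type is recovered by extracting
    // the matching sub-register below; the EXTRACT_SUBREG is typically free
    // after register coalescing.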
1163    switch (VA.getLocInfo()) {
1164    default: llvm_unreachable("Unknown loc info!");
1165    case CCValAssign::Full: break;
1166    case CCValAssign::BCvt:
1167      ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue);
1168      break;
1169    case CCValAssign::SExt:
1170    case CCValAssign::ZExt:
1171    case CCValAssign::AExt: {
1172      unsigned DestSize = VA.getValVT().getSizeInBits();
1173      unsigned DestSubReg;
1174
1175      switch (DestSize) {
1176      case 8: DestSubReg = AArch64::sub_8; break;
1177      case 16: DestSubReg = AArch64::sub_16; break;
1178      case 32: DestSubReg = AArch64::sub_32; break;
1179      case 64: DestSubReg = AArch64::sub_64; break;
1180      default: llvm_unreachable("Unexpected argument promotion");
1181      }
1182
1183      ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
1184                                   VA.getValVT(), ArgValue,
1185                                   DAG.getTargetConstant(DestSubReg, MVT::i32)),
1186                         0);
1187      break;
1188    }
1189    }
1190
1191    InVals.push_back(ArgValue);
1192  }
1193
1194  if (isVarArg)
1195    SaveVarArgRegisters(CCInfo, DAG, dl, Chain);
1196
1197  unsigned StackArgSize = CCInfo.getNextStackOffset();
1198  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
1199    // This is a non-standard ABI so by fiat I say we're allowed to make full
1200    // use of the stack area to be popped, which must be aligned to 16 bytes in
1201    // any case:
1202    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
1203
1204    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
1205    // a multiple of 16.
1206    FuncInfo->setArgumentStackToRestore(StackArgSize);
1207
1208    // This realignment carries over to the available bytes below. Our own
1209    // callers will guarantee the space is free by giving an aligned value to
1210    // CALLSEQ_START.
1211  }
1212  // Even if we're not expected to free up the space, it's useful to know how
1213  // much is there while considering tail calls (because we can reuse it).
1214  FuncInfo->setBytesInStackArgArea(StackArgSize);
1215
1216  return Chain;
1217}
1218
1219SDValue
1220AArch64TargetLowering::LowerReturn(SDValue Chain,
1221                                   CallingConv::ID CallConv, bool isVarArg,
1222                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
1223                                   const SmallVectorImpl<SDValue> &OutVals,
1224                                   SDLoc dl, SelectionDAG &DAG) const {
1225  // CCValAssign - represent the assignment of the return value to a location.
1226  SmallVector<CCValAssign, 16> RVLocs;
1227
1228  // CCState - Info about the registers and stack slots.
1229  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1230                 getTargetMachine(), RVLocs, *DAG.getContext());
1231
1232  // Analyze outgoing return values.
1233  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));
1234
1235  SDValue Flag;
1236  SmallVector<SDValue, 4> RetOps(1, Chain);
1237
1238  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
1239    // PCS: "If the type, T, of the result of a function is such that
1240    // void func(T arg) would require that arg be passed as a value in a
1241    // register (or set of registers) according to the rules in 5.4, then the
1242    // result is returned in the same registers as would be used for such an
1243    // argument.
1244    //
1245    // Otherwise, the caller shall reserve a block of memory of sufficient
1246    // size and alignment to hold the result. The address of the memory block
1247    // shall be passed as an additional argument to the function in x8."
1248    //
1249    // This is implemented in two places. The register-return values are dealt
1250    // with here, more complex returns are passed as an sret parameter, which
1251    // means we don't have to worry about it during actual return.
1252    CCValAssign &VA = RVLocs[i];
1253    assert(VA.isRegLoc() && "Only register-returns should be created by PCS");
1254
1255
1256    SDValue Arg = OutVals[i];
1257
1258    // There's no convenient note in the ABI about this as there is for normal
1259    // arguments, but it says return values are passed in the same registers as
1260    // an argument would be. I believe that includes the comments about
1261    // unspecified higher bits, putting the burden of widening on the *caller*
1262    // for return values.
1263    switch (VA.getLocInfo()) {
1264    default: llvm_unreachable("Unknown loc info");
1265    case CCValAssign::Full: break;
1266    case CCValAssign::SExt:
1267    case CCValAssign::ZExt:
1268    case CCValAssign::AExt:
1269      // Floating-point values should only be extended when they're going into
1270      // memory, which can't happen here so an integer extend is acceptable.
1271      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1272      break;
1273    case CCValAssign::BCvt:
1274      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1275      break;
1276    }
1277
1278    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
1279    Flag = Chain.getValue(1);
1280    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1281  }
1282
1283  RetOps[0] = Chain;  // Update chain.
1284
1285  // Add the flag if we have it.
1286  if (Flag.getNode())
1287    RetOps.push_back(Flag);
1288
1289  return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
1290                     &RetOps[0], RetOps.size());
1291}
1292
1293SDValue
1294AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
1295                                 SmallVectorImpl<SDValue> &InVals) const {
1296  SelectionDAG &DAG                     = CLI.DAG;
1297  SDLoc &dl                             = CLI.DL;
1298  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1299  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1300  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1301  SDValue Chain                         = CLI.Chain;
1302  SDValue Callee                        = CLI.Callee;
1303  bool &IsTailCall                      = CLI.IsTailCall;
1304  CallingConv::ID CallConv              = CLI.CallConv;
1305  bool IsVarArg                         = CLI.IsVarArg;
1306
1307  MachineFunction &MF = DAG.getMachineFunction();
1308  AArch64MachineFunctionInfo *FuncInfo
1309    = MF.getInfo<AArch64MachineFunctionInfo>();
1310  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
1311  bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
1312  bool IsSibCall = false;
1313
1314  if (IsTailCall) {
1315    IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1316                    IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1317                                                   Outs, OutVals, Ins, DAG);
1318
1319    // A sibling call is one where we're under the usual C ABI and not planning
1320    // to change that but can still do a tail call:
1321    if (!TailCallOpt && IsTailCall)
1322      IsSibCall = true;
1323  }
1324
1325  SmallVector<CCValAssign, 16> ArgLocs;
1326  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
1327                 getTargetMachine(), ArgLocs, *DAG.getContext());
1328  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1329
1330  // On AArch64 (and all other architectures I'm aware of) the most this has to
1331  // do is adjust the stack pointer.
1332  unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16);
1333  if (IsSibCall) {
1334    // Since we're not changing the ABI to make this a tail call, the memory
1335    // operands are already available in the caller's incoming argument space.
1336    NumBytes = 0;
1337  }
1338
1339  // FPDiff is the byte offset of the call's argument area from the callee's.
1340  // Stores to callee stack arguments will be placed in FixedStackSlots offset
1341  // by this amount for a tail call. In a sibling call it must be 0 because the
1342  // caller will deallocate the entire stack and the callee still expects its
1343  // arguments to begin at SP+0. Completely unused for non-tail calls.
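  // For example: if our own incoming argument area is 32 bytes but this call
  // needs 48 bytes of stack arguments, FPDiff is -16 and the epilogue will
  // have to adjust sp accordingly before branching to the callee.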
1344  int FPDiff = 0;
1345
1346  if (IsTailCall && !IsSibCall) {
1347    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
1348
1349    // FPDiff will be negative if this tail call requires more space than we
1350    // would automatically have in our incoming argument space. Positive if we
1351    // can actually shrink the stack.
1352    FPDiff = NumReusableBytes - NumBytes;
1353
1354    // The stack pointer must be 16-byte aligned at all times it's used for a
1355    // memory operation, which in practice means at *all* times and in
1356    // particular across call boundaries. Therefore our own arguments started at
1357    // a 16-byte aligned SP and the delta applied for the tail call should
1358    // satisfy the same constraint.
1359    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
1360  }
1361
1362  if (!IsSibCall)
1363    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
1364                                 dl);
1365
1366  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
1367                                        getPointerTy());
1368
1369  SmallVector<SDValue, 8> MemOpChains;
1370  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1371
1372  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1373    CCValAssign &VA = ArgLocs[i];
1374    ISD::ArgFlagsTy Flags = Outs[i].Flags;
1375    SDValue Arg = OutVals[i];
1376
1377    // Callee does the actual widening, so all extensions just use an implicit
1378    // definition of the rest of the Loc. Aesthetically, this would be nicer as
1379    // an ANY_EXTEND, but that isn't valid for floating-point types and this
1380    // alternative works on integer types too.
1381    switch (VA.getLocInfo()) {
1382    default: llvm_unreachable("Unknown loc info!");
1383    case CCValAssign::Full: break;
1384    case CCValAssign::SExt:
1385    case CCValAssign::ZExt:
1386    case CCValAssign::AExt: {
1387      unsigned SrcSize = VA.getValVT().getSizeInBits();
1388      unsigned SrcSubReg;
1389
1390      switch (SrcSize) {
1391      case 8: SrcSubReg = AArch64::sub_8; break;
1392      case 16: SrcSubReg = AArch64::sub_16; break;
1393      case 32: SrcSubReg = AArch64::sub_32; break;
1394      case 64: SrcSubReg = AArch64::sub_64; break;
1395      default: llvm_unreachable("Unexpected argument promotion");
1396      }
1397
1398      Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
1399                                    VA.getLocVT(),
1400                                    DAG.getUNDEF(VA.getLocVT()),
1401                                    Arg,
1402                                    DAG.getTargetConstant(SrcSubReg, MVT::i32)),
1403                    0);
1404
1405      break;
1406    }
1407    case CCValAssign::BCvt:
1408      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1409      break;
1410    }
1411
1412    if (VA.isRegLoc()) {
1413      // A normal register (sub-) argument. For now we just note it down because
1414      // we want to copy things into registers as late as possible to avoid
1415      // register pressure (and possibly worse).
1416      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1417      continue;
1418    }
1419
1420    assert(VA.isMemLoc() && "unexpected argument location");
1421
1422    SDValue DstAddr;
1423    MachinePointerInfo DstInfo;
1424    if (IsTailCall) {
1425      uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() :
1426                                          VA.getLocVT().getSizeInBits();
1427      OpSize = (OpSize + 7) / 8;
1428      int32_t Offset = VA.getLocMemOffset() + FPDiff;
1429      int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
1430
1431      DstAddr = DAG.getFrameIndex(FI, getPointerTy());
1432      DstInfo = MachinePointerInfo::getFixedStack(FI);
1433
1434      // Make sure any stack arguments overlapping with where we're storing are
1435      // loaded before this eventual operation. Otherwise they'll be clobbered.
1436      Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
1437    } else {
1438      SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset());
1439
1440      DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1441      DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
1442    }
1443
1444    if (Flags.isByVal()) {
1445      SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64);
1446      SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode,
1447                                  Flags.getByValAlign(),
1448                                  /*isVolatile = */ false,
1449                                  /*alwaysInline = */ false,
1450                                  DstInfo, MachinePointerInfo(0));
1451      MemOpChains.push_back(Cpy);
1452    } else {
1453      // Normal stack argument, put it where it's needed.
1454      SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo,
1455                                   false, false, 0);
1456      MemOpChains.push_back(Store);
1457    }
1458  }
1459
1460  // The loads and stores generated above shouldn't clash with each
1461  // other. Combining them with this TokenFactor notes that fact for the rest of
1462  // the backend.
1463  if (!MemOpChains.empty())
1464    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1465                        &MemOpChains[0], MemOpChains.size());
1466
1467  // Most of the rest of the instructions need to be glued together; we don't
1468  // want assignments to actual registers used by a call to be rearranged by a
1469  // well-meaning scheduler.
1470  SDValue InFlag;
1471
1472  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1473    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1474                             RegsToPass[i].second, InFlag);
1475    InFlag = Chain.getValue(1);
1476  }
1477
1478  // The linker is responsible for inserting veneers when necessary to put a
1479  // function call destination in range, so we don't need to bother with a
1480  // wrapper here.
1481  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1482    const GlobalValue *GV = G->getGlobal();
1483    Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
1484  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1485    const char *Sym = S->getSymbol();
1486    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
1487  }
1488
1489  // We don't usually want to end the call-sequence here because we would tidy
1490  // the frame up *after* the call; however, in the ABI-changing tail-call case
1491  // we've carefully laid out the parameters so that when sp is reset they'll be
1492  // in the correct location.
1493  if (IsTailCall && !IsSibCall) {
1494    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1495                               DAG.getIntPtrConstant(0, true), InFlag, dl);
1496    InFlag = Chain.getValue(1);
1497  }
1498
1499  // We produce the following DAG scheme for the actual call instruction:
1500  //     (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag?)
1501  //
1502  // Most arguments aren't going to be used and just keep the values live as
1503  // far as LLVM is concerned. It's expected to be selected as simply "bl
1504  // callee" (for a direct, non-tail call).
1505  std::vector<SDValue> Ops;
1506  Ops.push_back(Chain);
1507  Ops.push_back(Callee);
1508
1509  if (IsTailCall) {
1510    // Each tail call may have to adjust the stack by a different amount, so
1511    // this information must travel along with the operation for eventual
1512    // consumption by emitEpilogue.
1513    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
1514  }
1515
1516  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1517    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1518                                  RegsToPass[i].second.getValueType()));
1519
1520
1521  // Add a register mask operand representing the call-preserved registers. This
1522  // is used later in codegen to constrain register-allocation.
1523  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
1524  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
1525  assert(Mask && "Missing call preserved mask for calling convention");
1526  Ops.push_back(DAG.getRegisterMask(Mask));
1527
1528  // If we needed glue, put it in as the last argument.
1529  if (InFlag.getNode())
1530    Ops.push_back(InFlag);
1531
1532  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1533
1534  if (IsTailCall) {
1535    return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
1536  }
1537
1538  Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size());
1539  InFlag = Chain.getValue(1);
1540
1541  // Now we can reclaim the stack; we may as well do it before working out
1542  // where our return value is.
1543  if (!IsSibCall) {
1544    uint64_t CalleePopBytes
1545      = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0;
1546
1547    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1548                               DAG.getIntPtrConstant(CalleePopBytes, true),
1549                               InFlag, dl);
1550    InFlag = Chain.getValue(1);
1551  }
1552
1553  return LowerCallResult(Chain, InFlag, CallConv,
1554                         IsVarArg, Ins, dl, DAG, InVals);
1555}
1556
1557SDValue
1558AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1559                                      CallingConv::ID CallConv, bool IsVarArg,
1560                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1561                                      SDLoc dl, SelectionDAG &DAG,
1562                                      SmallVectorImpl<SDValue> &InVals) const {
1563  // Assign locations to each value returned by this call.
1564  SmallVector<CCValAssign, 16> RVLocs;
1565  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
1566                 getTargetMachine(), RVLocs, *DAG.getContext());
1567  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv));
1568
1569  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1570    CCValAssign VA = RVLocs[i];
1571
1572    // Return values that are too big to fit into registers should use an sret
1573    // pointer, so this can be a lot simpler than the main argument code.
1574    assert(VA.isRegLoc() && "Memory locations not expected for call return");
1575
1576    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1577                                     InFlag);
1578    Chain = Val.getValue(1);
1579    InFlag = Val.getValue(2);
1580
1581    switch (VA.getLocInfo()) {
1582    default: llvm_unreachable("Unknown loc info!");
1583    case CCValAssign::Full: break;
1584    case CCValAssign::BCvt:
1585      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1586      break;
1587    case CCValAssign::ZExt:
1588    case CCValAssign::SExt:
1589    case CCValAssign::AExt:
1590      // Floating-point arguments only get extended/truncated if they're going
1591      // in memory, so using the integer operation is acceptable here.
1592      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
1593      break;
1594    }
1595
1596    InVals.push_back(Val);
1597  }
1598
1599  return Chain;
1600}
1601
1602bool
1603AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
1604                                    CallingConv::ID CalleeCC,
1605                                    bool IsVarArg,
1606                                    bool IsCalleeStructRet,
1607                                    bool IsCallerStructRet,
1608                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
1609                                    const SmallVectorImpl<SDValue> &OutVals,
1610                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1611                                    SelectionDAG& DAG) const {
1612
1613  // For CallingConv::C this function knows whether the ABI needs
1614  // changing. That's not true for other conventions so they will have to opt in
1615  // manually.
1616  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
1617    return false;
1618
1619  const MachineFunction &MF = DAG.getMachineFunction();
1620  const Function *CallerF = MF.getFunction();
1621  CallingConv::ID CallerCC = CallerF->getCallingConv();
1622  bool CCMatch = CallerCC == CalleeCC;
1623
1624  // Byval parameters hand the function a pointer directly into the stack area
1625  // we want to reuse during a tail call. Working around this *is* possible (see
1626  // X86) but less efficient and uglier in LowerCall.
1627  for (Function::const_arg_iterator i = CallerF->arg_begin(),
1628         e = CallerF->arg_end(); i != e; ++i)
1629    if (i->hasByValAttr())
1630      return false;
1631
1632  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
1633    if (IsTailCallConvention(CalleeCC) && CCMatch)
1634      return true;
1635    return false;
1636  }
1637
1638  // Now we search for cases where we can use a tail call without changing the
1639  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
1640  // concept.
1641
1642  // I want anyone implementing a new calling convention to think long and hard
1643  // about this assert.
1644  assert((!IsVarArg || CalleeCC == CallingConv::C)
1645         && "Unexpected variadic calling convention");
1646
1647  if (IsVarArg && !Outs.empty()) {
1648    // At least two cases here: if caller is fastcc then we can't have any
1649    // memory arguments (we'd be expected to clean up the stack afterwards). If
1650    // caller is C then we could potentially use its argument area.
1651
1652    // FIXME: for now we take the most conservative of these in both cases:
1653    // disallow all variadic memory operands.
1654    SmallVector<CCValAssign, 16> ArgLocs;
1655    CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
1656                   getTargetMachine(), ArgLocs, *DAG.getContext());
1657
1658    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
1659    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
1660      if (!ArgLocs[i].isRegLoc())
1661        return false;
1662  }
1663
1664  // If the calling conventions do not match, then we'd better make sure the
1665  // results are returned in the same way as what the caller expects.
1666  if (!CCMatch) {
1667    SmallVector<CCValAssign, 16> RVLocs1;
1668    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
1669                    getTargetMachine(), RVLocs1, *DAG.getContext());
1670    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC));
1671
1672    SmallVector<CCValAssign, 16> RVLocs2;
1673    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
1674                    getTargetMachine(), RVLocs2, *DAG.getContext());
1675    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC));
1676
1677    if (RVLocs1.size() != RVLocs2.size())
1678      return false;
1679    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
1680      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
1681        return false;
1682      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
1683        return false;
1684      if (RVLocs1[i].isRegLoc()) {
1685        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
1686          return false;
1687      } else {
1688        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
1689          return false;
1690      }
1691    }
1692  }
1693
1694  // Nothing more to check if the callee is taking no arguments
1695  if (Outs.empty())
1696    return true;
1697
1698  SmallVector<CCValAssign, 16> ArgLocs;
1699  CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
1700                 getTargetMachine(), ArgLocs, *DAG.getContext());
1701
1702  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
1703
1704  const AArch64MachineFunctionInfo *FuncInfo
1705    = MF.getInfo<AArch64MachineFunctionInfo>();
1706
1707  // If the stack arguments for this call would fit into our own save area then
1708  // the call can be made tail.
1709  return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
1710}
1711
1712bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
1713                                                   bool TailCallOpt) const {
1714  return CallCC == CallingConv::Fast && TailCallOpt;
1715}
1716
1717bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
1718  return CallCC == CallingConv::Fast;
1719}
1720
1721SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
1722                                                   SelectionDAG &DAG,
1723                                                   MachineFrameInfo *MFI,
1724                                                   int ClobberedFI) const {
1725  SmallVector<SDValue, 8> ArgChains;
1726  int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
1727  int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
1728
1729  // Include the original chain at the beginning of the list. When this is
1730  // used by target LowerCall hooks, this helps legalize find the
1731  // CALLSEQ_BEGIN node.
1732  ArgChains.push_back(Chain);
1733
1734  // Add a chain value for each argument load that overlaps the clobbered slot.
1735  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1736         UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U)
1737    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
1738      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
1739        if (FI->getIndex() < 0) {
1740          int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
1741          int64_t InLastByte = InFirstByte;
1742          InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
1743
1744          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1745              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1746            ArgChains.push_back(SDValue(L, 1));
1747        }
1748
1749  // Build a TokenFactor for all the chains.
1750  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
1751                     &ArgChains[0], ArgChains.size());
1752}
1753
1754static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) {
1755  switch (CC) {
1756  case ISD::SETEQ:  return A64CC::EQ;
1757  case ISD::SETGT:  return A64CC::GT;
1758  case ISD::SETGE:  return A64CC::GE;
1759  case ISD::SETLT:  return A64CC::LT;
1760  case ISD::SETLE:  return A64CC::LE;
1761  case ISD::SETNE:  return A64CC::NE;
1762  case ISD::SETUGT: return A64CC::HI;
1763  case ISD::SETUGE: return A64CC::HS;
1764  case ISD::SETULT: return A64CC::LO;
1765  case ISD::SETULE: return A64CC::LS;
1766  default: llvm_unreachable("Unexpected condition code");
1767  }
1768}
1769
1770bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const {
1771  // icmp is implemented using adds/subs immediate, which take an unsigned
1772  // 12-bit immediate, optionally shifted left by 12 bits.
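  // That is, |Val| must be either a plain 12-bit value (0 to 4095) or such a
  // value shifted left by 12 (e.g. 0x7ff000); something like 4097 needs bits
  // from both halves and is rejected.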
1773
1774  // Symmetric by using adds/subs
1775  if (Val < 0)
1776    Val = -Val;
1777
1778  return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0;
1779}
1780
1781SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS,
1782                                        ISD::CondCode CC, SDValue &A64cc,
1783                                        SelectionDAG &DAG, SDLoc &dl) const {
1784  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1785    int64_t C = 0;
1786    EVT VT = RHSC->getValueType(0);
1787    bool knownInvalid = false;
1788
1789    // I'm not convinced the rest of LLVM handles these edge cases properly, but
1790    // we can at least get it right.
1791    if (isSignedIntSetCC(CC)) {
1792      C = RHSC->getSExtValue();
1793    } else if (RHSC->getZExtValue() > INT64_MAX) {
1794      // A 64-bit constant not representable by a signed 64-bit integer is far
1795      // too big to fit into a SUBS immediate anyway.
1796      knownInvalid = true;
1797    } else {
1798      C = RHSC->getZExtValue();
1799    }
1800
1801    if (!knownInvalid && !isLegalICmpImmediate(C)) {
1802      // Constant does not fit, try adjusting it by one?
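      // e.g. (x < 0x1001): 0x1001 isn't encodable, but the equivalent test
      // (x <= 0x1000) is, since 0x1000 fits the shifted immediate form.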
1803      switch (CC) {
1804      default: break;
1805      case ISD::SETLT:
1806      case ISD::SETGE:
1807        if (isLegalICmpImmediate(C-1)) {
1808          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1809          RHS = DAG.getConstant(C-1, VT);
1810        }
1811        break;
1812      case ISD::SETULT:
1813      case ISD::SETUGE:
1814        if (isLegalICmpImmediate(C-1)) {
1815          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1816          RHS = DAG.getConstant(C-1, VT);
1817        }
1818        break;
1819      case ISD::SETLE:
1820      case ISD::SETGT:
1821        if (isLegalICmpImmediate(C+1)) {
1822          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1823          RHS = DAG.getConstant(C+1, VT);
1824        }
1825        break;
1826      case ISD::SETULE:
1827      case ISD::SETUGT:
1828        if (isLegalICmpImmediate(C+1)) {
1829          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1830          RHS = DAG.getConstant(C+1, VT);
1831        }
1832        break;
1833      }
1834    }
1835  }
1836
1837  A64CC::CondCodes CondCode = IntCCToA64CC(CC);
1838  A64cc = DAG.getConstant(CondCode, MVT::i32);
1839  return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
1840                     DAG.getCondCode(CC));
1841}
1842
1843static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC,
1844                                    A64CC::CondCodes &Alternative) {
1845  A64CC::CondCodes CondCode = A64CC::Invalid;
1846  Alternative = A64CC::Invalid;
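  // Most conditions map to a single A64 condition code; SETONE and SETUEQ need
  // a second check, returned via Alternative (e.g. ONE becomes "MI or GT":
  // ordered less-than or greater-than).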
1847
1848  switch (CC) {
1849  default: llvm_unreachable("Unknown FP condition!");
1850  case ISD::SETEQ:
1851  case ISD::SETOEQ: CondCode = A64CC::EQ; break;
1852  case ISD::SETGT:
1853  case ISD::SETOGT: CondCode = A64CC::GT; break;
1854  case ISD::SETGE:
1855  case ISD::SETOGE: CondCode = A64CC::GE; break;
1856  case ISD::SETOLT: CondCode = A64CC::MI; break;
1857  case ISD::SETOLE: CondCode = A64CC::LS; break;
1858  case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break;
1859  case ISD::SETO:   CondCode = A64CC::VC; break;
1860  case ISD::SETUO:  CondCode = A64CC::VS; break;
1861  case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break;
1862  case ISD::SETUGT: CondCode = A64CC::HI; break;
1863  case ISD::SETUGE: CondCode = A64CC::PL; break;
1864  case ISD::SETLT:
1865  case ISD::SETULT: CondCode = A64CC::LT; break;
1866  case ISD::SETLE:
1867  case ISD::SETULE: CondCode = A64CC::LE; break;
1868  case ISD::SETNE:
1869  case ISD::SETUNE: CondCode = A64CC::NE; break;
1870  }
1871  return CondCode;
1872}
1873
1874SDValue
1875AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
1876  SDLoc DL(Op);
1877  EVT PtrVT = getPointerTy();
1878  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
1879
1880  switch(getTargetMachine().getCodeModel()) {
1881  case CodeModel::Small:
1882    // The most efficient code is PC-relative anyway for the small memory model,
1883    // so we don't need to worry about relocation model.
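    // This should select to a PC-relative ADRP/ADD pair, e.g.:
    //   ADRP x0, .Ltmp
    //   ADD  x0, x0, #:lo12:.Ltmp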
1884    return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
1885                       DAG.getTargetBlockAddress(BA, PtrVT, 0,
1886                                                 AArch64II::MO_NO_FLAG),
1887                       DAG.getTargetBlockAddress(BA, PtrVT, 0,
1888                                                 AArch64II::MO_LO12),
1889                       DAG.getConstant(/*Alignment=*/ 4, MVT::i32));
1890  case CodeModel::Large:
1891    return DAG.getNode(
1892      AArch64ISD::WrapperLarge, DL, PtrVT,
1893      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3),
1894      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
1895      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
1896      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
1897  default:
1898    llvm_unreachable("Only small and large code models supported now");
1899  }
1900}
1901
1902
1903// (BRCOND chain, val, dest)
1904SDValue
1905AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1906  SDLoc dl(Op);
1907  SDValue Chain = Op.getOperand(0);
1908  SDValue TheBit = Op.getOperand(1);
1909  SDValue DestBB = Op.getOperand(2);
1910
1911  // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
1912  // that as the consumer we are responsible for ignoring rubbish in higher
1913  // bits.
1914  TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
1915                       DAG.getConstant(1, MVT::i32));
1916
1917  SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
1918                               DAG.getConstant(0, TheBit.getValueType()),
1919                               DAG.getCondCode(ISD::SETNE));
1920
1921  return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain,
1922                     A64CMP, DAG.getConstant(A64CC::NE, MVT::i32),
1923                     DestBB);
1924}
1925
1926// (BR_CC chain, condcode, lhs, rhs, dest)
1927SDValue
1928AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
1929  SDLoc dl(Op);
1930  SDValue Chain = Op.getOperand(0);
1931  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
1932  SDValue LHS = Op.getOperand(2);
1933  SDValue RHS = Op.getOperand(3);
1934  SDValue DestBB = Op.getOperand(4);
1935
1936  if (LHS.getValueType() == MVT::f128) {
1937    // f128 comparisons are lowered to runtime calls by a routine which sets
1938    // LHS, RHS and CC appropriately for the rest of this function to continue.
1939    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
1940
1941    // If softenSetCCOperands returned a scalar, we need to compare the result
1942    // against zero to select between true and false values.
1943    if (RHS.getNode() == 0) {
1944      RHS = DAG.getConstant(0, LHS.getValueType());
1945      CC = ISD::SETNE;
1946    }
1947  }
1948
1949  if (LHS.getValueType().isInteger()) {
1950    SDValue A64cc;
1951
1952    // Integers are handled in a separate function because the combinations of
1953    // immediates and tests can get hairy and we may want to fiddle things.
1954    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
1955
1956    return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
1957                       Chain, CmpOp, A64cc, DestBB);
1958  }
1959
1960  // Note that some LLVM floating-point CondCodes can't be lowered to a single
1961  // conditional branch, hence FPCCToA64CC can set a second test, where either
1962  // passing is sufficient.
1963  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
1964  CondCode = FPCCToA64CC(CC, Alternative);
1965  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
1966  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
1967                              DAG.getCondCode(CC));
1968  SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
1969                                 Chain, SetCC, A64cc, DestBB);
1970
1971  if (Alternative != A64CC::Invalid) {
1972    A64cc = DAG.getConstant(Alternative, MVT::i32);
1973    A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
1974                           A64BR_CC, SetCC, A64cc, DestBB);
1975
1976  }
1977
1978  return A64BR_CC;
1979}
1980
1981SDValue
1982AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
1983                                       RTLIB::Libcall Call) const {
1984  ArgListTy Args;
1985  ArgListEntry Entry;
1986  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
1987    EVT ArgVT = Op.getOperand(i).getValueType();
1988    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
1989    Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy;
1990    Entry.isSExt = false;
1991    Entry.isZExt = false;
1992    Args.push_back(Entry);
1993  }
1994  SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy());
1995
1996  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
1997
1998  // By default, the input chain to this libcall is the entry node of the
1999  // function. If the libcall is going to be emitted as a tail call then
2000  // isUsedByReturnOnly will change it to the right chain if the return
2001  // node which is being folded has a non-entry input chain.
2002  SDValue InChain = DAG.getEntryNode();
2003
2004  // isTailCall may be true since the callee does not reference the caller's
2005  // stack frame. Check that the call is actually in tail position.
2006  SDValue TCChain = InChain;
2007  bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain);
2008  if (isTailCall)
2009    InChain = TCChain;
2010
2011  TargetLowering::
2012  CallLoweringInfo CLI(InChain, RetTy, false, false, false, false,
2013                    0, getLibcallCallingConv(Call), isTailCall,
2014                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
2015                    Callee, Args, DAG, SDLoc(Op));
2016  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
2017
2018  if (!CallInfo.second.getNode())
2019    // It's a tailcall, return the chain (which is the DAG root).
2020    return DAG.getRoot();
2021
2022  return CallInfo.first;
2023}
2024
2025SDValue
2026AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
2027  if (Op.getOperand(0).getValueType() != MVT::f128) {
2028    // It's legal except when f128 is involved
2029    return Op;
2030  }
2031
2032  RTLIB::Libcall LC;
2033  LC  = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2034
2035  SDValue SrcVal = Op.getOperand(0);
2036  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
2037                     /*isSigned*/ false, SDLoc(Op)).first;
2038}
2039
2040SDValue
2041AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
2042  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2043
2044  RTLIB::Libcall LC;
2045  LC  = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2046
2047  return LowerF128ToCall(Op, DAG, LC);
2048}
2049
2050SDValue
2051AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2052                                      bool IsSigned) const {
2053  if (Op.getOperand(0).getValueType() != MVT::f128) {
2054    // It's legal except when f128 is involved
2055    return Op;
2056  }
2057
2058  RTLIB::Libcall LC;
2059  if (IsSigned)
2060    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2061  else
2062    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2063
2064  return LowerF128ToCall(Op, DAG, LC);
2065}
2066
2067SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
2068  MachineFunction &MF = DAG.getMachineFunction();
2069  MachineFrameInfo *MFI = MF.getFrameInfo();
2070  MFI->setReturnAddressIsTaken(true);
2071
2072  EVT VT = Op.getValueType();
2073  SDLoc dl(Op);
2074  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2075  if (Depth) {
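    // A frame record is the pair {previous FP, saved LR}, so once we have the
    // right frame pointer the caller's return address sits 8 bytes above it.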
2076    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
2077    SDValue Offset = DAG.getConstant(8, MVT::i64);
2078    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
2079                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
2080                       MachinePointerInfo(), false, false, false, 0);
2081  }
2082
2083  // Return X30, which contains the return address. Mark it an implicit live-in.
2084  unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64));
2085  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64);
2086}
2087
2088
2089SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG)
2090                                              const {
2091  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
2092  MFI->setFrameAddressIsTaken(true);
2093
2094  EVT VT = Op.getValueType();
2095  SDLoc dl(Op);
2096  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
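  // X29 is the frame pointer; each frame record starts with the previous
  // frame's FP, so loading through it repeatedly walks up the call stack.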
2097  unsigned FrameReg = AArch64::X29;
2098  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
2099  while (Depth--)
2100    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
2101                            MachinePointerInfo(),
2102                            false, false, false, 0);
2103  return FrameAddr;
2104}
2105
2106SDValue
2107AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
2108                                                  SelectionDAG &DAG) const {
2109  assert(getTargetMachine().getCodeModel() == CodeModel::Large);
2110  assert(getTargetMachine().getRelocationModel() == Reloc::Static);
2111
2112  EVT PtrVT = getPointerTy();
2113  SDLoc dl(Op);
2114  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
2115  const GlobalValue *GV = GN->getGlobal();
2116
2117  SDValue GlobalAddr = DAG.getNode(
2118      AArch64ISD::WrapperLarge, dl, PtrVT,
2119      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3),
2120      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
2121      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
2122      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
2123
2124  if (GN->getOffset() != 0)
2125    return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
2126                       DAG.getConstant(GN->getOffset(), PtrVT));
2127
2128  return GlobalAddr;
2129}
2130
2131SDValue
2132AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op,
2133                                                  SelectionDAG &DAG) const {
2134  assert(getTargetMachine().getCodeModel() == CodeModel::Small);
2135
2136  EVT PtrVT = getPointerTy();
2137  SDLoc dl(Op);
2138  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
2139  const GlobalValue *GV = GN->getGlobal();
2140  unsigned Alignment = GV->getAlignment();
2141  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2142  if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) {
2143    // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate
2144    // to zero when they remain undefined. In PIC mode the GOT can take care of
2145    // this, but in absolute mode we use a constant pool load.
2146    SDValue PoolAddr;
2147    PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2148                           DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
2149                                                     AArch64II::MO_NO_FLAG),
2150                           DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
2151                                                     AArch64II::MO_LO12),
2152                           DAG.getConstant(8, MVT::i32));
2153    SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
2154                                     MachinePointerInfo::getConstantPool(),
2155                                     /*isVolatile=*/ false,
2156                                     /*isNonTemporal=*/ true,
2157                                     /*isInvariant=*/ true, 8);
2158    if (GN->getOffset() != 0)
2159      return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
2160                         DAG.getConstant(GN->getOffset(), PtrVT));
2161
2162    return GlobalAddr;
2163  }
2164
2165  if (Alignment == 0) {
2166    const PointerType *GVPtrTy = cast<PointerType>(GV->getType());
2167    if (GVPtrTy->getElementType()->isSized()) {
2168      Alignment
2169        = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType());
2170    } else {
2171      // Be conservative if we can't guess, not that it really matters:
2172      // functions and labels aren't valid for loads, and the methods used to
2173      // actually calculate an address work with any alignment.
2174      Alignment = 1;
2175    }
2176  }
2177
2178  unsigned char HiFixup, LoFixup;
2179  bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM);
2180
2181  if (UseGOT) {
2182    HiFixup = AArch64II::MO_GOT;
2183    LoFixup = AArch64II::MO_GOT_LO12;
2184    Alignment = 8;
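    // GOT entries are 8-byte slots, so the address is at least 8-byte aligned
    // and the LO12 fixup feeds an LDR from that slot rather than an ADD.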
2185  } else {
2186    HiFixup = AArch64II::MO_NO_FLAG;
2187    LoFixup = AArch64II::MO_LO12;
2188  }
2189
2190  // AArch64's small model demands the following sequence:
2191  // ADRP x0, somewhere
2192  // ADD x0, x0, #:lo12:somewhere ; (or LDR directly).
2193  SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2194                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
2195                                                             HiFixup),
2196                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
2197                                                             LoFixup),
2198                                  DAG.getConstant(Alignment, MVT::i32));
2199
2200  if (UseGOT) {
2201    GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(),
2202                            GlobalRef);
2203  }
2204
2205  if (GN->getOffset() != 0)
2206    return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef,
2207                       DAG.getConstant(GN->getOffset(), PtrVT));
2208
2209  return GlobalRef;
2210}
2211
2212SDValue
2213AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
2214                                             SelectionDAG &DAG) const {
2215  // TableGen doesn't have easy access to the CodeModel or RelocationModel, so
2216  // we make those distinctions here.
2217
2218  switch (getTargetMachine().getCodeModel()) {
2219  case CodeModel::Small:
2220    return LowerGlobalAddressELFSmall(Op, DAG);
2221  case CodeModel::Large:
2222    return LowerGlobalAddressELFLarge(Op, DAG);
2223  default:
2224    llvm_unreachable("Only small and large code models supported now");
2225  }
2226}
2227
2228SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
2229                                                SDValue DescAddr,
2230                                                SDLoc DL,
2231                                                SelectionDAG &DAG) const {
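  // The full TLS-descriptor sequence we expect to end up with is roughly:
  //   ADRP x0, :tlsdesc:var
  //   LDR  x1, [x0, #:tlsdesc_lo12:var]
  //   ADD  x0, x0, #:tlsdesc_lo12:var
  //   .tlsdesccall var
  //   BLR  x1
  // leaving the variable's offset from TPIDR_EL0 in x0.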
2232  EVT PtrVT = getPointerTy();
2233
2234  // The function we need to call is simply the first entry in the GOT for this
2235  // descriptor; load it in preparation.
2236  SDValue Func, Chain;
2237  Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
2238                     DescAddr);
2239
2240  // The function takes only one argument: the address of the descriptor itself
2241  // in X0.
2242  SDValue Glue;
2243  Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
2244  Glue = Chain.getValue(1);
2245
2246  // Finally, there's a special calling-convention which means that the lookup
2247  // must preserve all registers (except X0, obviously).
2248  const TargetRegisterInfo *TRI  = getTargetMachine().getRegisterInfo();
2249  const AArch64RegisterInfo *A64RI
2250    = static_cast<const AArch64RegisterInfo *>(TRI);
2251  const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask();
2252
2253  // We're now ready to populate the argument list, as with a normal call:
2254  std::vector<SDValue> Ops;
2255  Ops.push_back(Chain);
2256  Ops.push_back(Func);
2257  Ops.push_back(SymAddr);
2258  Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
2259  Ops.push_back(DAG.getRegisterMask(Mask));
2260  Ops.push_back(Glue);
2261
2262  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2263  Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0],
2264                      Ops.size());
2265  Glue = Chain.getValue(1);
2266
2267  // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it
2268  // back to the generic handling code.
2269  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
2270}
2271
2272SDValue
2273AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
2274                                             SelectionDAG &DAG) const {
2275  assert(getSubtarget()->isTargetELF() &&
2276         "TLS not implemented for non-ELF targets");
2277  assert(getTargetMachine().getCodeModel() == CodeModel::Small
2278         && "TLS only supported in small memory model");
2279  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2280
2281  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
2282
2283  SDValue TPOff;
2284  EVT PtrVT = getPointerTy();
2285  SDLoc DL(Op);
2286  const GlobalValue *GV = GA->getGlobal();
2287
2288  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
2289
2290  if (Model == TLSModel::InitialExec) {
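    // For initial-exec the thread-pointer-relative offset lives in the GOT, so
    // an ADRP/LDR of the :gottprel: entry loads it directly.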
2291    TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2292                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2293                                                   AArch64II::MO_GOTTPREL),
2294                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2295                                                   AArch64II::MO_GOTTPREL_LO12),
2296                        DAG.getConstant(8, MVT::i32));
2297    TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
2298                        TPOff);
2299  } else if (Model == TLSModel::LocalExec) {
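    // For local-exec the offset from TPIDR_EL0 is a link-time constant, so it
    // can be materialised directly with a MOVZ (:tprel_g1:) / MOVK
    // (:tprel_g0_nc:) pair.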
2300    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2301                                               AArch64II::MO_TPREL_G1);
2302    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2303                                               AArch64II::MO_TPREL_G0_NC);
2304
2305    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
2306                                       DAG.getTargetConstant(1, MVT::i32)), 0);
2307    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
2308                                       TPOff, LoVar,
2309                                       DAG.getTargetConstant(0, MVT::i32)), 0);
2310  } else if (Model == TLSModel::GeneralDynamic) {
2311    // Accesses used in this sequence go via the TLS descriptor which lives in
2312    // the GOT. Prepare an address we can use to handle this.
2313    SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2314                                                AArch64II::MO_TLSDESC);
2315    SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
2316                                                AArch64II::MO_TLSDESC_LO12);
2317    SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2318                                   HiDesc, LoDesc,
2319                                   DAG.getConstant(8, MVT::i32));
2320    SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0);
2321
2322    TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
2323  } else if (Model == TLSModel::LocalDynamic) {
2324    // Local-dynamic accesses proceed in two phases: a general-dynamic TLS
2325    // descriptor call against the special symbol _TLS_MODULE_BASE_ calculates
2326    // the beginning of the module's TLS region, and a DTPREL offset from that
2327    // base then gives the variable's address.
2328
2329    // These accesses will need deduplicating if there's more than one.
2330    AArch64MachineFunctionInfo *MFI = DAG.getMachineFunction()
2331      .getInfo<AArch64MachineFunctionInfo>();
2332    MFI->incNumLocalDynamicTLSAccesses();
2333
2334
2335    // Get the location of _TLS_MODULE_BASE_:
2336    SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
2337                                                AArch64II::MO_TLSDESC);
2338    SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
2339                                                AArch64II::MO_TLSDESC_LO12);
2340    SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
2341                                   HiDesc, LoDesc,
2342                                   DAG.getConstant(8, MVT::i32));
2343    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT);
2344
2345    ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
2346
2347    // Get the variable's offset from _TLS_MODULE_BASE_
2348    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2349                                               AArch64II::MO_DTPREL_G1);
2350    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
2351                                               AArch64II::MO_DTPREL_G0_NC);
2352
2353    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
2354                                       DAG.getTargetConstant(0, MVT::i32)), 0);
2355    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
2356                                       TPOff, LoVar,
2357                                       DAG.getTargetConstant(0, MVT::i32)), 0);
2358  } else
2359      llvm_unreachable("Unsupported TLS access model");
2360
2361
2362  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
2363}
2364
2365SDValue
2366AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2367                                      bool IsSigned) const {
2368  if (Op.getValueType() != MVT::f128) {
2369    // Legal for everything except f128.
2370    return Op;
2371  }
2372
2373  RTLIB::Libcall LC;
2374  if (IsSigned)
2375    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2376  else
2377    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2378
2379  return LowerF128ToCall(Op, DAG, LC);
2380}
2381
2382
2383SDValue
2384AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
2385  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
2386  SDLoc dl(JT);
2387  EVT PtrVT = getPointerTy();
2388
2389  // When compiling PIC, jump tables get put in the code section, so a static
2390  // relocation style is acceptable for both relocation models.
2391  switch (getTargetMachine().getCodeModel()) {
2392  case CodeModel::Small:
2393    return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
2394                       DAG.getTargetJumpTable(JT->getIndex(), PtrVT),
2395                       DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
2396                                              AArch64II::MO_LO12),
2397                       DAG.getConstant(1, MVT::i32));
2398  case CodeModel::Large:
2399    return DAG.getNode(
2400      AArch64ISD::WrapperLarge, dl, PtrVT,
2401      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3),
2402      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC),
2403      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC),
2404      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC));
2405  default:
2406    llvm_unreachable("Only small and large code models supported now");
2407  }
2408}
2409
2410// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
2411SDValue
2412AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
2413  SDLoc dl(Op);
2414  SDValue LHS = Op.getOperand(0);
2415  SDValue RHS = Op.getOperand(1);
2416  SDValue IfTrue = Op.getOperand(2);
2417  SDValue IfFalse = Op.getOperand(3);
2418  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
2419
2420  if (LHS.getValueType() == MVT::f128) {
2421    // f128 comparisons are lowered to libcalls, but slot in nicely here
2422    // afterwards.
2423    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
2424
2425    // If softenSetCCOperands returned a scalar, we need to compare the result
2426    // against zero to select between true and false values.
2427    if (RHS.getNode() == 0) {
2428      RHS = DAG.getConstant(0, LHS.getValueType());
2429      CC = ISD::SETNE;
2430    }
2431  }
2432
2433  if (LHS.getValueType().isInteger()) {
2434    SDValue A64cc;
2435
2436    // Integers are handled in a separate function because the combinations of
2437    // immediates and tests can get hairy and we may want to fiddle things.
2438    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
2439
2440    return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2441                       CmpOp, IfTrue, IfFalse, A64cc);
2442  }
2443
2444  // Note that some LLVM floating-point CondCodes can't be lowered to a single
2445  // conditional branch, hence FPCCToA64CC can set a second test, where either
2446  // passing is sufficient.
2447  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
2448  CondCode = FPCCToA64CC(CC, Alternative);
2449  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
2450  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
2451                              DAG.getCondCode(CC));
2452  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
2453                                     Op.getValueType(),
2454                                     SetCC, IfTrue, IfFalse, A64cc);
2455
2456  if (Alternative != A64CC::Invalid) {
2457    A64cc = DAG.getConstant(Alternative, MVT::i32);
2458    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2459                               SetCC, IfTrue, A64SELECT_CC, A64cc);
2460
2461  }
2462
2463  return A64SELECT_CC;
2464}
2465
2466// (SELECT testbit, iftrue, iffalse)
2467SDValue
2468AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2469  SDLoc dl(Op);
2470  SDValue TheBit = Op.getOperand(0);
2471  SDValue IfTrue = Op.getOperand(1);
2472  SDValue IfFalse = Op.getOperand(2);
2473
2474  // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
2475  // that as the consumer we are responsible for ignoring rubbish in higher
2476  // bits.
2477  TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
2478                       DAG.getConstant(1, MVT::i32));
2479  SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
2480                               DAG.getConstant(0, TheBit.getValueType()),
2481                               DAG.getCondCode(ISD::SETNE));
2482
2483  return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
2484                     A64CMP, IfTrue, IfFalse,
2485                     DAG.getConstant(A64CC::NE, MVT::i32));
2486}
2487
2488static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) {
2489  SDLoc DL(Op);
2490  SDValue LHS = Op.getOperand(0);
2491  SDValue RHS = Op.getOperand(1);
2492  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2493  EVT VT = Op.getValueType();
2494  bool Invert = false;
2495  SDValue Op0, Op1;
2496  unsigned Opcode;
2497
2498  if (LHS.getValueType().isInteger()) {
2499
2500    // Attempt to use Vector Integer Compare Mask Test instruction.
2501    // TST = icmp ne (and (op0, op1), zero).
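    // This should select to a NEON CMTST, which sets each lane to all-ones
    // when the AND of the corresponding lanes is non-zero.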
2502    if (CC == ISD::SETNE) {
2503      if (((LHS.getOpcode() == ISD::AND) &&
2504           ISD::isBuildVectorAllZeros(RHS.getNode())) ||
2505          ((RHS.getOpcode() == ISD::AND) &&
2506           ISD::isBuildVectorAllZeros(LHS.getNode()))) {
2507
2508        SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS;
2509        SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0));
2510        SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1));
2511        return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS);
2512      }
2513    }
2514
2515    // Attempt to use Vector Integer Compare Mask against Zero instr (Signed).
2516    // Note: Compare against Zero does not support unsigned predicates.
2517    if ((ISD::isBuildVectorAllZeros(RHS.getNode()) ||
2518         ISD::isBuildVectorAllZeros(LHS.getNode())) &&
2519        !isUnsignedIntSetCC(CC)) {
2520
2521      // If LHS is the zero value, swap operands and CondCode.
2522      if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
2523        CC = getSetCCSwappedOperands(CC);
2524        Op0 = RHS;
2525      } else
2526        Op0 = LHS;
2527
2528      // Ensure valid CondCode for Compare Mask against Zero instruction:
2529      // EQ, GE, GT, LE, LT.
2530      if (ISD::SETNE == CC) {
2531        Invert = true;
2532        CC = ISD::SETEQ;
2533      }
2534
2535      // Using constant type to differentiate integer and FP compares with zero.
2536      Op1 = DAG.getConstant(0, MVT::i32);
2537      Opcode = AArch64ISD::NEON_CMPZ;
2538
2539    } else {
2540      // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned).
2541      // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT.
2542      bool Swap = false;
2543      switch (CC) {
2544      default:
2545        llvm_unreachable("Illegal integer comparison.");
2546      case ISD::SETEQ:
2547      case ISD::SETGT:
2548      case ISD::SETGE:
2549      case ISD::SETUGT:
2550      case ISD::SETUGE:
2551        break;
2552      case ISD::SETNE:
2553        Invert = true;
2554        CC = ISD::SETEQ;
2555        break;
2556      case ISD::SETULT:
2557      case ISD::SETULE:
2558      case ISD::SETLT:
2559      case ISD::SETLE:
2560        Swap = true;
2561        CC = getSetCCSwappedOperands(CC);
2562      }
2563
2564      if (Swap)
2565        std::swap(LHS, RHS);
2566
2567      Opcode = AArch64ISD::NEON_CMP;
2568      Op0 = LHS;
2569      Op1 = RHS;
2570    }
2571
2572    // Generate Compare Mask instr or Compare Mask against Zero instr.
2573    SDValue NeonCmp =
2574        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
2575
2576    if (Invert)
2577      NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
2578
2579    return NeonCmp;
2580  }
2581
2582  // Now handle Floating Point cases.
2583  // Attempt to use Vector Floating Point Compare Mask against Zero instruction.
2584  if (ISD::isBuildVectorAllZeros(RHS.getNode()) ||
2585      ISD::isBuildVectorAllZeros(LHS.getNode())) {
2586
2587    // If LHS is the zero value, swap operands and CondCode.
2588    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
2589      CC = getSetCCSwappedOperands(CC);
2590      Op0 = RHS;
2591    } else
2592      Op0 = LHS;
2593
2594    // Using constant type to differentiate integer and FP compares with zero.
2595    Op1 = DAG.getConstantFP(0, MVT::f32);
2596    Opcode = AArch64ISD::NEON_CMPZ;
2597  } else {
2598    // Attempt to use Vector Floating Point Compare Mask instruction.
2599    Op0 = LHS;
2600    Op1 = RHS;
2601    Opcode = AArch64ISD::NEON_CMP;
2602  }
2603
2604  SDValue NeonCmpAlt;
2605  // Some register compares have to be implemented with swapped CC and operands,
2606  // e.g.: OLT implemented as OGT with swapped operands.
2607  bool SwapIfRegArgs = false;
2608
2609  // Ensure valid CondCode for FP Compare Mask against Zero instruction:
2610  // EQ, GE, GT, LE, LT.
2611  // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT.
2612  switch (CC) {
2613  default:
2614    llvm_unreachable("Illegal FP comparison");
2615  case ISD::SETUNE:
2616  case ISD::SETNE:
2617    Invert = true; // Fallthrough
2618  case ISD::SETOEQ:
2619  case ISD::SETEQ:
2620    CC = ISD::SETEQ;
2621    break;
2622  case ISD::SETOLT:
2623  case ISD::SETLT:
2624    CC = ISD::SETLT;
2625    SwapIfRegArgs = true;
2626    break;
2627  case ISD::SETOGT:
2628  case ISD::SETGT:
2629    CC = ISD::SETGT;
2630    break;
2631  case ISD::SETOLE:
2632  case ISD::SETLE:
2633    CC = ISD::SETLE;
2634    SwapIfRegArgs = true;
2635    break;
2636  case ISD::SETOGE:
2637  case ISD::SETGE:
2638    CC = ISD::SETGE;
2639    break;
2640  case ISD::SETUGE:
2641    Invert = true;
2642    CC = ISD::SETLT;
2643    SwapIfRegArgs = true;
2644    break;
2645  case ISD::SETULE:
2646    Invert = true;
2647    CC = ISD::SETGT;
2648    break;
2649  case ISD::SETUGT:
2650    Invert = true;
2651    CC = ISD::SETLE;
2652    SwapIfRegArgs = true;
2653    break;
2654  case ISD::SETULT:
2655    Invert = true;
2656    CC = ISD::SETGE;
2657    break;
2658  case ISD::SETUEQ:
2659    Invert = true; // Fallthrough
2660  case ISD::SETONE:
2661    // Expand this to (OGT | OLT).
2662    NeonCmpAlt =
2663        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT));
2664    CC = ISD::SETLT;
2665    SwapIfRegArgs = true;
2666    break;
2667  case ISD::SETUO:
2668    Invert = true; // Fallthrough
2669  case ISD::SETO:
2670    // Expand this to (OGE | OLT).
2671    NeonCmpAlt =
2672        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE));
2673    CC = ISD::SETLT;
2674    SwapIfRegArgs = true;
2675    break;
2676  }
2677
2678  if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) {
2679    CC = getSetCCSwappedOperands(CC);
2680    std::swap(Op0, Op1);
2681  }
2682
2683  // Generate FP Compare Mask instr or FP Compare Mask against Zero instr
2684  SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
2685
2686  if (NeonCmpAlt.getNode())
2687    NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt);
2688
2689  if (Invert)
2690    NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
2691
2692  return NeonCmp;
2693}
2694
2695// (SETCC lhs, rhs, condcode)
2696SDValue
2697AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
2698  SDLoc dl(Op);
2699  SDValue LHS = Op.getOperand(0);
2700  SDValue RHS = Op.getOperand(1);
2701  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2702  EVT VT = Op.getValueType();
2703
2704  if (VT.isVector())
2705    return LowerVectorSETCC(Op, DAG);
2706
2707  if (LHS.getValueType() == MVT::f128) {
2708    // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
2709    // for the rest of the function (some i32 or i64 values).
2710    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
2711
2712    // If softenSetCCOperands returned a scalar, use it.
2713    if (RHS.getNode() == 0) {
2714      assert(LHS.getValueType() == Op.getValueType() &&
2715             "Unexpected setcc expansion!");
2716      return LHS;
2717    }
2718  }
2719
2720  if (LHS.getValueType().isInteger()) {
2721    SDValue A64cc;
2722
2723    // Integers are handled in a separate function because the combinations of
2724    // immediates and tests can get hairy and we may want to fiddle things.
2725    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
2726
2727    return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
2728                       CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT),
2729                       A64cc);
2730  }
2731
2732  // Note that some LLVM floating-point CondCodes can't be lowered to a single
2733  // conditional branch, hence FPCCToA64CC can set a second test, where either
2734  // passing is sufficient.
2735  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
2736  CondCode = FPCCToA64CC(CC, Alternative);
2737  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
2738  SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
2739                              DAG.getCondCode(CC));
2740  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
2741                                     CmpOp, DAG.getConstant(1, VT),
2742                                     DAG.getConstant(0, VT), A64cc);
2743
2744  if (Alternative != A64CC::Invalid) {
2745    A64cc = DAG.getConstant(Alternative, MVT::i32);
2746    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
2747                               DAG.getConstant(1, VT), A64SELECT_CC, A64cc);
2748  }
2749
2750  return A64SELECT_CC;
2751}
2752
2753SDValue
2754AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
2755  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
2756  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
2757
2758  // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
2759  // rather than just 8.
2760  return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op),
2761                       Op.getOperand(1), Op.getOperand(2),
2762                       DAG.getConstant(32, MVT::i32), 8, false, false,
2763                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
2764}
2765
2766SDValue
2767AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2768  // The layout of the va_list struct is specified in the AArch64 Procedure Call
2769  // Standard, section B.3.
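  // As a sketch (matching the offsets stored below), the structure being
  // initialised is:
  //   struct va_list {
  //     void *__stack;   // offset 0:  next stacked argument
  //     void *__gr_top;  // offset 8:  end of the GPR save area
  //     void *__vr_top;  // offset 16: end of the FPR/SIMD save area
  //     int   __gr_offs; // offset 24: offset from __gr_top to next GPR arg
  //     int   __vr_offs; // offset 28: offset from __vr_top to next FPR arg
  //   };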
2770  MachineFunction &MF = DAG.getMachineFunction();
2771  AArch64MachineFunctionInfo *FuncInfo
2772    = MF.getInfo<AArch64MachineFunctionInfo>();
2773  SDLoc DL(Op);
2774
2775  SDValue Chain = Op.getOperand(0);
2776  SDValue VAList = Op.getOperand(1);
2777  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2778  SmallVector<SDValue, 4> MemOps;
2779
2780  // void *__stack at offset 0
2781  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(),
2782                                    getPointerTy());
2783  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
2784                                MachinePointerInfo(SV), false, false, 0));
2785
2786  // void *__gr_top at offset 8
2787  int GPRSize = FuncInfo->getVariadicGPRSize();
2788  if (GPRSize > 0) {
2789    SDValue GRTop, GRTopAddr;
2790
2791    GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2792                            DAG.getConstant(8, getPointerTy()));
2793
2794    GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy());
2795    GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
2796                        DAG.getConstant(GPRSize, getPointerTy()));
2797
2798    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
2799                                  MachinePointerInfo(SV, 8),
2800                                  false, false, 0));
2801  }
2802
2803  // void *__vr_top at offset 16
2804  int FPRSize = FuncInfo->getVariadicFPRSize();
2805  if (FPRSize > 0) {
2806    SDValue VRTop, VRTopAddr;
2807    VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2808                            DAG.getConstant(16, getPointerTy()));
2809
2810    VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy());
2811    VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
2812                        DAG.getConstant(FPRSize, getPointerTy()));
2813
2814    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
2815                                  MachinePointerInfo(SV, 16),
2816                                  false, false, 0));
2817  }
2818
2819  // int __gr_offs at offset 24
2820  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2821                                   DAG.getConstant(24, getPointerTy()));
2822  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
2823                                GROffsAddr, MachinePointerInfo(SV, 24),
2824                                false, false, 0));
2825
2826  // int __vr_offs at offset 28
2827  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
2828                                   DAG.getConstant(28, getPointerTy()));
2829  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
2830                                VROffsAddr, MachinePointerInfo(SV, 28),
2831                                false, false, 0));
2832
2833  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
2834                     MemOps.size());
2835}
2836
2837SDValue
2838AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2839  switch (Op.getOpcode()) {
2840  default: llvm_unreachable("Don't know how to custom lower this!");
2841  case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128);
2842  case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128);
2843  case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128);
2844  case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128);
2845  case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true);
2846  case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false);
2847  case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true);
2848  case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
2849  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
2850  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
2851  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
2852  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
2853
2854  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
2855  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
2856  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
2857  case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG);
2858  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
2859  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
2860  case ISD::SELECT: return LowerSELECT(Op, DAG);
2861  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
2862  case ISD::SETCC: return LowerSETCC(Op, DAG);
2863  case ISD::VACOPY: return LowerVACOPY(Op, DAG);
2864  case ISD::VASTART: return LowerVASTART(Op, DAG);
2865  case ISD::BUILD_VECTOR:
2866    return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
2867  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
2868  }
2869
2870  return SDValue();
2871}
2872
2873/// Check if the specified splat value corresponds to a valid vector constant
2874/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI).  If
2875/// so, return the encoded 8-bit immediate and the OpCmode instruction field
2876/// values.
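/// For example (an illustrative case): a 32-bit splat of 0x0000ab00 is
/// representable as 0x000000ab LSL 8, so Imm becomes 0xab and OpCmode 0x2,
/// whereas a 32-bit splat of 0x12345678 has no modified-immediate encoding and
/// the function returns false.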
2877static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
2878                              unsigned SplatBitSize, SelectionDAG &DAG,
2879                              bool is128Bits, NeonModImmType type, EVT &VT,
2880                              unsigned &Imm, unsigned &OpCmode) {
2881  switch (SplatBitSize) {
2882  default:
2883    llvm_unreachable("unexpected size for isNeonModifiedImm");
2884  case 8: {
2885    if (type != Neon_Mov_Imm)
2886      return false;
2887    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
2888    // Neon movi per byte: Op=0, Cmode=1110.
2889    OpCmode = 0xe;
2890    Imm = SplatBits;
2891    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
2892    break;
2893  }
2894  case 16: {
2895    // Neon move inst per halfword
2896    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
2897    if ((SplatBits & ~0xff) == 0) {
2898      // Value = 0x00nn is 0x00nn LSL 0
2899      // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000
2900      // bic:  Op=1, Cmode=1001;  orr:  Op=0, Cmode=1001
2901      // Op=x, Cmode=100y
2902      Imm = SplatBits;
2903      OpCmode = 0x8;
2904      break;
2905    }
2906    if ((SplatBits & ~0xff00) == 0) {
2907      // Value = 0xnn00 is 0x00nn LSL 8
2908      // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010
2909      // bic:  Op=1, Cmode=1011;  orr:  Op=0, Cmode=1011
2910      // Op=x, Cmode=101x
2911      Imm = SplatBits >> 8;
2912      OpCmode = 0xa;
2913      break;
2914    }
2915    // can't handle any other
2916    return false;
2917  }
2918
2919  case 32: {
2920    // First the LSL variants (some of the interested instructions cannot use MSL).
2921
2922    // Neon move instr per word, shift zeros
2923    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
2924    if ((SplatBits & ~0xff) == 0) {
2925      // Value = 0x000000nn is 0x000000nn LSL 0
2926      // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000
2927      // bic:  Op=1, Cmode= 0001; orr:  Op=0, Cmode= 0001
2928      // Op=x, Cmode=000x
2929      Imm = SplatBits;
2930      OpCmode = 0;
2931      break;
2932    }
2933    if ((SplatBits & ~0xff00) == 0) {
2934      // Value = 0x0000nn00 is 0x000000nn LSL 8
2935      // movi: Op=0, Cmode= 0010;  mvni: Op=1, Cmode= 0010
2936      // bic:  Op=1, Cmode= 0011;  orr : Op=0, Cmode= 0011
2937      // Op=x, Cmode=001x
2938      Imm = SplatBits >> 8;
2939      OpCmode = 0x2;
2940      break;
2941    }
2942    if ((SplatBits & ~0xff0000) == 0) {
2943      // Value = 0x00nn0000 is 0x000000nn LSL 16
2944      // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100
2945      // bic:  Op=1, Cmode= 0101; orr:  Op=0, Cmode= 0101
2946      // Op=x, Cmode=010x
2947      Imm = SplatBits >> 16;
2948      OpCmode = 0x4;
2949      break;
2950    }
2951    if ((SplatBits & ~0xff000000) == 0) {
2952      // Value = 0xnn000000 is 0x000000nn LSL 24
2953      // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110
2954      // bic:  Op=1, Cmode= 0111; orr:  Op=0, Cmode= 0111
2955      // Op=x, Cmode=011x
2956      Imm = SplatBits >> 24;
2957      OpCmode = 0x6;
2958      break;
2959    }
2960
2961    // Now the MSL immediates.
2962
2963    // Neon move instr per word, shift ones
2964    if ((SplatBits & ~0xffff) == 0 &&
2965        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
2966      // Value = 0x0000nnff is 0x000000nn MSL 8
2967      // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100
2968      // Op=x, Cmode=1100
2969      Imm = SplatBits >> 8;
2970      OpCmode = 0xc;
2971      break;
2972    }
2973    if ((SplatBits & ~0xffffff) == 0 &&
2974        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
2975      // Value = 0x00nnffff is 0x000000nn MSL 16
2976      // movi: Op=0, Cmode= 1101; mvni: Op=1, Cmode= 1101
2977      // Op=x, Cmode=1101
2978      Imm = SplatBits >> 16;
2979      OpCmode = 0xd;
2980      break;
2981    }
2982    // can't handle any other
2983    return false;
2984  }
2985
2986  case 64: {
2987    if (type != Neon_Mov_Imm)
2988      return false;
2989    // Neon move instr bytemask, where each byte is either 0x00 or 0xff.
2990    // movi Op=1, Cmode=1110.
2991    OpCmode = 0x1e;
2992    uint64_t BitMask = 0xff;
2993    uint64_t Val = 0;
2994    unsigned ImmMask = 1;
2995    Imm = 0;
2996    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
2997      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
2998        Val |= BitMask;
2999        Imm |= ImmMask;
3000      } else if ((SplatBits & BitMask) != 0) {
3001        return false;
3002      }
3003      BitMask <<= 8;
3004      ImmMask <<= 1;
3005    }
3006    SplatBits = Val;
3007    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
3008    break;
3009  }
3010  }
3011
3012  return true;
3013}
3014
3015static SDValue PerformANDCombine(SDNode *N,
3016                                 TargetLowering::DAGCombinerInfo &DCI) {
3017
3018  SelectionDAG &DAG = DCI.DAG;
3019  SDLoc DL(N);
3020  EVT VT = N->getValueType(0);
3021
3022  // We're looking for an SRL/AND pair which forms a UBFX.
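  // For example (a representative i32 case): (and (srl X, #8), #0xff) extracts
  // bits [15:8] of X and can be selected as (UBFX X, #8, #15).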
3023
3024  if (VT != MVT::i32 && VT != MVT::i64)
3025    return SDValue();
3026
3027  if (!isa<ConstantSDNode>(N->getOperand(1)))
3028    return SDValue();
3029
3030  uint64_t TruncMask = N->getConstantOperandVal(1);
3031  if (!isMask_64(TruncMask))
3032    return SDValue();
3033
3034  uint64_t Width = CountPopulation_64(TruncMask);
3035  SDValue Shift = N->getOperand(0);
3036
3037  if (Shift.getOpcode() != ISD::SRL)
3038    return SDValue();
3039
3040  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
3041    return SDValue();
3042  uint64_t LSB = Shift->getConstantOperandVal(1);
3043
3044  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
3045    return SDValue();
3046
3047  return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0),
3048                     DAG.getConstant(LSB, MVT::i64),
3049                     DAG.getConstant(LSB + Width - 1, MVT::i64));
3050}
3051
3052/// For a true bitfield insert, the bits getting into that contiguous mask
3053/// should come from the low part of an existing value: they must be formed from
3054/// a compatible SHL operation (unless they're already low). This function
3055/// checks that condition and returns the intended least-significant bit. If the
3056/// operation is not a field preparation, -1 is returned.
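/// For example (a sketch): with Mask == 0x0000ff00 the returned LSB is 8; a
/// MaskedVal of (shl X, #8) is rewritten to plain X (the shifts cancel), while
/// a plain X is rewritten to (srl X, #8) so that the BFI's implicit left-shift
/// by the LSB puts the bits back under the mask.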
3057static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT,
3058                            SDValue &MaskedVal, uint64_t Mask) {
3059  if (!isShiftedMask_64(Mask))
3060    return -1;
3061
3062  // Now we need to alter MaskedVal so that it is an appropriate input for a BFI
3063  // instruction. BFI will do a left-shift by LSB before applying the mask we've
3064  // spotted, so in general we should pre-emptively "undo" that by making sure
3065  // the incoming bits have had a right-shift applied to them.
3066  //
3067  // This right shift, however, will combine with existing left/right shifts. In
3068  // the simplest case of a completely straight bitfield operation, it will be
3069  // expected to completely cancel out with an existing SHL. More complicated
3070  // cases (e.g. bitfield to bitfield copy) may still need a real shift before
3071  // the BFI.
3072
3073  uint64_t LSB = countTrailingZeros(Mask);
3074  int64_t ShiftRightRequired = LSB;
3075  if (MaskedVal.getOpcode() == ISD::SHL &&
3076      isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
3077    ShiftRightRequired -= MaskedVal.getConstantOperandVal(1);
3078    MaskedVal = MaskedVal.getOperand(0);
3079  } else if (MaskedVal.getOpcode() == ISD::SRL &&
3080             isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
3081    ShiftRightRequired += MaskedVal.getConstantOperandVal(1);
3082    MaskedVal = MaskedVal.getOperand(0);
3083  }
3084
3085  if (ShiftRightRequired > 0)
3086    MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal,
3087                            DAG.getConstant(ShiftRightRequired, MVT::i64));
3088  else if (ShiftRightRequired < 0) {
3089    // We could actually end up with a residual left shift, for example with
3090    // "struc.bitfield = val << 1".
3091    MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal,
3092                            DAG.getConstant(-ShiftRightRequired, MVT::i64));
3093  }
3094
3095  return LSB;
3096}
3097
3098/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by
3099/// a mask and an extension. Returns true if a BFI was found and provides
3100/// information on its surroundings.
3101static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask,
3102                          bool &Extended) {
3103  Extended = false;
3104  if (N.getOpcode() == ISD::ZERO_EXTEND) {
3105    Extended = true;
3106    N = N.getOperand(0);
3107  }
3108
3109  if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
3110    Mask = N->getConstantOperandVal(1);
3111    N = N.getOperand(0);
3112  } else {
3113    // Mask is the whole width.
3114    Mask = -1ULL >> (64 - N.getValueType().getSizeInBits());
3115  }
3116
3117  if (N.getOpcode() == AArch64ISD::BFI) {
3118    BFI = N;
3119    return true;
3120  }
3121
3122  return false;
3123}
3124
3125/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which
3126/// is roughly equivalent to (and (BFI ...), mask). This form is used because it
3127/// can often be further combined with a larger mask. Ultimately, we want mask
3128/// to be 2^32-1 or 2^64-1 so the AND can be skipped.
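/// For example (an illustrative i32 case):
///   (or (and X, 0xffff00ff), (and Y, 0x0000ff00))
/// can become (BFI X, (srl Y, #8), #8, #8) with no residual AND, because the
/// two masks together cover every bit of the register.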
3129static SDValue tryCombineToBFI(SDNode *N,
3130                               TargetLowering::DAGCombinerInfo &DCI,
3131                               const AArch64Subtarget *Subtarget) {
3132  SelectionDAG &DAG = DCI.DAG;
3133  SDLoc DL(N);
3134  EVT VT = N->getValueType(0);
3135
3136  assert(N->getOpcode() == ISD::OR && "Unexpected root");
3137
3138  // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or
3139  // abandon the effort.
3140  SDValue LHS = N->getOperand(0);
3141  if (LHS.getOpcode() != ISD::AND)
3142    return SDValue();
3143
3144  uint64_t LHSMask;
3145  if (isa<ConstantSDNode>(LHS.getOperand(1)))
3146    LHSMask = LHS->getConstantOperandVal(1);
3147  else
3148    return SDValue();
3149
3150  // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask
3151  // is or abandon the effort.
3152  SDValue RHS = N->getOperand(1);
3153  if (RHS.getOpcode() != ISD::AND)
3154    return SDValue();
3155
3156  uint64_t RHSMask;
3157  if (isa<ConstantSDNode>(RHS.getOperand(1)))
3158    RHSMask = RHS->getConstantOperandVal(1);
3159  else
3160    return SDValue();
3161
3162  // Can't do anything if the masks are incompatible.
3163  if (LHSMask & RHSMask)
3164    return SDValue();
3165
3166  // Now we need one of the masks to be a contiguous field. Without loss of
3167  // generality that should be the RHS one.
3168  SDValue Bitfield = LHS.getOperand(0);
3169  if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) {
3170    // We know that LHS is a candidate new value, and RHS isn't already a better
3171    // one.
3172    std::swap(LHS, RHS);
3173    std::swap(LHSMask, RHSMask);
3174  }
3175
3176  // We've done our best to put the right operands in the right places, all we
3177  // can do now is check whether a BFI exists.
3178  Bitfield = RHS.getOperand(0);
3179  int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask);
3180  if (LSB == -1)
3181    return SDValue();
3182
3183  uint32_t Width = CountPopulation_64(RHSMask);
3184  assert(Width && "Expected non-zero bitfield width");
3185
3186  SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
3187                            LHS.getOperand(0), Bitfield,
3188                            DAG.getConstant(LSB, MVT::i64),
3189                            DAG.getConstant(Width, MVT::i64));
3190
3191  // Mask is trivial
3192  if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits())))
3193    return BFI;
3194
3195  return DAG.getNode(ISD::AND, DL, VT, BFI,
3196                     DAG.getConstant(LHSMask | RHSMask, VT));
3197}
3198
3199/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its
3200/// original input. This is surprisingly common because SROA splits things up
3201/// into i8 chunks, so the originally detected MaskedBFI may actually only act
3202/// on the low (say) byte of a word. This is then orred into the rest of the
3203/// word afterwards.
3204///
3205/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)).
3206///
3207/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the
3208/// MaskedBFI. We can also deal with a certain amount of extend/truncate being
3209/// involved.
3210static SDValue tryCombineToLargerBFI(SDNode *N,
3211                                     TargetLowering::DAGCombinerInfo &DCI,
3212                                     const AArch64Subtarget *Subtarget) {
3213  SelectionDAG &DAG = DCI.DAG;
3214  SDLoc DL(N);
3215  EVT VT = N->getValueType(0);
3216
3217  // First job is to hunt for a MaskedBFI on either the left or right. Swap
3218  // operands if it's actually on the right.
3219  SDValue BFI;
3220  SDValue PossExtraMask;
3221  uint64_t ExistingMask = 0;
3222  bool Extended = false;
3223  if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended))
3224    PossExtraMask = N->getOperand(1);
3225  else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended))
3226    PossExtraMask = N->getOperand(0);
3227  else
3228    return SDValue();
3229
3230  // We can only combine a BFI with another compatible mask.
3231  if (PossExtraMask.getOpcode() != ISD::AND ||
3232      !isa<ConstantSDNode>(PossExtraMask.getOperand(1)))
3233    return SDValue();
3234
3235  uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1);
3236
3237  // Masks must be compatible.
3238  if (ExtraMask & ExistingMask)
3239    return SDValue();
3240
3241  SDValue OldBFIVal = BFI.getOperand(0);
3242  SDValue NewBFIVal = BFI.getOperand(1);
3243  if (Extended) {
3244    // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be
3245    // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments
3246    // need to be made compatible.
3247    assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32
3248           && "Invalid types for BFI");
3249    OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal);
3250    NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal);
3251  }
3252
3253  // We need the MaskedBFI to be combined with a mask of the *same* value.
3254  if (PossExtraMask.getOperand(0) != OldBFIVal)
3255    return SDValue();
3256
3257  BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
3258                    OldBFIVal, NewBFIVal,
3259                    BFI.getOperand(2), BFI.getOperand(3));
3260
3261  // If the masking is trivial, we don't need to create it.
3262  if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits())))
3263    return BFI;
3264
3265  return DAG.getNode(ISD::AND, DL, VT, BFI,
3266                     DAG.getConstant(ExtraMask | ExistingMask, VT));
3267}
3268
3269/// An EXTR instruction is made up of two shifts, ORed together. This helper
3270/// searches for and classifies those shifts.
3271static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
3272                         bool &FromHi) {
3273  if (N.getOpcode() == ISD::SHL)
3274    FromHi = false;
3275  else if (N.getOpcode() == ISD::SRL)
3276    FromHi = true;
3277  else
3278    return false;
3279
3280  if (!isa<ConstantSDNode>(N.getOperand(1)))
3281    return false;
3282
3283  ShiftAmount = N->getConstantOperandVal(1);
3284  Src = N->getOperand(0);
3285  return true;
3286}
3287
3288/// An EXTR instruction extracts a contiguous chunk of bits from two existing
3289/// registers viewed as a high/low pair. This function looks for the pattern:
3290/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
3291/// EXTR. Can't quite be done in TableGen because the two immediates aren't
3292/// independent.
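/// For example (a sketch): on i32, (or (shl X, #16), (srl Y, #16)) becomes
/// (EXTR X, Y, #16), whose top half comes from the low half of X and whose
/// bottom half comes from the high half of Y.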
3293static SDValue tryCombineToEXTR(SDNode *N,
3294                                TargetLowering::DAGCombinerInfo &DCI) {
3295  SelectionDAG &DAG = DCI.DAG;
3296  SDLoc DL(N);
3297  EVT VT = N->getValueType(0);
3298
3299  assert(N->getOpcode() == ISD::OR && "Unexpected root");
3300
3301  if (VT != MVT::i32 && VT != MVT::i64)
3302    return SDValue();
3303
3304  SDValue LHS;
3305  uint32_t ShiftLHS = 0;
3306  bool LHSFromHi = false;
3307  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
3308    return SDValue();
3309
3310  SDValue RHS;
3311  uint32_t ShiftRHS = 0;
3312  bool RHSFromHi = false;
3313  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
3314    return SDValue();
3315
3316  // If they're both trying to come from the high part of the register, they're
3317  // not really an EXTR.
3318  if (LHSFromHi == RHSFromHi)
3319    return SDValue();
3320
3321  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
3322    return SDValue();
3323
3324  if (LHSFromHi) {
3325    std::swap(LHS, RHS);
3326    std::swap(ShiftLHS, ShiftRHS);
3327  }
3328
3329  return DAG.getNode(AArch64ISD::EXTR, DL, VT,
3330                     LHS, RHS,
3331                     DAG.getConstant(ShiftRHS, MVT::i64));
3332}
3333
3334/// Target-specific dag combine xforms for ISD::OR
3335static SDValue PerformORCombine(SDNode *N,
3336                                TargetLowering::DAGCombinerInfo &DCI,
3337                                const AArch64Subtarget *Subtarget) {
3338
3339  SelectionDAG &DAG = DCI.DAG;
3340  SDLoc DL(N);
3341  EVT VT = N->getValueType(0);
3342
3343  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3344    return SDValue();
3345
3346  // Attempt to recognise bitfield-insert operations.
3347  SDValue Res = tryCombineToBFI(N, DCI, Subtarget);
3348  if (Res.getNode())
3349    return Res;
3350
3351  // Attempt to combine an existing MaskedBFI operation into one with a larger
3352  // mask.
3353  Res = tryCombineToLargerBFI(N, DCI, Subtarget);
3354  if (Res.getNode())
3355    return Res;
3356
3357  Res = tryCombineToEXTR(N, DCI);
3358  if (Res.getNode())
3359    return Res;
3360
3361  if (!Subtarget->hasNEON())
3362    return SDValue();
3363
3364  // Attempt to use vector immediate-form BSL
3365  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
3366
3367  SDValue N0 = N->getOperand(0);
3368  if (N0.getOpcode() != ISD::AND)
3369    return SDValue();
3370
3371  SDValue N1 = N->getOperand(1);
3372  if (N1.getOpcode() != ISD::AND)
3373    return SDValue();
3374
3375  if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
3376    APInt SplatUndef;
3377    unsigned SplatBitSize;
3378    bool HasAnyUndefs;
3379    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
3380    APInt SplatBits0;
3381    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
3382                                      HasAnyUndefs) &&
3383        !HasAnyUndefs) {
3384      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
3385      APInt SplatBits1;
3386      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
3387                                        HasAnyUndefs) &&
3388          !HasAnyUndefs && SplatBits0 == ~SplatBits1) {
3389        // Canonicalize the vector type to make instruction selection simpler.
3390        EVT CanonicalVT = VT.is128BitVector() ? MVT::v16i8 : MVT::v8i8;
3391        SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT,
3392                                     N0->getOperand(1), N0->getOperand(0),
3393                                     N1->getOperand(0));
3394        return DAG.getNode(ISD::BITCAST, DL, VT, Result);
3395      }
3396    }
3397  }
3398
3399  return SDValue();
3400}
3401
3402/// Target-specific dag combine xforms for ISD::SRA
3403static SDValue PerformSRACombine(SDNode *N,
3404                                 TargetLowering::DAGCombinerInfo &DCI) {
3405
3406  SelectionDAG &DAG = DCI.DAG;
3407  SDLoc DL(N);
3408  EVT VT = N->getValueType(0);
3409
3410  // We're looking for an SRA/SHL pair which form an SBFX.
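  // For example (a representative i32 case): (sra (shl X, #24), #24)
  // sign-extends the low byte of X and can be selected as (SBFX X, #0, #7).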
3411
3412  if (VT != MVT::i32 && VT != MVT::i64)
3413    return SDValue();
3414
3415  if (!isa<ConstantSDNode>(N->getOperand(1)))
3416    return SDValue();
3417
3418  uint64_t ExtraSignBits = N->getConstantOperandVal(1);
3419  SDValue Shift = N->getOperand(0);
3420
3421  if (Shift.getOpcode() != ISD::SHL)
3422    return SDValue();
3423
3424  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
3425    return SDValue();
3426
3427  uint64_t BitsOnLeft = Shift->getConstantOperandVal(1);
3428  uint64_t Width = VT.getSizeInBits() - ExtraSignBits;
3429  uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft;
3430
3431  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
3432    return SDValue();
3433
3434  return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0),
3435                     DAG.getConstant(LSB, MVT::i64),
3436                     DAG.getConstant(LSB + Width - 1, MVT::i64));
3437}
3438
3439/// Check if this is a valid build_vector for the immediate operand of
3440/// a vector shift operation, where all the elements of the build_vector
3441/// must have the same constant integer value.
3442static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
3443  // Ignore bit_converts.
3444  while (Op.getOpcode() == ISD::BITCAST)
3445    Op = Op.getOperand(0);
3446  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
3447  APInt SplatBits, SplatUndef;
3448  unsigned SplatBitSize;
3449  bool HasAnyUndefs;
3450  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
3451                                      HasAnyUndefs, ElementBits) ||
3452      SplatBitSize > ElementBits)
3453    return false;
3454  Cnt = SplatBits.getSExtValue();
3455  return true;
3456}
3457
3458/// Check if this is a valid build_vector for the immediate operand of
3459/// a vector shift left operation.  That value must be in the range:
3460/// 0 <= Value < ElementBits
3461static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) {
3462  assert(VT.isVector() && "vector shift count is not a vector type");
3463  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
3464  if (!getVShiftImm(Op, ElementBits, Cnt))
3465    return false;
3466  return (Cnt >= 0 && Cnt < ElementBits);
3467}
3468
3469/// Check if this is a valid build_vector for the immediate operand of a
3470/// vector shift right operation. The value must be in the range:
3471///   1 <= Value <= ElementBits
3472static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
3473  assert(VT.isVector() && "vector shift count is not a vector type");
3474  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
3475  if (!getVShiftImm(Op, ElementBits, Cnt))
3476    return false;
3477  return (Cnt >= 1 && Cnt <= ElementBits);
3478}
3479
3480/// Checks for immediate versions of vector shifts and lowers them.
3481static SDValue PerformShiftCombine(SDNode *N,
3482                                   TargetLowering::DAGCombinerInfo &DCI,
3483                                   const AArch64Subtarget *ST) {
3484  SelectionDAG &DAG = DCI.DAG;
3485  EVT VT = N->getValueType(0);
3486  if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
3487    return PerformSRACombine(N, DCI);
3488
3489  // Nothing to be done for scalar shifts.
3490  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3491  if (!VT.isVector() || !TLI.isTypeLegal(VT))
3492    return SDValue();
3493
3494  assert(ST->hasNEON() && "unexpected vector shift");
3495  int64_t Cnt;
3496
3497  switch (N->getOpcode()) {
3498  default:
3499    llvm_unreachable("unexpected shift opcode");
3500
3501  case ISD::SHL:
3502    if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
3503      SDValue RHS =
3504          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
3505                      DAG.getConstant(Cnt, MVT::i32));
3506      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
3507    }
3508    break;
3509
3510  case ISD::SRA:
3511  case ISD::SRL:
3512    if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
3513      SDValue RHS =
3514          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
3515                      DAG.getConstant(Cnt, MVT::i32));
3516      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
3517    }
3518    break;
3519  }
3520
3521  return SDValue();
3522}
3523
3524/// Target-specific DAG combining for Neon intrinsics.
3525static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
3526  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3527
3528  switch (IntNo) {
3529  default:
3530    // Don't do anything for most intrinsics.
3531    break;
3532
3533  case Intrinsic::arm_neon_vqshifts:
3534  case Intrinsic::arm_neon_vqshiftu:
3535    EVT VT = N->getOperand(1).getValueType();
3536    int64_t Cnt;
3537    if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
3538      break;
3539    unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
3540                             ? AArch64ISD::NEON_QSHLs
3541                             : AArch64ISD::NEON_QSHLu;
3542    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
3543                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
3544  }
3545
3546  return SDValue();
3547}
3548
3549/// Target-specific DAG combine function for NEON load/store intrinsics
3550/// to merge base address updates.
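/// For example (an illustrative sketch): a vld1 whose address is separately
/// incremented by an ADD of the vector's size in bytes can be rewritten as a
/// post-indexed NEON_LD1_UPD that also produces the updated address, replacing
/// both the load and the ADD.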
3551static SDValue CombineBaseUpdate(SDNode *N,
3552                                 TargetLowering::DAGCombinerInfo &DCI) {
3553  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
3554    return SDValue();
3555
3556  SelectionDAG &DAG = DCI.DAG;
3557  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
3558                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
3559  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
3560  SDValue Addr = N->getOperand(AddrOpIdx);
3561
3562  // Search for a use of the address operand that is an increment.
3563  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
3564       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
3565    SDNode *User = *UI;
3566    if (User->getOpcode() != ISD::ADD ||
3567        UI.getUse().getResNo() != Addr.getResNo())
3568      continue;
3569
3570    // Check that the add is independent of the load/store.  Otherwise, folding
3571    // it would create a cycle.
3572    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
3573      continue;
3574
3575    // Find the new opcode for the updating load/store.
3576    bool isLoad = true;
3577    bool isLaneOp = false;
3578    unsigned NewOpc = 0;
3579    unsigned NumVecs = 0;
3580    if (isIntrinsic) {
3581      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
3582      switch (IntNo) {
3583      default: llvm_unreachable("unexpected intrinsic for Neon base update");
3584      case Intrinsic::arm_neon_vld1:       NewOpc = AArch64ISD::NEON_LD1_UPD;
3585        NumVecs = 1; break;
3586      case Intrinsic::arm_neon_vld2:       NewOpc = AArch64ISD::NEON_LD2_UPD;
3587        NumVecs = 2; break;
3588      case Intrinsic::arm_neon_vld3:       NewOpc = AArch64ISD::NEON_LD3_UPD;
3589        NumVecs = 3; break;
3590      case Intrinsic::arm_neon_vld4:       NewOpc = AArch64ISD::NEON_LD4_UPD;
3591        NumVecs = 4; break;
3592      case Intrinsic::arm_neon_vst1:       NewOpc = AArch64ISD::NEON_ST1_UPD;
3593        NumVecs = 1; isLoad = false; break;
3594      case Intrinsic::arm_neon_vst2:       NewOpc = AArch64ISD::NEON_ST2_UPD;
3595        NumVecs = 2; isLoad = false; break;
3596      case Intrinsic::arm_neon_vst3:       NewOpc = AArch64ISD::NEON_ST3_UPD;
3597        NumVecs = 3; isLoad = false; break;
3598      case Intrinsic::arm_neon_vst4:       NewOpc = AArch64ISD::NEON_ST4_UPD;
3599        NumVecs = 4; isLoad = false; break;
3600      case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
3601        NumVecs = 2; break;
3602      case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
3603        NumVecs = 3; break;
3604      case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
3605        NumVecs = 4; break;
3606      case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
3607        NumVecs = 2; isLoad = false; break;
3608      case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
3609        NumVecs = 3; isLoad = false; break;
3610      case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
3611        NumVecs = 4; isLoad = false; break;
3612      case Intrinsic::arm_neon_vld2lane:   NewOpc = AArch64ISD::NEON_LD2LN_UPD;
3613        NumVecs = 2; isLaneOp = true; break;
3614      case Intrinsic::arm_neon_vld3lane:   NewOpc = AArch64ISD::NEON_LD3LN_UPD;
3615        NumVecs = 3; isLaneOp = true; break;
3616      case Intrinsic::arm_neon_vld4lane:   NewOpc = AArch64ISD::NEON_LD4LN_UPD;
3617        NumVecs = 4; isLaneOp = true; break;
3618      case Intrinsic::arm_neon_vst2lane:   NewOpc = AArch64ISD::NEON_ST2LN_UPD;
3619        NumVecs = 2; isLoad = false; isLaneOp = true; break;
3620      case Intrinsic::arm_neon_vst3lane:   NewOpc = AArch64ISD::NEON_ST3LN_UPD;
3621        NumVecs = 3; isLoad = false; isLaneOp = true; break;
3622      case Intrinsic::arm_neon_vst4lane:   NewOpc = AArch64ISD::NEON_ST4LN_UPD;
3623        NumVecs = 4; isLoad = false; isLaneOp = true; break;
3624      }
3625    } else {
3626      isLaneOp = true;
3627      switch (N->getOpcode()) {
3628      default: llvm_unreachable("unexpected opcode for Neon base update");
3629      case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
3630        NumVecs = 2; break;
3631      case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
3632        NumVecs = 3; break;
3633      case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
3634        NumVecs = 4; break;
3635      }
3636    }
3637
3638    // Find the size of memory referenced by the load/store.
3639    EVT VecTy;
3640    if (isLoad)
3641      VecTy = N->getValueType(0);
3642    else
3643      VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
3644    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
3645    if (isLaneOp)
3646      NumBytes /= VecTy.getVectorNumElements();
3647
3648    // If the increment is a constant, it must match the memory ref size.
3649    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
3650    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
3651      uint32_t IncVal = CInc->getZExtValue();
3652      if (IncVal != NumBytes)
3653        continue;
3654      Inc = DAG.getTargetConstant(IncVal, MVT::i32);
3655    }
3656
3657    // Create the new updating load/store node.
3658    EVT Tys[6];
3659    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
3660    unsigned n;
3661    for (n = 0; n < NumResultVecs; ++n)
3662      Tys[n] = VecTy;
3663    Tys[n++] = MVT::i64;
3664    Tys[n] = MVT::Other;
3665    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2);
3666    SmallVector<SDValue, 8> Ops;
3667    Ops.push_back(N->getOperand(0)); // incoming chain
3668    Ops.push_back(N->getOperand(AddrOpIdx));
3669    Ops.push_back(Inc);
3670    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
3671      Ops.push_back(N->getOperand(i));
3672    }
3673    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
3674    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
3675                                           Ops.data(), Ops.size(),
3676                                           MemInt->getMemoryVT(),
3677                                           MemInt->getMemOperand());
3678
3679    // Update the uses.
3680    std::vector<SDValue> NewResults;
3681    for (unsigned i = 0; i < NumResultVecs; ++i) {
3682      NewResults.push_back(SDValue(UpdN.getNode(), i));
3683    }
3684    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
3685    DCI.CombineTo(N, NewResults);
3686    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
3687
3688    break;
3689  }
3690  return SDValue();
3691}
3692
3693/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
3694/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
3695/// If so, combine them to a vldN-dup operation and return true.
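/// For example (a sketch): if every vector result of an arm_neon_vld2lane is
/// only ever fed into a NEON_VDUPLANE of the same lane that was loaded, the
/// group can be replaced by a single NEON_LD2DUP (load-and-replicate) from the
/// same address.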
3696static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
3697  SelectionDAG &DAG = DCI.DAG;
3698  EVT VT = N->getValueType(0);
3699
3700  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
3701  SDNode *VLD = N->getOperand(0).getNode();
3702  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
3703    return SDValue();
3704  unsigned NumVecs = 0;
3705  unsigned NewOpc = 0;
3706  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
3707  if (IntNo == Intrinsic::arm_neon_vld2lane) {
3708    NumVecs = 2;
3709    NewOpc = AArch64ISD::NEON_LD2DUP;
3710  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
3711    NumVecs = 3;
3712    NewOpc = AArch64ISD::NEON_LD3DUP;
3713  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
3714    NumVecs = 4;
3715    NewOpc = AArch64ISD::NEON_LD4DUP;
3716  } else {
3717    return SDValue();
3718  }
3719
3720  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
3721  // numbers match the load.
3722  unsigned VLDLaneNo =
3723      cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
3724  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
3725       UI != UE; ++UI) {
3726    // Ignore uses of the chain result.
3727    if (UI.getUse().getResNo() == NumVecs)
3728      continue;
3729    SDNode *User = *UI;
3730    if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
3731        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
3732      return SDValue();
3733  }
3734
3735  // Create the vldN-dup node.
3736  EVT Tys[5];
3737  unsigned n;
3738  for (n = 0; n < NumVecs; ++n)
3739    Tys[n] = VT;
3740  Tys[n] = MVT::Other;
3741  SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
3742  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
3743  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
3744  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
3745                                           VLDMemInt->getMemoryVT(),
3746                                           VLDMemInt->getMemOperand());
3747
3748  // Update the uses.
3749  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
3750       UI != UE; ++UI) {
3751    unsigned ResNo = UI.getUse().getResNo();
3752    // Ignore uses of the chain result.
3753    if (ResNo == NumVecs)
3754      continue;
3755    SDNode *User = *UI;
3756    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
3757  }
3758
3759  // Now the vldN-lane intrinsic is dead except for its chain result.
3760  // Update uses of the chain.
3761  std::vector<SDValue> VLDDupResults;
3762  for (unsigned n = 0; n < NumVecs; ++n)
3763    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
3764  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
3765  DCI.CombineTo(VLD, VLDDupResults);
3766
3767  return SDValue(N, 0);
3768}
3769
3770SDValue
3771AArch64TargetLowering::PerformDAGCombine(SDNode *N,
3772                                         DAGCombinerInfo &DCI) const {
3773  switch (N->getOpcode()) {
3774  default: break;
3775  case ISD::AND: return PerformANDCombine(N, DCI);
3776  case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
3777  case ISD::SHL:
3778  case ISD::SRA:
3779  case ISD::SRL:
3780    return PerformShiftCombine(N, DCI, getSubtarget());
3781  case ISD::INTRINSIC_WO_CHAIN:
3782    return PerformIntrinsicCombine(N, DCI.DAG);
3783  case AArch64ISD::NEON_VDUPLANE:
3784    return CombineVLDDUP(N, DCI);
3785  case AArch64ISD::NEON_LD2DUP:
3786  case AArch64ISD::NEON_LD3DUP:
3787  case AArch64ISD::NEON_LD4DUP:
3788    return CombineBaseUpdate(N, DCI);
3789  case ISD::INTRINSIC_VOID:
3790  case ISD::INTRINSIC_W_CHAIN:
3791    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
3792    case Intrinsic::arm_neon_vld1:
3793    case Intrinsic::arm_neon_vld2:
3794    case Intrinsic::arm_neon_vld3:
3795    case Intrinsic::arm_neon_vld4:
3796    case Intrinsic::arm_neon_vst1:
3797    case Intrinsic::arm_neon_vst2:
3798    case Intrinsic::arm_neon_vst3:
3799    case Intrinsic::arm_neon_vst4:
3800    case Intrinsic::arm_neon_vld2lane:
3801    case Intrinsic::arm_neon_vld3lane:
3802    case Intrinsic::arm_neon_vld4lane:
3803    case Intrinsic::aarch64_neon_vld1x2:
3804    case Intrinsic::aarch64_neon_vld1x3:
3805    case Intrinsic::aarch64_neon_vld1x4:
3806    case Intrinsic::aarch64_neon_vst1x2:
3807    case Intrinsic::aarch64_neon_vst1x3:
3808    case Intrinsic::aarch64_neon_vst1x4:
3809    case Intrinsic::arm_neon_vst2lane:
3810    case Intrinsic::arm_neon_vst3lane:
3811    case Intrinsic::arm_neon_vst4lane:
3812      return CombineBaseUpdate(N, DCI);
3813    default:
3814      break;
3815    }
3816  }
3817  return SDValue();
3818}
3819
3820bool
3821AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3822  VT = VT.getScalarType();
3823
3824  if (!VT.isSimple())
3825    return false;
3826
3827  switch (VT.getSimpleVT().SimpleTy) {
3828  case MVT::f16:
3829  case MVT::f32:
3830  case MVT::f64:
3831    return true;
3832  case MVT::f128:
3833    return false;
3834  default:
3835    break;
3836  }
3837
3838  return false;
3839}
3840
3841// Check whether a Build Vector could be represented as a Vector Shuffle. If so,
3842// try to lower it via LowerVECTOR_SHUFFLE.
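// For example (an illustrative case): a v4i32 build_vector of
// (extract_vector_elt V0, 0), (extract_vector_elt V0, 1),
// (extract_vector_elt V1, 0) and (extract_vector_elt V1, 1), with V0 and V1
// themselves v4i32, is equivalent to a shuffle of V0 and V1 with mask
// <0, 1, 4, 5>.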
3843bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
3844                                                 SDValue &Res) const {
3845  SDLoc DL(Op);
3846  EVT VT = Op.getValueType();
3847  unsigned NumElts = VT.getVectorNumElements();
3848  unsigned V0NumElts = 0;
3849  int Mask[16];
3850  SDValue V0, V1;
3851
3852  // Check that all elements are extracted from at most two vectors.
3853  for (unsigned i = 0; i < NumElts; ++i) {
3854    SDValue Elt = Op.getOperand(i);
3855    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3856      return false;
3857
3858    if (V0.getNode() == 0) {
3859      V0 = Elt.getOperand(0);
3860      V0NumElts = V0.getValueType().getVectorNumElements();
3861    }
3862    if (Elt.getOperand(0) == V0) {
3863      Mask[i] = (cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue());
3864      continue;
3865    } else if (V1.getNode() == 0) {
3866      V1 = Elt.getOperand(0);
3867    }
3868    if (Elt.getOperand(0) == V1) {
3869      unsigned Lane = cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue();
3870      Mask[i] = (Lane + V0NumElts);
3871      continue;
3872    } else {
3873      return false;
3874    }
3875  }
3876
3877  if (!V1.getNode() && V0NumElts == NumElts * 2) {
3878    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
3879                     DAG.getConstant(NumElts, MVT::i64));
3880    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
3881                     DAG.getConstant(0, MVT::i64));
3882    V0NumElts = V0.getValueType().getVectorNumElements();
3883  }
3884
3885  if (V1.getNode() && NumElts == V0NumElts &&
3886      V0NumElts == V1.getValueType().getVectorNumElements()) {
3887    SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
3888    Res = LowerVECTOR_SHUFFLE(Shuffle, DAG);
3889    return true;
3890  } else
3891    return false;
3892}
3893
3894// If this is a case we can't handle, return null and let the default
3895// expansion code take care of it.
3896SDValue
3897AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
3898                                         const AArch64Subtarget *ST) const {
3899
3900  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
3901  SDLoc DL(Op);
3902  EVT VT = Op.getValueType();
3903
3904  APInt SplatBits, SplatUndef;
3905  unsigned SplatBitSize;
3906  bool HasAnyUndefs;
3907
3908  bool UseNeonMov = VT.getSizeInBits() >= 64;
3909
3910  // Note we favor lowering MOVI over MVNI.
3911  // This has implications on the definition of patterns in TableGen to select
3912  // BIC immediate instructions but not ORR immediate instructions.
3913  // If this lowering order is changed, TableGen patterns for BIC immediate and
3914  // ORR immediate instructions have to be updated.
3915  if (UseNeonMov &&
3916      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
3917    if (SplatBitSize <= 64) {
3918      // First attempt to use vector immediate-form MOVI
3919      EVT NeonMovVT;
3920      unsigned Imm = 0;
3921      unsigned OpCmode = 0;
3922
3923      if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
3924                            SplatBitSize, DAG, VT.is128BitVector(),
3925                            Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) {
3926        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
3927        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
3928
3929        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
3930          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT,
3931                                        ImmVal, OpCmodeVal);
3932          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
3933        }
3934      }
3935
3936      // Then attempt to use vector immediate-form MVNI
3937      uint64_t NegatedImm = (~SplatBits).getZExtValue();
3938      if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
3939                            DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT,
3940                            Imm, OpCmode)) {
3941        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
3942        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
3943        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
3944          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT,
3945                                        ImmVal, OpCmodeVal);
3946          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
3947        }
3948      }
3949
3950      // Attempt to use vector immediate-form FMOV
3951      if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) ||
3952          (VT == MVT::v2f64 && SplatBitSize == 64)) {
3953        APFloat RealVal(
3954            SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble,
3955            SplatBits);
3956        uint32_t ImmVal;
3957        if (A64Imms::isFPImm(RealVal, ImmVal)) {
3958          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
3959          return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val);
3960        }
3961      }
3962    }
3963  }
3964
3965  unsigned NumElts = VT.getVectorNumElements();
3966  bool isOnlyLowElement = true;
3967  bool usesOnlyOneValue = true;
3968  bool hasDominantValue = false;
3969  bool isConstant = true;
3970
3971  // Map of the number of times a particular SDValue appears in the
3972  // element list.
3973  DenseMap<SDValue, unsigned> ValueCounts;
3974  SDValue Value;
3975  for (unsigned i = 0; i < NumElts; ++i) {
3976    SDValue V = Op.getOperand(i);
3977    if (V.getOpcode() == ISD::UNDEF)
3978      continue;
3979    if (i > 0)
3980      isOnlyLowElement = false;
3981    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
3982      isConstant = false;
3983
3984    ValueCounts.insert(std::make_pair(V, 0));
3985    unsigned &Count = ValueCounts[V];
3986
3987    // Is this value dominant? (takes up more than half of the lanes)
3988    if (++Count > (NumElts / 2)) {
3989      hasDominantValue = true;
3990      Value = V;
3991    }
3992  }
3993  if (ValueCounts.size() != 1)
3994    usesOnlyOneValue = false;
3995  if (!Value.getNode() && ValueCounts.size() > 0)
3996    Value = ValueCounts.begin()->first;
3997
3998  if (ValueCounts.size() == 0)
3999    return DAG.getUNDEF(VT);
4000
4001  // Loads are better lowered with insert_vector_elt, so keep going if we hit
4002  // that case.
4003  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
4004    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
4005
4006  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4007  if (hasDominantValue && EltSize <= 64) {
4008    // Use VDUP for non-constant splats.
4009    if (!isConstant) {
4010      SDValue N;
4011
4012      // If we are DUPing a value that comes directly from a vector, we could
4013      // just use DUPLANE. We can only do this if the lane being extracted
4014      // is at a constant index, as the DUP from lane instructions only have
4015      // constant-index forms.
4016      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
4017          isa<ConstantSDNode>(Value->getOperand(1))) {
4018          N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
4019                        Value->getOperand(0), Value->getOperand(1));
4020      } else
4021        N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
4022
4023      if (!usesOnlyOneValue) {
4024        // The dominant value was splatted as 'N', but we now have to insert
4025        // all differing elements.
4026        for (unsigned I = 0; I < NumElts; ++I) {
4027          if (Op.getOperand(I) == Value)
4028            continue;
4029          SmallVector<SDValue, 3> Ops;
4030          Ops.push_back(N);
4031          Ops.push_back(Op.getOperand(I));
4032          Ops.push_back(DAG.getConstant(I, MVT::i64));
4033          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
4034        }
4035      }
4036      return N;
4037    }
4038    if (usesOnlyOneValue && isConstant) {
4039      return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
4040    }
4041  }
4042  // If all elements are constants and the case above didn't get hit, fall back
4043  // to the default expansion, which will generate a load from the constant
4044  // pool.
4045  if (isConstant)
4046    return SDValue();
4047
4048  // Try to lower this as a shuffle of two vectors (see isKnownShuffleVector).
4049  SDValue Shuf;
4050  if (isKnownShuffleVector(Op, DAG, Shuf))
4051    return Shuf;
4052
4053  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
4054  // know the default expansion would otherwise fall back on something even
4055  // worse. For a vector with one or two non-undef values, that's
4056  // scalar_to_vector for the elements followed by a shuffle (provided the
4057  // shuffle is valid for the target) and materialization element by element
4058  // on the stack followed by a load for everything else.
4059  if (!isConstant && !usesOnlyOneValue) {
4060    SDValue Vec = DAG.getUNDEF(VT);
4061    for (unsigned i = 0 ; i < NumElts; ++i) {
4062      SDValue V = Op.getOperand(i);
4063      if (V.getOpcode() == ISD::UNDEF)
4064        continue;
4065      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
4066      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
4067    }
4068    return Vec;
4069  }
4070  return SDValue();
4071}
4072
4073/// isREVMask - Check if a vector shuffle corresponds to a REV
4074/// instruction with the specified blocksize.  (The order of the elements
4075/// within each block of the vector is reversed.)
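/// For example (a sketch): for v8i16 with BlockSize == 32, the mask
/// <1, 0, 3, 2, 5, 4, 7, 6> matches, since each pair of i16 elements inside a
/// 32-bit block is reversed.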
4076static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
4077  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
4078         "Only possible block sizes for REV are: 16, 32, 64");
4079
4080  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4081  if (EltSz == 64)
4082    return false;
4083
4084  unsigned NumElts = VT.getVectorNumElements();
4085  unsigned BlockElts = M[0] + 1;
4086  // If the first shuffle index is UNDEF, be optimistic.
4087  if (M[0] < 0)
4088    BlockElts = BlockSize / EltSz;
4089
4090  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
4091    return false;
4092
4093  for (unsigned i = 0; i < NumElts; ++i) {
4094    if (M[i] < 0)
4095      continue; // ignore UNDEF indices
4096    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
4097      return false;
4098  }
4099
4100  return true;
4101}
4102
// isPermuteMask - Check whether the vector shuffle mask matches a UZP, ZIP or
// TRN instruction, returning the corresponding AArch64ISD opcode, or 0 if
// there is no match.
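// For example, with NumElts == 8 the masks recognised are:
//   UZP1: <0, 2, 4, 6, 8, 10, 12, 14>    UZP2: <1, 3, 5, 7, 9, 11, 13, 15>
//   ZIP1: <0, 8, 1, 9, 2, 10, 3, 11>     ZIP2: <4, 12, 5, 13, 6, 14, 7, 15>
//   TRN1: <0, 8, 2, 10, 4, 12, 6, 14>    TRN2: <1, 9, 3, 11, 5, 13, 7, 15>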
4105static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
4106  unsigned NumElts = VT.getVectorNumElements();
4107  if (NumElts < 4)
4108    return 0;
4109
4110  bool ismatch = true;
4111
4112  // Check UZP1
4113  for (unsigned i = 0; i < NumElts; ++i) {
4114    if ((unsigned)M[i] != i * 2) {
4115      ismatch = false;
4116      break;
4117    }
4118  }
4119  if (ismatch)
4120    return AArch64ISD::NEON_UZP1;
4121
4122  // Check UZP2
4123  ismatch = true;
4124  for (unsigned i = 0; i < NumElts; ++i) {
4125    if ((unsigned)M[i] != i * 2 + 1) {
4126      ismatch = false;
4127      break;
4128    }
4129  }
4130  if (ismatch)
4131    return AArch64ISD::NEON_UZP2;
4132
4133  // Check ZIP1
4134  ismatch = true;
4135  for (unsigned i = 0; i < NumElts; ++i) {
4136    if ((unsigned)M[i] != i / 2 + NumElts * (i % 2)) {
4137      ismatch = false;
4138      break;
4139    }
4140  }
4141  if (ismatch)
4142    return AArch64ISD::NEON_ZIP1;
4143
4144  // Check ZIP2
4145  ismatch = true;
4146  for (unsigned i = 0; i < NumElts; ++i) {
4147    if ((unsigned)M[i] != (NumElts + i) / 2 + NumElts * (i % 2)) {
4148      ismatch = false;
4149      break;
4150    }
4151  }
4152  if (ismatch)
4153    return AArch64ISD::NEON_ZIP2;
4154
4155  // Check TRN1
4156  ismatch = true;
4157  for (unsigned i = 0; i < NumElts; ++i) {
4158    if ((unsigned)M[i] != i + (NumElts - 1) * (i % 2)) {
4159      ismatch = false;
4160      break;
4161    }
4162  }
4163  if (ismatch)
4164    return AArch64ISD::NEON_TRN1;
4165
4166  // Check TRN2
4167  ismatch = true;
4168  for (unsigned i = 0; i < NumElts; ++i) {
4169    if ((unsigned)M[i] != 1 + i + (NumElts - 1) * (i % 2)) {
4170      ismatch = false;
4171      break;
4172    }
4173  }
4174  if (ismatch)
4175    return AArch64ISD::NEON_TRN2;
4176
4177  return 0;
4178}
4179
4180SDValue
4181AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
4182                                           SelectionDAG &DAG) const {
4183  SDValue V1 = Op.getOperand(0);
4184  SDValue V2 = Op.getOperand(1);
4185  SDLoc dl(Op);
4186  EVT VT = Op.getValueType();
4187  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
4188
4189  // Convert shuffles that are directly supported on NEON to target-specific
4190  // DAG nodes, instead of keeping them as shuffles and matching them again
4191  // during code selection.  This is more efficient and avoids the possibility
4192  // of inconsistencies between legalization and selection.
4193  ArrayRef<int> ShuffleMask = SVN->getMask();
4194
4195  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4196  if (EltSize > 64)
4197    return SDValue();
4198
4199  if (isREVMask(ShuffleMask, VT, 64))
4200    return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1);
4201  if (isREVMask(ShuffleMask, VT, 32))
4202    return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1);
4203  if (isREVMask(ShuffleMask, VT, 16))
4204    return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
4205
4206  unsigned ISDNo = isPermuteMask(ShuffleMask, VT);
4207  if (ISDNo)
4208    return DAG.getNode(ISDNo, dl, VT, V1, V2);
4209
  // If the elements of the shuffle mask are all the same constant (i.e. this
  // is a splat), we can transform it into either NEON_VDUP or NEON_VDUPLANE.
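  // For example, a splat of lane 2 of V1 becomes NEON_VDUPLANE(V1, 2); if V1
  // is itself EXTRACT_SUBVECTOR(W, 4), the lane is folded through to give
  // NEON_VDUPLANE(W, 6).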
4212  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
4213    int Lane = SVN->getSplatIndex();
    // If the splat index is undef, treat it as a splat of lane 0 and generate
    // it via a plain VDUP if possible.
4215    if (Lane == -1) Lane = 0;
4216
4217    // Test if V1 is a SCALAR_TO_VECTOR.
4218    if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4219      return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
4220    }
4221    // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
4222    if (V1.getOpcode() == ISD::BUILD_VECTOR) {
4223      bool IsScalarToVector = true;
4224      for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
4225        if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
4226            i != (unsigned)Lane) {
4227          IsScalarToVector = false;
4228          break;
4229        }
4230      if (IsScalarToVector)
4231        return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
4232                           V1.getOperand(Lane));
4233    }
4234
    // Test if V1 is an EXTRACT_SUBVECTOR.
4236    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4237      int ExtLane = cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
4238      return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0),
4239                         DAG.getConstant(Lane + ExtLane, MVT::i64));
4240    }
4241    // Test if V1 is a CONCAT_VECTORS.
4242    if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
4243        V1.getOperand(1).getOpcode() == ISD::UNDEF) {
4244      SDValue Op0 = V1.getOperand(0);
4245      assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() &&
4246             "Invalid vector lane access");
4247      return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0,
4248                         DAG.getConstant(Lane, MVT::i64));
4249    }
4250
4251    return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
4252                       DAG.getConstant(Lane, MVT::i64));
4253  }
4254
4255  int Length = ShuffleMask.size();
4256  int V1EltNum = V1.getValueType().getVectorNumElements();
4257
  // If V1 has the same number of elements as the shuffle mask and the mask
  // values are consecutive, we can transform the shuffle into NEON_VEXTRACT.
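  // For example, for two v8i8 operands the mask <3, 4, 5, 6, 7, 8, 9, 10> is
  // sequential starting at 3, which becomes NEON_VEXTRACT(V1, V2) with byte
  // index (EltSize / 8) * ShuffleMask[0] = 3.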
4261  if (V1EltNum == Length) {
4262    // Check if the shuffle mask is sequential.
4263    bool IsSequential = true;
4264    int CurMask = ShuffleMask[0];
4265    for (int I = 0; I < Length; ++I) {
4266      if (ShuffleMask[I] != CurMask) {
4267        IsSequential = false;
4268        break;
4269      }
4270      CurMask++;
4271    }
4272    if (IsSequential) {
4273      assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
4274      unsigned VecSize = EltSize * V1EltNum;
4275      unsigned Index = (EltSize/8) * ShuffleMask[0];
4276      if (VecSize == 64 || VecSize == 128)
4277        return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
4278                           DAG.getConstant(Index, MVT::i64));
4279    }
4280  }
4281
  // For a shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate the
  // result by inserting elements from V2 into V1.
  // For a mask like "0, 1, 10, 11, 12, 13, 14, 15", V2 is the better vector
  // to insert into, since it needs fewer insertions than V1. So count the
  // elements that would need to be inserted for each of V1 and V2, and pick
  // whichever needs fewer as the insertion target.
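  // For example, with the mask "0, 1, 2, 3, 4, 5, 13, 7" only lane 6 differs
  // from V1, so the code below extracts lane 5 of V2 (13 minus V1EltNum) and
  // inserts it into lane 6 of V1.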
4288
  // Collect the elements that need to be inserted and their indices.
4290  SmallVector<int, 8> NV1Elt;
4291  SmallVector<int, 8> N1Index;
4292  SmallVector<int, 8> NV2Elt;
4293  SmallVector<int, 8> N2Index;
4294  for (int I = 0; I != Length; ++I) {
4295    if (ShuffleMask[I] != I) {
4296      NV1Elt.push_back(ShuffleMask[I]);
4297      N1Index.push_back(I);
4298    }
4299  }
4300  for (int I = 0; I != Length; ++I) {
4301    if (ShuffleMask[I] != (I + V1EltNum)) {
4302      NV2Elt.push_back(ShuffleMask[I]);
4303      N2Index.push_back(I);
4304    }
4305  }
4306
  // Decide which vector to use as the insertion base. If every lane
  // mismatches for both V1 and V2, start from an UNDEF vector instead.
4309  SDValue InsV = V1;
4310  SmallVector<int, 8> InsMasks = NV1Elt;
4311  SmallVector<int, 8> InsIndex = N1Index;
4312  if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
4313    if (NV1Elt.size() > NV2Elt.size()) {
4314      InsV = V2;
4315      InsMasks = NV2Elt;
4316      InsIndex = N2Index;
4317    }
4318  } else {
    InsV = DAG.getUNDEF(VT);
4320  }
4321
4322  for (int I = 0, E = InsMasks.size(); I != E; ++I) {
4323    SDValue ExtV = V1;
4324    int Mask = InsMasks[I];
4325    if (Mask >= V1EltNum) {
4326      ExtV = V2;
4327      Mask -= V1EltNum;
4328    }
    // Any value type smaller than i32 is illegal on AArch64, and this
    // lowering function runs after type legalization, so we have to use a
    // legal scalar type for the extracted element here.
4332    EVT EltVT;
4333    if (VT.getVectorElementType().isFloatingPoint())
4334      EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
4335    else
4336      EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
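    // E.g. for a v8i8 shuffle the element is extracted as an i32 and
    // INSERT_VECTOR_ELT implicitly truncates it back to an i8 lane.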
4337
4338    if (Mask >= 0) {
4339      ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
4340                         DAG.getConstant(Mask, MVT::i64));
4341      InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
4342                         DAG.getConstant(InsIndex[I], MVT::i64));
4343    }
4344  }
4345  return InsV;
4346}
4347
4348AArch64TargetLowering::ConstraintType
4349AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
4350  if (Constraint.size() == 1) {
4351    switch (Constraint[0]) {
4352    default: break;
4353    case 'w': // An FP/SIMD vector register
4354      return C_RegisterClass;
4355    case 'I': // Constant that can be used with an ADD instruction
4356    case 'J': // Constant that can be used with a SUB instruction
4357    case 'K': // Constant that can be used with a 32-bit logical instruction
4358    case 'L': // Constant that can be used with a 64-bit logical instruction
4359    case 'M': // Constant that can be used as a 32-bit MOV immediate
4360    case 'N': // Constant that can be used as a 64-bit MOV immediate
4361    case 'Y': // Floating point constant zero
4362    case 'Z': // Integer constant zero
4363      return C_Other;
4364    case 'Q': // A memory reference with base register and no offset
4365      return C_Memory;
4366    case 'S': // A symbolic address
4367      return C_Other;
4368    }
4369  }
4370
4371  // FIXME: Ump, Utf, Usa, Ush
4372  // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes,
4373  //      whatever they may be
4374  // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be
4375  // Usa: An absolute symbolic address
4376  // Ush: The high part (bits 32:12) of a pc-relative symbolic address
4377  assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa"
4378         && Constraint != "Ush" && "Unimplemented constraints");
4379
4380  return TargetLowering::getConstraintType(Constraint);
4381}
4382
4383TargetLowering::ConstraintWeight
4384AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info,
4385                                                const char *Constraint) const {
4386
4387  llvm_unreachable("Constraint weight unimplemented");
4388}
4389
4390void
4391AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
4392                                                    std::string &Constraint,
4393                                                    std::vector<SDValue> &Ops,
4394                                                    SelectionDAG &DAG) const {
4395  SDValue Result(0, 0);
4396
4397  // Only length 1 constraints are C_Other.
4398  if (Constraint.size() != 1) return;
4399
  // Only C_Other constraints get lowered like this. For us that mostly means
  // constants, so return early if there's no hope the constraint can be
  // lowered.
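  // As a minimal illustrative example (not from the test suite), the 'I'
  // constraint handled below accepts an ADD-style immediate in [0, 0xfff]:
  //   asm("add %0, %1, %2" : "=r"(out) : "r"(in), "I"(4095));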
4402
4403  switch(Constraint[0]) {
4404  default: break;
4405  case 'I': case 'J': case 'K': case 'L':
4406  case 'M': case 'N': case 'Z': {
4407    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
4408    if (!C)
4409      return;
4410
4411    uint64_t CVal = C->getZExtValue();
4412    uint32_t Bits;
4413
4414    switch (Constraint[0]) {
4415    default:
4416      // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J'
4417      // is a peculiarly useless SUB constraint.
4418      llvm_unreachable("Unimplemented C_Other constraint");
4419    case 'I':
4420      if (CVal <= 0xfff)
4421        break;
4422      return;
4423    case 'K':
4424      if (A64Imms::isLogicalImm(32, CVal, Bits))
4425        break;
4426      return;
4427    case 'L':
4428      if (A64Imms::isLogicalImm(64, CVal, Bits))
4429        break;
4430      return;
4431    case 'Z':
4432      if (CVal == 0)
4433        break;
4434      return;
4435    }
4436
4437    Result = DAG.getTargetConstant(CVal, Op.getValueType());
4438    break;
4439  }
4440  case 'S': {
4441    // An absolute symbolic address or label reference.
4442    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
4443      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
4444                                          GA->getValueType(0));
4445    } else if (const BlockAddressSDNode *BA
4446                 = dyn_cast<BlockAddressSDNode>(Op)) {
4447      Result = DAG.getTargetBlockAddress(BA->getBlockAddress(),
4448                                         BA->getValueType(0));
4449    } else if (const ExternalSymbolSDNode *ES
4450                 = dyn_cast<ExternalSymbolSDNode>(Op)) {
4451      Result = DAG.getTargetExternalSymbol(ES->getSymbol(),
4452                                           ES->getValueType(0));
4453    } else
4454      return;
4455    break;
4456  }
4457  case 'Y':
4458    if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
4459      if (CFP->isExactlyValue(0.0)) {
4460        Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0));
4461        break;
4462      }
4463    }
4464    return;
4465  }
4466
4467  if (Result.getNode()) {
4468    Ops.push_back(Result);
4469    return;
4470  }
4471
4472  // It's an unknown constraint for us. Let generic code have a go.
4473  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
4474}
4475
4476std::pair<unsigned, const TargetRegisterClass*>
4477AArch64TargetLowering::getRegForInlineAsmConstraint(
4478                                                  const std::string &Constraint,
4479                                                  MVT VT) const {
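  // For example, an "=w" operand of type float is allocated from FPR32
  // (s0-s31) and a 128-bit vector operand from FPR128 (q0-q31), while "r"
  // operands use GPR32 or GPR64 depending on size.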
4480  if (Constraint.size() == 1) {
4481    switch (Constraint[0]) {
4482    case 'r':
4483      if (VT.getSizeInBits() <= 32)
4484        return std::make_pair(0U, &AArch64::GPR32RegClass);
4485      else if (VT == MVT::i64)
4486        return std::make_pair(0U, &AArch64::GPR64RegClass);
4487      break;
4488    case 'w':
4489      if (VT == MVT::f16)
4490        return std::make_pair(0U, &AArch64::FPR16RegClass);
4491      else if (VT == MVT::f32)
4492        return std::make_pair(0U, &AArch64::FPR32RegClass);
4493      else if (VT.getSizeInBits() == 64)
4494        return std::make_pair(0U, &AArch64::FPR64RegClass);
4495      else if (VT.getSizeInBits() == 128)
4496        return std::make_pair(0U, &AArch64::FPR128RegClass);
4497      break;
4498    }
4499  }
4500
4501  // Use the default implementation in TargetLowering to convert the register
4502  // constraint into a member of a register class.
4503  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
4504}
4505
4506/// Represent NEON load and store intrinsics as MemIntrinsicNodes.
4507/// The associated MachineMemOperands record the alignment specified
4508/// in the intrinsic calls.
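/// For example (illustrative, ARM-style NEON intrinsic), a call such as
/// @llvm.arm.neon.vld2.v4i16(i8* %p, i32 8) loads two <4 x i16> vectors
/// (16 bytes), so memVT is conservatively set to v2i64 and the alignment is
/// taken from the trailing i32 argument.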
4509bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4510                                               const CallInst &I,
4511                                               unsigned Intrinsic) const {
4512  switch (Intrinsic) {
4513  case Intrinsic::arm_neon_vld1:
4514  case Intrinsic::arm_neon_vld2:
4515  case Intrinsic::arm_neon_vld3:
4516  case Intrinsic::arm_neon_vld4:
4517  case Intrinsic::aarch64_neon_vld1x2:
4518  case Intrinsic::aarch64_neon_vld1x3:
4519  case Intrinsic::aarch64_neon_vld1x4:
4520  case Intrinsic::arm_neon_vld2lane:
4521  case Intrinsic::arm_neon_vld3lane:
4522  case Intrinsic::arm_neon_vld4lane: {
4523    Info.opc = ISD::INTRINSIC_W_CHAIN;
4524    // Conservatively set memVT to the entire set of vectors loaded.
4525    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
4526    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
4527    Info.ptrVal = I.getArgOperand(0);
4528    Info.offset = 0;
4529    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
4530    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
4531    Info.vol = false; // volatile loads with NEON intrinsics not supported
4532    Info.readMem = true;
4533    Info.writeMem = false;
4534    return true;
4535  }
4536  case Intrinsic::arm_neon_vst1:
4537  case Intrinsic::arm_neon_vst2:
4538  case Intrinsic::arm_neon_vst3:
4539  case Intrinsic::arm_neon_vst4:
4540  case Intrinsic::aarch64_neon_vst1x2:
4541  case Intrinsic::aarch64_neon_vst1x3:
4542  case Intrinsic::aarch64_neon_vst1x4:
4543  case Intrinsic::arm_neon_vst2lane:
4544  case Intrinsic::arm_neon_vst3lane:
4545  case Intrinsic::arm_neon_vst4lane: {
4546    Info.opc = ISD::INTRINSIC_VOID;
4547    // Conservatively set memVT to the entire set of vectors stored.
4548    unsigned NumElts = 0;
4549    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
4550      Type *ArgTy = I.getArgOperand(ArgI)->getType();
4551      if (!ArgTy->isVectorTy())
4552        break;
4553      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
4554    }
4555    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
4556    Info.ptrVal = I.getArgOperand(0);
4557    Info.offset = 0;
4558    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
4559    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
4560    Info.vol = false; // volatile stores with NEON intrinsics not supported
4561    Info.readMem = false;
4562    Info.writeMem = true;
4563    return true;
4564  }
4565  default:
4566    break;
4567  }
4568
4569  return false;
4570}
4571