ARMISelLowering.cpp revision 32c2bfda77d54ca6ad8e08d2de03daa7ae432305
1//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that ARM uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "arm-isel"
16#include "ARMISelLowering.h"
17#include "ARM.h"
18#include "ARMCallingConv.h"
19#include "ARMConstantPoolValue.h"
20#include "ARMMachineFunctionInfo.h"
21#include "ARMPerfectShuffle.h"
22#include "ARMSubtarget.h"
23#include "ARMTargetMachine.h"
24#include "ARMTargetObjectFile.h"
25#include "MCTargetDesc/ARMAddressingModes.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/CodeGen/CallingConvLower.h"
29#include "llvm/CodeGen/IntrinsicLowering.h"
30#include "llvm/CodeGen/MachineBasicBlock.h"
31#include "llvm/CodeGen/MachineFrameInfo.h"
32#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineInstrBuilder.h"
34#include "llvm/CodeGen/MachineModuleInfo.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/SelectionDAG.h"
37#include "llvm/IR/CallingConv.h"
38#include "llvm/IR/Constants.h"
39#include "llvm/IR/Function.h"
40#include "llvm/IR/GlobalValue.h"
41#include "llvm/IR/Instruction.h"
42#include "llvm/IR/Instructions.h"
43#include "llvm/IR/Intrinsics.h"
44#include "llvm/IR/Type.h"
45#include "llvm/MC/MCSectionMachO.h"
46#include "llvm/Support/CommandLine.h"
47#include "llvm/Support/ErrorHandling.h"
48#include "llvm/Support/MathExtras.h"
49#include "llvm/Support/raw_ostream.h"
50#include "llvm/Target/TargetOptions.h"
51using namespace llvm;
52
53STATISTIC(NumTailCalls, "Number of tail calls");
54STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
55STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
56
57// This option should go away when tail calls fully work.
58static cl::opt<bool>
59EnableARMTailCalls("arm-tail-calls", cl::Hidden,
60  cl::desc("Generate tail calls (TEMPORARY OPTION)."),
61  cl::init(false));
62
63cl::opt<bool>
64EnableARMLongCalls("arm-long-calls", cl::Hidden,
65  cl::desc("Generate calls via indirect call instructions"),
66  cl::init(false));
67
68static cl::opt<bool>
69ARMInterworking("arm-interworking", cl::Hidden,
70  cl::desc("Enable / disable ARM interworking (for debugging only)"),
71  cl::init(true));
72
73namespace {
74  class ARMCCState : public CCState {
75  public:
76    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
77               const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
78               LLVMContext &C, ParmContext PC)
79        : CCState(CC, isVarArg, MF, TM, locs, C) {
80      assert(((PC == Call) || (PC == Prologue)) &&
81             "ARMCCState users must specify whether their context is call "
82             "or prologue generation.");
83      CallOrPrologue = PC;
84    }
85  };
86}
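// Note on ARMCCState's ParmContext: it records whether argument lowering is
// happening at a call site or in the function prologue; the byval argument
// handling appears to rely on this distinction when deciding how byval
// parameters are split between registers and the stack.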
87
88// The APCS parameter registers.
89static const uint16_t GPRArgRegs[] = {
90  ARM::R0, ARM::R1, ARM::R2, ARM::R3
91};
92
93void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
94                                       MVT PromotedBitwiseVT) {
95  if (VT != PromotedLdStVT) {
96    setOperationAction(ISD::LOAD, VT, Promote);
97    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
98
99    setOperationAction(ISD::STORE, VT, Promote);
100    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
101  }
102
103  MVT ElemTy = VT.getVectorElementType();
104  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
105    setOperationAction(ISD::SETCC, VT, Custom);
106  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
107  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
108  if (ElemTy == MVT::i32) {
109    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
110    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
111    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
112    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
113  } else {
114    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
115    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
116    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
117    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
118  }
119  setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
120  setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
121  setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
122  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
123  setOperationAction(ISD::SELECT,            VT, Expand);
124  setOperationAction(ISD::SELECT_CC,         VT, Expand);
125  setOperationAction(ISD::VSELECT,           VT, Expand);
126  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
127  if (VT.isInteger()) {
128    setOperationAction(ISD::SHL, VT, Custom);
129    setOperationAction(ISD::SRA, VT, Custom);
130    setOperationAction(ISD::SRL, VT, Custom);
131  }
132
133  // Promote all bit-wise operations.
134  if (VT.isInteger() && VT != PromotedBitwiseVT) {
135    setOperationAction(ISD::AND, VT, Promote);
136    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
137    setOperationAction(ISD::OR,  VT, Promote);
138    AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
139    setOperationAction(ISD::XOR, VT, Promote);
140    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
141  }
142
143  // Neon does not support vector divide/remainder operations.
144  setOperationAction(ISD::SDIV, VT, Expand);
145  setOperationAction(ISD::UDIV, VT, Expand);
146  setOperationAction(ISD::FDIV, VT, Expand);
147  setOperationAction(ISD::SREM, VT, Expand);
148  setOperationAction(ISD::UREM, VT, Expand);
149  setOperationAction(ISD::FREM, VT, Expand);
150}
151
152void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
153  addRegisterClass(VT, &ARM::DPRRegClass);
154  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
155}
156
157void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
158  addRegisterClass(VT, &ARM::QPRRegClass);
159  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
160}
161
162static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
163  if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
164    return new TargetLoweringObjectFileMachO();
165
166  return new ARMElfTargetObjectFile();
167}
168
169ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
170    : TargetLowering(TM, createTLOF(TM)) {
171  Subtarget = &TM.getSubtarget<ARMSubtarget>();
172  RegInfo = TM.getRegisterInfo();
173  Itins = TM.getInstrItineraryData();
174
175  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
176
177  if (Subtarget->isTargetDarwin()) {
178    // Uses VFP for Thumb libfuncs if available.
179    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
180      // Single-precision floating-point arithmetic.
181      setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
182      setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
183      setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
184      setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
185
186      // Double-precision floating-point arithmetic.
187      setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
188      setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
189      setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
190      setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
191
192      // Single-precision comparisons.
193      setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
194      setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
195      setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
196      setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
197      setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
198      setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
199      setLibcallName(RTLIB::UO_F32,  "__unordsf2vfp");
200      setLibcallName(RTLIB::O_F32,   "__unordsf2vfp");
201
202      setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
203      setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
204      setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
205      setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
206      setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
207      setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
208      setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
209      setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
210
211      // Double-precision comparisons.
212      setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
213      setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
214      setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
215      setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
216      setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
217      setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
218      setLibcallName(RTLIB::UO_F64,  "__unorddf2vfp");
219      setLibcallName(RTLIB::O_F64,   "__unorddf2vfp");
220
221      setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
222      setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
223      setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
224      setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
225      setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
226      setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
227      setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
228      setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
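      // The *vfp comparison helpers above are understood to return a nonzero
      // value when the tested relation holds, so the setCmpLibcallCC entries
      // compare the call result against zero with the given condition.
      // O_F32/O_F64 ("ordered") reuse the unordered helpers and invert the
      // sense by testing the result with SETEQ against zero.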
229
230      // Floating-point to integer conversions.
231      // i64 conversions are done via library routines even when generating VFP
232      // instructions, so use the same ones.
233      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
234      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
235      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
236      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
237
238      // Conversions between floating types.
239      setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
240      setLibcallName(RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp");
241
242      // Integer to floating-point conversions.
243      // i64 conversions are done via library routines even when generating VFP
244      // instructions, so use the same ones.
245      // FIXME: There appears to be some naming inconsistency in ARM libgcc:
246      // e.g., __floatunsidf vs. __floatunssidfvfp.
247      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
248      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
249      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
250      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
251    }
252  }
253
254  // These libcalls are not available on 32-bit targets.
255  setLibcallName(RTLIB::SHL_I128, 0);
256  setLibcallName(RTLIB::SRL_I128, 0);
257  setLibcallName(RTLIB::SRA_I128, 0);
258
259  if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetDarwin()) {
260    // Double-precision floating-point arithmetic helper functions
261    // RTABI chapter 4.1.2, Table 2
262    setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
263    setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv");
264    setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul");
265    setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub");
266    setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS);
267    setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS);
268    setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS);
269    setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS);
270
271    // Double-precision floating-point comparison helper functions
272    // RTABI chapter 4.1.2, Table 3
273    setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq");
274    setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
275    setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq");
276    setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ);
277    setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt");
278    setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
279    setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple");
280    setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
281    setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge");
282    setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
283    setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt");
284    setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
285    setLibcallName(RTLIB::UO_F64,  "__aeabi_dcmpun");
286    setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
287    setLibcallName(RTLIB::O_F64,   "__aeabi_dcmpun");
288    setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
289    setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS);
290    setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS);
291    setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS);
292    setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS);
293    setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS);
294    setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS);
295    setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS);
296    setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS);
297
298    // Single-precision floating-point arithmetic helper functions
299    // RTABI chapter 4.1.2, Table 4
300    setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd");
301    setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv");
302    setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul");
303    setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub");
304    setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS);
305    setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS);
306    setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS);
307    setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS);
308
309    // Single-precision floating-point comparison helper functions
310    // RTABI chapter 4.1.2, Table 5
311    setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq");
312    setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
313    setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq");
314    setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ);
315    setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt");
316    setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
317    setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple");
318    setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
319    setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge");
320    setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
321    setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt");
322    setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
323    setLibcallName(RTLIB::UO_F32,  "__aeabi_fcmpun");
324    setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
325    setLibcallName(RTLIB::O_F32,   "__aeabi_fcmpun");
326    setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
327    setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS);
328    setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS);
329    setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS);
330    setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS);
331    setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS);
332    setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS);
333    setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS);
334    setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS);
335
336    // Floating-point to integer conversions.
337    // RTABI chapter 4.1.2, Table 6
338    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz");
339    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz");
340    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz");
341    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz");
342    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz");
343    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz");
344    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz");
345    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz");
346    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS);
347    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS);
348    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS);
349    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS);
350    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS);
351    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS);
352    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS);
353    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS);
354
355    // Conversions between floating types.
356    // RTABI chapter 4.1.2, Table 7
357    setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f");
358    setLibcallName(RTLIB::FPEXT_F32_F64,   "__aeabi_f2d");
359    setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS);
360    setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS);
361
362    // Integer to floating-point conversions.
363    // RTABI chapter 4.1.2, Table 8
364    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d");
365    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d");
366    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d");
367    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d");
368    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f");
369    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f");
370    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f");
371    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f");
372    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
373    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
374    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
375    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
376    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
377    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
378    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
379    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
380
381    // Long long helper functions
382    // RTABI chapter 4.2, Table 9
383    setLibcallName(RTLIB::MUL_I64,  "__aeabi_lmul");
384    setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl");
385    setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr");
386    setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr");
387    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS);
388    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
389    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
390    setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS);
391    setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS);
392    setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS);
393
394    // Integer division functions
395    // RTABI chapter 4.3.1
396    setLibcallName(RTLIB::SDIV_I8,  "__aeabi_idiv");
397    setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv");
398    setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv");
399    setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod");
400    setLibcallName(RTLIB::UDIV_I8,  "__aeabi_uidiv");
401    setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv");
402    setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv");
403    setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod");
404    setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS);
405    setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS);
406    setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS);
407    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
408    setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
409    setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
410    setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
411    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
412
413    // Memory operations
414    // RTABI chapter 4.3.4
415    setLibcallName(RTLIB::MEMCPY,  "__aeabi_memcpy");
416    setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove");
417    setLibcallName(RTLIB::MEMSET,  "__aeabi_memset");
418    setLibcallCallingConv(RTLIB::MEMCPY, CallingConv::ARM_AAPCS);
419    setLibcallCallingConv(RTLIB::MEMMOVE, CallingConv::ARM_AAPCS);
420    setLibcallCallingConv(RTLIB::MEMSET, CallingConv::ARM_AAPCS);
421  }
422
423  // Use divmod compiler-rt calls for iOS 5.0 and later.
424  if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
425      !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
426    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
427    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
428  }
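  // Note: compiler-rt's __divmodsi4/__udivmodsi4 compute the quotient and the
  // remainder in a single call (the remainder is returned through a pointer
  // argument), so a combined division+remainder can be lowered to one call on
  // these iOS releases.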
429
430  if (Subtarget->isThumb1Only())
431    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
432  else
433    addRegisterClass(MVT::i32, &ARM::GPRRegClass);
434  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
435      !Subtarget->isThumb1Only()) {
436    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
437    if (!Subtarget->isFPOnlySP())
438      addRegisterClass(MVT::f64, &ARM::DPRRegClass);
439
440    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
441  }
442
443  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
444       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
445    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
446         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
447      setTruncStoreAction((MVT::SimpleValueType)VT,
448                          (MVT::SimpleValueType)InnerVT, Expand);
449    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
450    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
451    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
452  }
453
454  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
455  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
456
457  if (Subtarget->hasNEON()) {
458    addDRTypeForNEON(MVT::v2f32);
459    addDRTypeForNEON(MVT::v8i8);
460    addDRTypeForNEON(MVT::v4i16);
461    addDRTypeForNEON(MVT::v2i32);
462    addDRTypeForNEON(MVT::v1i64);
463
464    addQRTypeForNEON(MVT::v4f32);
465    addQRTypeForNEON(MVT::v2f64);
466    addQRTypeForNEON(MVT::v16i8);
467    addQRTypeForNEON(MVT::v8i16);
468    addQRTypeForNEON(MVT::v4i32);
469    addQRTypeForNEON(MVT::v2i64);
470
471    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
472    // neither Neon nor VFP supports any arithmetic operations on it.
473    // The same applies to v4f32, but keep in mind that vadd, vsub, and vmul are
474    // natively supported for v4f32.
475    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
476    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
477    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
478    // FIXME: Code duplication: FDIV and FREM are expanded always, see
479    // ARMTargetLowering::addTypeForNEON method for details.
480    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
481    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
482    // FIXME: Create unittest.
483    // In other words, find a case where "copysign" appears in the DAG with
484    // vector operands.
485    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
486    // FIXME: Code duplication: SETCC has custom operation action, see
487    // ARMTargetLowering::addTypeForNEON method for details.
488    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
489    // FIXME: Create unittest for FNEG and for FABS.
490    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
491    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
492    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
493    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
494    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
495    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
496    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
497    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
498    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
499    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
500    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
501    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
502    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
503    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
504    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
505    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
506    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
507    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
508    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
509
510    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
511    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
512    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
513    setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
514    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
515    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
516    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
517    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
518    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
519    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
520    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
521    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
522    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
523    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
524    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
525
526    // Mark the same operations as Expand for v2f32.
527    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
528    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
529    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
530    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
531    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
532    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
533    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
534    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
535    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
536    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
537    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
538    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
539    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
540    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
541    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
542
543    // Neon does not support some operations on v1i64 and v2i64 types.
544    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
545    // Custom handling for some quad-vector types to detect VMULL.
546    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
547    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
548    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
549    // Custom handling for some vector types to avoid expensive expansions
550    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
551    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
552    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
553    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
554    setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
555    setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
556    // Neon does not have a single-instruction SINT_TO_FP or UINT_TO_FP with
557    // a destination type that is wider than the source, nor does it
558    // have a FP_TO_[SU]INT instruction with a narrower destination than
559    // the source.
560    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
561    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
562    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
563    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
564
565    setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
566    setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
567
568    // Custom expand long extensions to vectors.
569    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32,  Custom);
570    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32,  Custom);
571    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64,  Custom);
572    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64,  Custom);
573    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
574    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
575    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
576    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
577
578    // NEON does not have a single-instruction CTPOP for vectors with element
579    // types wider than 8 bits.  However, custom lowering can leverage the
580    // v8i8/v16i8 vcnt instruction.
581    setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
582    setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
583    setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
584    setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
585
586    // NEON only has FMA instructions as of VFP4.
587    if (!Subtarget->hasVFP4()) {
588      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
589      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
590    }
591
592    setTargetDAGCombine(ISD::INTRINSIC_VOID);
593    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
594    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
595    setTargetDAGCombine(ISD::SHL);
596    setTargetDAGCombine(ISD::SRL);
597    setTargetDAGCombine(ISD::SRA);
598    setTargetDAGCombine(ISD::SIGN_EXTEND);
599    setTargetDAGCombine(ISD::ZERO_EXTEND);
600    setTargetDAGCombine(ISD::ANY_EXTEND);
601    setTargetDAGCombine(ISD::SELECT_CC);
602    setTargetDAGCombine(ISD::BUILD_VECTOR);
603    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
604    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
605    setTargetDAGCombine(ISD::STORE);
606    setTargetDAGCombine(ISD::FP_TO_SINT);
607    setTargetDAGCombine(ISD::FP_TO_UINT);
608    setTargetDAGCombine(ISD::FDIV);
609
610    // It is legal to extload from v4i8 to v4i16 or v4i32.
611    MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8,
612                  MVT::v4i16, MVT::v2i16,
613                  MVT::v2i32};
614    for (unsigned i = 0; i < 6; ++i) {
615      setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal);
616      setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal);
617      setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal);
618    }
619  }
620
621  // ARM and Thumb2 support UMLAL/SMLAL.
622  if (!Subtarget->isThumb1Only())
623    setTargetDAGCombine(ISD::ADDC);
624
625
626  computeRegisterProperties();
627
628  // ARM does not have f32 extending load.
629  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
630
631  // ARM does not have i1 sign extending load.
632  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
633
634  // ARM supports all 4 flavors of integer indexed load / store.
635  if (!Subtarget->isThumb1Only()) {
636    for (unsigned im = (unsigned)ISD::PRE_INC;
637         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
638      setIndexedLoadAction(im,  MVT::i1,  Legal);
639      setIndexedLoadAction(im,  MVT::i8,  Legal);
640      setIndexedLoadAction(im,  MVT::i16, Legal);
641      setIndexedLoadAction(im,  MVT::i32, Legal);
642      setIndexedStoreAction(im, MVT::i1,  Legal);
643      setIndexedStoreAction(im, MVT::i8,  Legal);
644      setIndexedStoreAction(im, MVT::i16, Legal);
645      setIndexedStoreAction(im, MVT::i32, Legal);
646    }
647  }
648
649  // i64 operation support.
650  setOperationAction(ISD::MUL,     MVT::i64, Expand);
651  setOperationAction(ISD::MULHU,   MVT::i32, Expand);
652  if (Subtarget->isThumb1Only()) {
653    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
654    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
655  }
656  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
657      || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP()))
658    setOperationAction(ISD::MULHS, MVT::i32, Expand);
659
660  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
661  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
662  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
663  setOperationAction(ISD::SRL,       MVT::i64, Custom);
664  setOperationAction(ISD::SRA,       MVT::i64, Custom);
665
666  if (!Subtarget->isThumb1Only()) {
667    // FIXME: We should do this for Thumb1 as well.
668    setOperationAction(ISD::ADDC,    MVT::i32, Custom);
669    setOperationAction(ISD::ADDE,    MVT::i32, Custom);
670    setOperationAction(ISD::SUBC,    MVT::i32, Custom);
671    setOperationAction(ISD::SUBE,    MVT::i32, Custom);
672  }
673
674  // ARM does not have ROTL.
675  setOperationAction(ISD::ROTL,  MVT::i32, Expand);
676  setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
677  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
678  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
679    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
680
681  // These just redirect to CTTZ and CTLZ on ARM.
682  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i32  , Expand);
683  setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i32  , Expand);
684
685  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
686
687  // BSWAP (the REV instruction) is only available on ARMv6 and later.
688  if (!Subtarget->hasV6Ops())
689    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
690
691  if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
692      !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
693    // These are expanded into libcalls if the CPU doesn't have a HW divider.
694    setOperationAction(ISD::SDIV,  MVT::i32, Expand);
695    setOperationAction(ISD::UDIV,  MVT::i32, Expand);
696  }
697
698  // FIXME: Also set divmod for SREM on EABI
699  setOperationAction(ISD::SREM,  MVT::i32, Expand);
700  setOperationAction(ISD::UREM,  MVT::i32, Expand);
701  // Register based DivRem for AEABI (RTABI 4.2)
702  if (Subtarget->isTargetAEABI()) {
703    setLibcallName(RTLIB::SDIVREM_I8,  "__aeabi_idivmod");
704    setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
705    setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
706    setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod");
707    setLibcallName(RTLIB::UDIVREM_I8,  "__aeabi_uidivmod");
708    setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod");
709    setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod");
710    setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod");
711
712    setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
713    setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
714    setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
715    setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
716    setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
717    setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
718    setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
719    setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
720
721    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
722    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
723  } else {
724    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
725    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
726  }
727
728  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
729  setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
730  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
731  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
732  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
733
734  setOperationAction(ISD::TRAP, MVT::Other, Legal);
735
736  // Use the default implementation.
737  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
738  setOperationAction(ISD::VAARG,              MVT::Other, Expand);
739  setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
740  setOperationAction(ISD::VAEND,              MVT::Other, Expand);
741  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
742  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
743
744  if (!Subtarget->isTargetDarwin()) {
745    // Non-Darwin platforms may return values in these registers via the
746    // personality function.
747    setExceptionPointerRegister(ARM::R0);
748    setExceptionSelectorRegister(ARM::R1);
749  }
750
751  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
752  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
753  // the default expansion.
754  // FIXME: This should be checking for v6k, not just v6.
755  if (Subtarget->hasDataBarrier() ||
756      (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
757    // membarrier needs custom lowering; the rest are legal and handled
758    // normally.
759    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
760    // Custom lowering for 64-bit ops
761    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
762    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
763    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
764    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
765    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
766    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Custom);
767    setOperationAction(ISD::ATOMIC_LOAD_MIN,  MVT::i64, Custom);
768    setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i64, Custom);
769    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
770    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
771    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
772    // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
773    setInsertFencesForAtomic(true);
774  } else {
775    // Set them all for expansion, which will force libcalls.
776    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
777    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
778    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
779    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
780    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
781    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
782    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
783    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
784    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
785    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
786    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
787    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
788    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
789    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
790    // Unordered/Monotonic case.
791    setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
792    setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
793  }
794
795  setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
796
797  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
798  if (!Subtarget->hasV6Ops()) {
799    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
800    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
801  }
802  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
803
804  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
805      !Subtarget->isThumb1Only()) {
806    // Turn f64->i64 into VMOVRRD and i64->f64 into VMOVDRR,
807    // iff the target supports VFP2.
808    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
809    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
810  }
811
812  // We want to custom lower some of our intrinsics.
813  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
814  if (Subtarget->isTargetDarwin()) {
815    setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
816    setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
817    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
818  }
819
820  setOperationAction(ISD::SETCC,     MVT::i32, Expand);
821  setOperationAction(ISD::SETCC,     MVT::f32, Expand);
822  setOperationAction(ISD::SETCC,     MVT::f64, Expand);
823  setOperationAction(ISD::SELECT,    MVT::i32, Custom);
824  setOperationAction(ISD::SELECT,    MVT::f32, Custom);
825  setOperationAction(ISD::SELECT,    MVT::f64, Custom);
826  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
827  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
828  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
829
830  setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
831  setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
832  setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
833  setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
834  setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
835
836  // We don't support sin/cos/fmod/copysign/pow
837  setOperationAction(ISD::FSIN,      MVT::f64, Expand);
838  setOperationAction(ISD::FSIN,      MVT::f32, Expand);
839  setOperationAction(ISD::FCOS,      MVT::f32, Expand);
840  setOperationAction(ISD::FCOS,      MVT::f64, Expand);
841  setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
842  setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
843  setOperationAction(ISD::FREM,      MVT::f64, Expand);
844  setOperationAction(ISD::FREM,      MVT::f32, Expand);
845  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
846      !Subtarget->isThumb1Only()) {
847    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
848    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
849  }
850  setOperationAction(ISD::FPOW,      MVT::f64, Expand);
851  setOperationAction(ISD::FPOW,      MVT::f32, Expand);
852
853  if (!Subtarget->hasVFP4()) {
854    setOperationAction(ISD::FMA, MVT::f64, Expand);
855    setOperationAction(ISD::FMA, MVT::f32, Expand);
856  }
857
858  // Various VFP goodness
859  if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) {
860    // int <-> fp are custom expanded into bit_convert + ARMISD ops.
861    if (Subtarget->hasVFP2()) {
862      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
863      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
864      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
865      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
866    }
867    // Special handling for half-precision FP.
868    if (!Subtarget->hasFP16()) {
869      setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
870      setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
871    }
872  }
873
874  // We have target-specific dag combine patterns for the following nodes:
875  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
876  setTargetDAGCombine(ISD::ADD);
877  setTargetDAGCombine(ISD::SUB);
878  setTargetDAGCombine(ISD::MUL);
879  setTargetDAGCombine(ISD::AND);
880  setTargetDAGCombine(ISD::OR);
881  setTargetDAGCombine(ISD::XOR);
882
883  if (Subtarget->hasV6Ops())
884    setTargetDAGCombine(ISD::SRL);
885
886  setStackPointerRegisterToSaveRestore(ARM::SP);
887
888  if (TM.Options.UseSoftFloat || Subtarget->isThumb1Only() ||
889      !Subtarget->hasVFP2())
890    setSchedulingPreference(Sched::RegPressure);
891  else
892    setSchedulingPreference(Sched::Hybrid);
893
894  //// temporary - rewrite interface to use type
895  MaxStoresPerMemset = 8;
896  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
897  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
898  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
899  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
900  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
901
902  // On ARM arguments smaller than 4 bytes are extended, so all arguments
903  // are at least 4 bytes aligned.
904  setMinStackArgumentAlignment(4);
905
906  // Prefer likely predicted branches to selects on out-of-order cores.
907  PredictableSelectIsExpensive = Subtarget->isLikeA9();
908
909  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
910}
911
912// FIXME: It might make sense to define the representative register class as the
913// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
914// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
915// SPR's representative would be DPR_VFP2. This should work well if register
916// pressure tracking were modified such that a register use would increment the
917// pressure of the register class's representative and all of its super
918// classes' representatives transitively. We have not implemented this because
919// of the difficulty prior to coalescing of modeling operand register classes
920// due to the common occurrence of cross class copies and subregister insertions
921// and extractions.
922std::pair<const TargetRegisterClass*, uint8_t>
923ARMTargetLowering::findRepresentativeClass(MVT VT) const{
924  const TargetRegisterClass *RRC = 0;
925  uint8_t Cost = 1;
926  switch (VT.SimpleTy) {
927  default:
928    return TargetLowering::findRepresentativeClass(VT);
929  // Use DPR as the representative register class for all floating point
930  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
931  // the cost is 1 for both f32 and f64.
932  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
933  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
934    RRC = &ARM::DPRRegClass;
935    // When NEON is used for SP, only half of the register file is available
936    // because operations that define both SP and DP results will be constrained
937    // to the VFP2 class (D0-D15). We currently model this constraint prior to
938    // coalescing by double-counting the SP regs. See the FIXME above.
939    if (Subtarget->useNEONForSinglePrecisionFP())
940      Cost = 2;
941    break;
942  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
943  case MVT::v4f32: case MVT::v2f64:
944    RRC = &ARM::DPRRegClass;
945    Cost = 2;
946    break;
947  case MVT::v4i64:
948    RRC = &ARM::DPRRegClass;
949    Cost = 4;
950    break;
951  case MVT::v8i64:
952    RRC = &ARM::DPRRegClass;
953    Cost = 8;
954    break;
955  }
956  return std::make_pair(RRC, Cost);
957}
958
959const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
960  switch (Opcode) {
961  default: return 0;
962  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
963  case ARMISD::WrapperDYN:    return "ARMISD::WrapperDYN";
964  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
965  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
966  case ARMISD::CALL:          return "ARMISD::CALL";
967  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
968  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
969  case ARMISD::tCALL:         return "ARMISD::tCALL";
970  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
971  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
972  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
973  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
974  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
975  case ARMISD::CMP:           return "ARMISD::CMP";
976  case ARMISD::CMN:           return "ARMISD::CMN";
977  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
978  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
979  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
980  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
981  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
982
983  case ARMISD::CMOV:          return "ARMISD::CMOV";
984
985  case ARMISD::RBIT:          return "ARMISD::RBIT";
986
987  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
988  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
989  case ARMISD::SITOF:         return "ARMISD::SITOF";
990  case ARMISD::UITOF:         return "ARMISD::UITOF";
991
992  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
993  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
994  case ARMISD::RRX:           return "ARMISD::RRX";
995
996  case ARMISD::ADDC:          return "ARMISD::ADDC";
997  case ARMISD::ADDE:          return "ARMISD::ADDE";
998  case ARMISD::SUBC:          return "ARMISD::SUBC";
999  case ARMISD::SUBE:          return "ARMISD::SUBE";
1000
1001  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1002  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1003
1004  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1005  case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
1006
1007  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1008
1009  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1010
1011  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1012
1013  case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
1014  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1015
1016  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1017
1018  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
1019  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
1020  case ARMISD::VCGE:          return "ARMISD::VCGE";
1021  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
1022  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
1023  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
1024  case ARMISD::VCGT:          return "ARMISD::VCGT";
1025  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
1026  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
1027  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
1028  case ARMISD::VTST:          return "ARMISD::VTST";
1029
1030  case ARMISD::VSHL:          return "ARMISD::VSHL";
1031  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
1032  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
1033  case ARMISD::VSHLLs:        return "ARMISD::VSHLLs";
1034  case ARMISD::VSHLLu:        return "ARMISD::VSHLLu";
1035  case ARMISD::VSHLLi:        return "ARMISD::VSHLLi";
1036  case ARMISD::VSHRN:         return "ARMISD::VSHRN";
1037  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
1038  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
1039  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
1040  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
1041  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
1042  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
1043  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
1044  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
1045  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
1046  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
1047  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
1048  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
1049  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1050  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1051  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1052  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1053  case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1054  case ARMISD::VDUP:          return "ARMISD::VDUP";
1055  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1056  case ARMISD::VEXT:          return "ARMISD::VEXT";
1057  case ARMISD::VREV64:        return "ARMISD::VREV64";
1058  case ARMISD::VREV32:        return "ARMISD::VREV32";
1059  case ARMISD::VREV16:        return "ARMISD::VREV16";
1060  case ARMISD::VZIP:          return "ARMISD::VZIP";
1061  case ARMISD::VUZP:          return "ARMISD::VUZP";
1062  case ARMISD::VTRN:          return "ARMISD::VTRN";
1063  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1064  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1065  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1066  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1067  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1068  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1069  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1070  case ARMISD::FMAX:          return "ARMISD::FMAX";
1071  case ARMISD::FMIN:          return "ARMISD::FMIN";
1072  case ARMISD::BFI:           return "ARMISD::BFI";
1073  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1074  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1075  case ARMISD::VBSL:          return "ARMISD::VBSL";
1076  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1077  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1078  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1079  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1080  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1081  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1082  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1083  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1084  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1085  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1086  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1087  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1088  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1089  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1090  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1091  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1092  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1093  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1094  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1095  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1096
1097  case ARMISD::ATOMADD64_DAG:     return "ATOMADD64_DAG";
1098  case ARMISD::ATOMSUB64_DAG:     return "ATOMSUB64_DAG";
1099  case ARMISD::ATOMOR64_DAG:      return "ATOMOR64_DAG";
1100  case ARMISD::ATOMXOR64_DAG:     return "ATOMXOR64_DAG";
1101  case ARMISD::ATOMAND64_DAG:     return "ATOMAND64_DAG";
1102  case ARMISD::ATOMNAND64_DAG:    return "ATOMNAND64_DAG";
1103  case ARMISD::ATOMSWAP64_DAG:    return "ATOMSWAP64_DAG";
1104  case ARMISD::ATOMCMPXCHG64_DAG: return "ATOMCMPXCHG64_DAG";
1105  case ARMISD::ATOMMIN64_DAG:     return "ATOMMIN64_DAG";
1106  case ARMISD::ATOMUMIN64_DAG:    return "ATOMUMIN64_DAG";
1107  case ARMISD::ATOMMAX64_DAG:     return "ATOMMAX64_DAG";
1108  case ARMISD::ATOMUMAX64_DAG:    return "ATOMUMAX64_DAG";
1109  }
1110}
1111
1112EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1113  if (!VT.isVector()) return getPointerTy();
1114  return VT.changeVectorElementTypeToInteger();
1115}
1116
1117/// getRegClassFor - Return the register class that should be used for the
1118/// specified value type.
1119const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
1120  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1121  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1122  // load / store 4 to 8 consecutive D registers.
1123  if (Subtarget->hasNEON()) {
1124    if (VT == MVT::v4i64)
1125      return &ARM::QQPRRegClass;
1126    if (VT == MVT::v8i64)
1127      return &ARM::QQQQPRRegClass;
1128  }
1129  return TargetLowering::getRegClassFor(VT);
1130}
1131
1132// Create a fast isel object.
1133FastISel *
1134ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1135                                  const TargetLibraryInfo *libInfo) const {
1136  return ARM::createFastISel(funcInfo, libInfo);
1137}
1138
1139/// getMaximalGlobalOffset - Returns the maximal possible offset which can
1140/// be used for loads / stores from the global.
1141unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
1142  return (Subtarget->isThumb1Only() ? 127 : 4095);
1143}
1144
1145Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1146  unsigned NumVals = N->getNumValues();
1147  if (!NumVals)
1148    return Sched::RegPressure;
1149
1150  for (unsigned i = 0; i != NumVals; ++i) {
1151    EVT VT = N->getValueType(i);
1152    if (VT == MVT::Glue || VT == MVT::Other)
1153      continue;
1154    if (VT.isFloatingPoint() || VT.isVector())
1155      return Sched::ILP;
1156  }
1157
1158  if (!N->isMachineOpcode())
1159    return Sched::RegPressure;
1160
1161  // Loads are scheduled for latency even if the instruction itinerary
1162  // is not available.
1163  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
1164  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1165
1166  if (MCID.getNumDefs() == 0)
1167    return Sched::RegPressure;
1168  if (!Itins->isEmpty() &&
1169      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1170    return Sched::ILP;
1171
1172  return Sched::RegPressure;
1173}
1174
1175//===----------------------------------------------------------------------===//
1176// Lowering Code
1177//===----------------------------------------------------------------------===//
1178
1179/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1180static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1181  switch (CC) {
1182  default: llvm_unreachable("Unknown condition code!");
1183  case ISD::SETNE:  return ARMCC::NE;
1184  case ISD::SETEQ:  return ARMCC::EQ;
1185  case ISD::SETGT:  return ARMCC::GT;
1186  case ISD::SETGE:  return ARMCC::GE;
1187  case ISD::SETLT:  return ARMCC::LT;
1188  case ISD::SETLE:  return ARMCC::LE;
1189  case ISD::SETUGT: return ARMCC::HI;
1190  case ISD::SETUGE: return ARMCC::HS;
1191  case ISD::SETULT: return ARMCC::LO;
1192  case ISD::SETULE: return ARMCC::LS;
1193  }
1194}
1195
1196/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1197static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1198                        ARMCC::CondCodes &CondCode2) {
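      // Most FP conditions map to a single ARM condition; CondCode2 stays AL
      // ("unused") unless two checks are needed, as for SETONE and SETUEQ below.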
1199  CondCode2 = ARMCC::AL;
1200  switch (CC) {
1201  default: llvm_unreachable("Unknown FP condition!");
1202  case ISD::SETEQ:
1203  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1204  case ISD::SETGT:
1205  case ISD::SETOGT: CondCode = ARMCC::GT; break;
1206  case ISD::SETGE:
1207  case ISD::SETOGE: CondCode = ARMCC::GE; break;
1208  case ISD::SETOLT: CondCode = ARMCC::MI; break;
1209  case ISD::SETOLE: CondCode = ARMCC::LS; break;
1210  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1211  case ISD::SETO:   CondCode = ARMCC::VC; break;
1212  case ISD::SETUO:  CondCode = ARMCC::VS; break;
1213  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1214  case ISD::SETUGT: CondCode = ARMCC::HI; break;
1215  case ISD::SETUGE: CondCode = ARMCC::PL; break;
1216  case ISD::SETLT:
1217  case ISD::SETULT: CondCode = ARMCC::LT; break;
1218  case ISD::SETLE:
1219  case ISD::SETULE: CondCode = ARMCC::LE; break;
1220  case ISD::SETNE:
1221  case ISD::SETUNE: CondCode = ARMCC::NE; break;
1222  }
1223}
1224
1225//===----------------------------------------------------------------------===//
1226//                      Calling Convention Implementation
1227//===----------------------------------------------------------------------===//
1228
1229#include "ARMGenCallingConv.inc"
1230
1231/// CCAssignFnForNode - Selects the correct CCAssignFn for the
1232/// given CallingConvention value.
1233CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1234                                                 bool Return,
1235                                                 bool isVarArg) const {
1236  switch (CC) {
1237  default:
1238    llvm_unreachable("Unsupported calling convention");
1239  case CallingConv::Fast:
1240    if (Subtarget->hasVFP2() && !isVarArg) {
1241      if (!Subtarget->isAAPCS_ABI())
1242        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1243      // For AAPCS ABI targets, just use VFP variant of the calling convention.
1244      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1245    }
1246    // Fallthrough
1247  case CallingConv::C: {
1248    // Use target triple & subtarget features to do actual dispatch.
1249    if (!Subtarget->isAAPCS_ABI())
1250      return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1251    else if (Subtarget->hasVFP2() &&
1252             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1253             !isVarArg)
1254      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1255    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1256  }
1257  case CallingConv::ARM_AAPCS_VFP:
1258    if (!isVarArg)
1259      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1260    // Fallthrough
1261  case CallingConv::ARM_AAPCS:
1262    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1263  case CallingConv::ARM_APCS:
1264    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1265  case CallingConv::GHC:
1266    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1267  }
1268}
1269
1270/// LowerCallResult - Lower the result values of a call into the
1271/// appropriate copies out of appropriate physical registers.
1272SDValue
1273ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1274                                   CallingConv::ID CallConv, bool isVarArg,
1275                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1276                                   SDLoc dl, SelectionDAG &DAG,
1277                                   SmallVectorImpl<SDValue> &InVals,
1278                                   bool isThisReturn, SDValue ThisVal) const {
1279
1280  // Assign locations to each value returned by this call.
1281  SmallVector<CCValAssign, 16> RVLocs;
1282  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1283                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
1284  CCInfo.AnalyzeCallResult(Ins,
1285                           CCAssignFnForNode(CallConv, /* Return*/ true,
1286                                             isVarArg));
1287
1288  // Copy all of the result registers out of their specified physreg.
1289  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1290    CCValAssign VA = RVLocs[i];
1291
1292    // Pass the 'this' value directly from the argument to the return value,
1293    // to avoid register unit interference.
1294    if (i == 0 && isThisReturn) {
1295      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1296             "unexpected return calling convention register assignment");
1297      InVals.push_back(ThisVal);
1298      continue;
1299    }
1300
1301    SDValue Val;
1302    if (VA.needsCustom()) {
1303      // Handle f64 or half of a v2f64.
1304      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1305                                      InFlag);
1306      Chain = Lo.getValue(1);
1307      InFlag = Lo.getValue(2);
1308      VA = RVLocs[++i]; // skip ahead to next loc
1309      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1310                                      InFlag);
1311      Chain = Hi.getValue(1);
1312      InFlag = Hi.getValue(2);
1313      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1314
1315      if (VA.getLocVT() == MVT::v2f64) {
1316        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1317        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1318                          DAG.getConstant(0, MVT::i32));
1319
1320        VA = RVLocs[++i]; // skip ahead to next loc
1321        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1322        Chain = Lo.getValue(1);
1323        InFlag = Lo.getValue(2);
1324        VA = RVLocs[++i]; // skip ahead to next loc
1325        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1326        Chain = Hi.getValue(1);
1327        InFlag = Hi.getValue(2);
1328        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1329        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1330                          DAG.getConstant(1, MVT::i32));
1331      }
1332    } else {
1333      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1334                               InFlag);
1335      Chain = Val.getValue(1);
1336      InFlag = Val.getValue(2);
1337    }
1338
1339    switch (VA.getLocInfo()) {
1340    default: llvm_unreachable("Unknown loc info!");
1341    case CCValAssign::Full: break;
1342    case CCValAssign::BCvt:
1343      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1344      break;
1345    }
1346
1347    InVals.push_back(Val);
1348  }
1349
1350  return Chain;
1351}
1352
1353/// LowerMemOpCallTo - Store the argument to the stack.
1354SDValue
1355ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
1356                                    SDValue StackPtr, SDValue Arg,
1357                                    SDLoc dl, SelectionDAG &DAG,
1358                                    const CCValAssign &VA,
1359                                    ISD::ArgFlagsTy Flags) const {
1360  unsigned LocMemOffset = VA.getLocMemOffset();
1361  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1362  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1363  return DAG.getStore(Chain, dl, Arg, PtrOff,
1364                      MachinePointerInfo::getStack(LocMemOffset),
1365                      false, false, 0);
1366}
1367
1368void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
1369                                         SDValue Chain, SDValue &Arg,
1370                                         RegsToPassVector &RegsToPass,
1371                                         CCValAssign &VA, CCValAssign &NextVA,
1372                                         SDValue &StackPtr,
1373                                         SmallVectorImpl<SDValue> &MemOpChains,
1374                                         ISD::ArgFlagsTy Flags) const {
1375
1376  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1377                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
1378  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
1379
1380  if (NextVA.isRegLoc())
1381    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
1382  else {
1383    assert(NextVA.isMemLoc());
1384    if (StackPtr.getNode() == 0)
1385      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
1386
1387    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1),
1388                                           dl, DAG, NextVA,
1389                                           Flags));
1390  }
1391}
1392
1393/// LowerCall - Lower a call into a callseq_start <-
1394/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
1395/// nodes.
1396SDValue
1397ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1398                             SmallVectorImpl<SDValue> &InVals) const {
1399  SelectionDAG &DAG                     = CLI.DAG;
1400  SDLoc &dl                          = CLI.DL;
1401  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1402  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
1403  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
1404  SDValue Chain                         = CLI.Chain;
1405  SDValue Callee                        = CLI.Callee;
1406  bool &isTailCall                      = CLI.IsTailCall;
1407  CallingConv::ID CallConv              = CLI.CallConv;
1408  bool doesNotRet                       = CLI.DoesNotReturn;
1409  bool isVarArg                         = CLI.IsVarArg;
1410
1411  MachineFunction &MF = DAG.getMachineFunction();
1412  bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
1413  bool isThisReturn   = false;
1414  bool isSibCall      = false;
1415  // Disable tail calls if they're not supported.
1416  if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
1417    isTailCall = false;
1418  if (isTailCall) {
1419    // Check if it's really possible to do a tail call.
1420    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1421                    isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
1422                                                   Outs, OutVals, Ins, DAG);
1423    // We don't support GuaranteedTailCallOpt for ARM, only automatically
1424    // detected sibcalls.
1425    if (isTailCall) {
1426      ++NumTailCalls;
1427      isSibCall = true;
1428    }
1429  }
1430
1431  // Analyze operands of the call, assigning locations to each operand.
1432  SmallVector<CCValAssign, 16> ArgLocs;
1433  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1434                 getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
1435  CCInfo.AnalyzeCallOperands(Outs,
1436                             CCAssignFnForNode(CallConv, /* Return*/ false,
1437                                               isVarArg));
1438
1439  // Get a count of how many bytes are to be pushed on the stack.
1440  unsigned NumBytes = CCInfo.getNextStackOffset();
1441
1442  // For tail calls, memory operands are available in our caller's stack.
1443  if (isSibCall)
1444    NumBytes = 0;
1445
1446  // Adjust the stack pointer for the new arguments...
1447  // These operations are automatically eliminated by the prolog/epilog pass
1448  if (!isSibCall)
1449    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
1450                                 dl);
1451
1452  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
1453
1454  RegsToPassVector RegsToPass;
1455  SmallVector<SDValue, 8> MemOpChains;
1456
1457  // Walk the register/memloc assignments, inserting copies/loads.  In the case
1458  // of tail call optimization, arguments are handled later.
1459  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
1460       i != e;
1461       ++i, ++realArgIdx) {
1462    CCValAssign &VA = ArgLocs[i];
1463    SDValue Arg = OutVals[realArgIdx];
1464    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
1465    bool isByVal = Flags.isByVal();
1466
1467    // Promote the value if needed.
1468    switch (VA.getLocInfo()) {
1469    default: llvm_unreachable("Unknown loc info!");
1470    case CCValAssign::Full: break;
1471    case CCValAssign::SExt:
1472      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1473      break;
1474    case CCValAssign::ZExt:
1475      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1476      break;
1477    case CCValAssign::AExt:
1478      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1479      break;
1480    case CCValAssign::BCvt:
1481      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1482      break;
1483    }
1484
1485    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
1486    if (VA.needsCustom()) {
1487      if (VA.getLocVT() == MVT::v2f64) {
1488        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1489                                  DAG.getConstant(0, MVT::i32));
1490        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1491                                  DAG.getConstant(1, MVT::i32));
1492
1493        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
1494                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1495
1496        VA = ArgLocs[++i]; // skip ahead to next loc
1497        if (VA.isRegLoc()) {
1498          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
1499                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1500        } else {
1501          assert(VA.isMemLoc());
1502
1503          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
1504                                                 dl, DAG, VA, Flags));
1505        }
1506      } else {
1507        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
1508                         StackPtr, MemOpChains, Flags);
1509      }
1510    } else if (VA.isRegLoc()) {
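          // A first i32 argument marked 'returned' means the callee hands the same
          // value back; remember this so LowerCallResult can forward the outgoing
          // value instead of copying it back from the return register.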
1511      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
1512        assert(VA.getLocVT() == MVT::i32 &&
1513               "unexpected calling convention register assignment");
1514        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
1515               "unexpected use of 'returned'");
1516        isThisReturn = true;
1517      }
1518      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1519    } else if (isByVal) {
1520      assert(VA.isMemLoc());
1521      unsigned offset = 0;
1522
1523      // True if this byval aggregate will be split between registers
1524      // and memory.
1525      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
1526      unsigned CurByValIdx = CCInfo.getInRegsParamsProceed();
1527
1528      if (CurByValIdx < ByValArgsCount) {
1529
1530        unsigned RegBegin, RegEnd;
1531        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
1532
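            // Load the in-register portion of the byval aggregate word by word
            // from its memory image and pass each word in a GPR.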
1533        EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1534        unsigned int i, j;
1535        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
1536          SDValue Const = DAG.getConstant(4*i, MVT::i32);
1537          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
1538          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
1539                                     MachinePointerInfo(),
1540                                     false, false, false, 0);
1541          MemOpChains.push_back(Load.getValue(1));
1542          RegsToPass.push_back(std::make_pair(j, Load));
1543        }
1544
1545        // If the parameter size exceeds the register area, the "offset" value
1546        // helps us to calculate the stack slot for the remaining part properly.
1547        offset = RegEnd - RegBegin;
1548
1549        CCInfo.nextInRegsParam();
1550      }
1551
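          // Any remainder of the byval aggregate that does not fit in registers
          // is copied into its stack slot via an ARMISD::COPY_STRUCT_BYVAL node.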
1552      if (Flags.getByValSize() > 4*offset) {
1553        unsigned LocMemOffset = VA.getLocMemOffset();
1554        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
1555        SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
1556                                  StkPtrOff);
1557        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
1558        SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
1559        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
1560                                           MVT::i32);
1561        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
1562
1563        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
1564        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
1565        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
1566                                          Ops, array_lengthof(Ops)));
1567      }
1568    } else if (!isSibCall) {
1569      assert(VA.isMemLoc());
1570
1571      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1572                                             dl, DAG, VA, Flags));
1573    }
1574  }
1575
1576  if (!MemOpChains.empty())
1577    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1578                        &MemOpChains[0], MemOpChains.size());
1579
1580  // Build a sequence of copy-to-reg nodes chained together with token chain
1581  // and flag operands which copy the outgoing args into the appropriate regs.
1582  SDValue InFlag;
1583  // Tail call byval lowering might overwrite argument registers so in case of
1584  // tail call optimization the copies to registers are lowered later.
1585  if (!isTailCall)
1586    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1587      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1588                               RegsToPass[i].second, InFlag);
1589      InFlag = Chain.getValue(1);
1590    }
1591
1592  // For tail calls lower the arguments to the 'real' stack slot.
1593  if (isTailCall) {
1594    // Force all the incoming stack arguments to be loaded from the stack
1595    // before any new outgoing arguments are stored to the stack, because the
1596    // outgoing stack slots may alias the incoming argument stack slots, and
1597    // the alias isn't otherwise explicit. This is slightly more conservative
1598    // than necessary, because it means that each store effectively depends
1599    // on every argument instead of just those arguments it would clobber.
1600
1601    // Do not flag preceding copytoreg stuff together with the following stuff.
1602    InFlag = SDValue();
1603    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1604      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1605                               RegsToPass[i].second, InFlag);
1606      InFlag = Chain.getValue(1);
1607    }
1608    InFlag = SDValue();
1609  }
1610
1611  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1612  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1613  // node so that legalize doesn't hack it.
1614  bool isDirect = false;
1615  bool isARMFunc = false;
1616  bool isLocalARMFunc = false;
1617  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
1618
1619  if (EnableARMLongCalls) {
1620    assert (getTargetMachine().getRelocationModel() == Reloc::Static
1621            && "long-calls with non-static relocation model!");
1622    // Handle a global address or an external symbol. If it's not one of
1623    // those, the target's already in a register, so we don't need to do
1624    // anything extra.
1625    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1626      const GlobalValue *GV = G->getGlobal();
1627      // Create a constant pool entry for the callee address
1628      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1629      ARMConstantPoolValue *CPV =
1630        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
1631
1632      // Get the address of the callee into a register
1633      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1634      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1635      Callee = DAG.getLoad(getPointerTy(), dl,
1636                           DAG.getEntryNode(), CPAddr,
1637                           MachinePointerInfo::getConstantPool(),
1638                           false, false, false, 0);
1639    } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
1640      const char *Sym = S->getSymbol();
1641
1642      // Create a constant pool entry for the callee address
1643      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1644      ARMConstantPoolValue *CPV =
1645        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
1646                                      ARMPCLabelIndex, 0);
1647      // Get the address of the callee into a register
1648      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1649      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1650      Callee = DAG.getLoad(getPointerTy(), dl,
1651                           DAG.getEntryNode(), CPAddr,
1652                           MachinePointerInfo::getConstantPool(),
1653                           false, false, false, 0);
1654    }
1655  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1656    const GlobalValue *GV = G->getGlobal();
1657    isDirect = true;
1658    bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
1659    bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
1660                   getTargetMachine().getRelocationModel() != Reloc::Static;
1661    isARMFunc = !Subtarget->isThumb() || isStub;
1662    // ARM call to a local ARM function is predicable.
1663    isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
1664    // tBX takes a register source operand.
1665    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
1666      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1667      ARMConstantPoolValue *CPV =
1668        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4);
1669      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1670      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1671      Callee = DAG.getLoad(getPointerTy(), dl,
1672                           DAG.getEntryNode(), CPAddr,
1673                           MachinePointerInfo::getConstantPool(),
1674                           false, false, false, 0);
1675      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
1676      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
1677                           getPointerTy(), Callee, PICLabel);
1678    } else {
1679      // On ELF targets for PIC code, direct calls should go through the PLT
1680      unsigned OpFlags = 0;
1681      if (Subtarget->isTargetELF() &&
1682          getTargetMachine().getRelocationModel() == Reloc::PIC_)
1683        OpFlags = ARMII::MO_PLT;
1684      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
1685    }
1686  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1687    isDirect = true;
1688    bool isStub = Subtarget->isTargetDarwin() &&
1689                  getTargetMachine().getRelocationModel() != Reloc::Static;
1690    isARMFunc = !Subtarget->isThumb() || isStub;
1691    // tBX takes a register source operand.
1692    const char *Sym = S->getSymbol();
1693    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
1694      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1695      ARMConstantPoolValue *CPV =
1696        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
1697                                      ARMPCLabelIndex, 4);
1698      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1699      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1700      Callee = DAG.getLoad(getPointerTy(), dl,
1701                           DAG.getEntryNode(), CPAddr,
1702                           MachinePointerInfo::getConstantPool(),
1703                           false, false, false, 0);
1704      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
1705      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
1706                           getPointerTy(), Callee, PICLabel);
1707    } else {
1708      unsigned OpFlags = 0;
1709      // On ELF targets for PIC code, direct calls should go through the PLT
1710      if (Subtarget->isTargetELF() &&
1711                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
1712        OpFlags = ARMII::MO_PLT;
1713      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags);
1714    }
1715  }
1716
1717  // FIXME: handle tail calls differently.
1718  unsigned CallOpc;
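      // Pick the call opcode based on ARM vs Thumb mode, architecture version,
      // whether the callee is direct, and whether the call is expected to return.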
1719  bool HasMinSizeAttr = MF.getFunction()->getAttributes().
1720    hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
1721  if (Subtarget->isThumb()) {
1722    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
1723      CallOpc = ARMISD::CALL_NOLINK;
1724    else
1725      CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
1726  } else {
1727    if (!isDirect && !Subtarget->hasV5TOps())
1728      CallOpc = ARMISD::CALL_NOLINK;
1729    else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
1730               // Emit regular call when code size is the priority
1731               !HasMinSizeAttr)
1732      // "mov lr, pc; b _foo" to avoid confusing the RSP
1733      CallOpc = ARMISD::CALL_NOLINK;
1734    else
1735      CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
1736  }
1737
1738  std::vector<SDValue> Ops;
1739  Ops.push_back(Chain);
1740  Ops.push_back(Callee);
1741
1742  // Add argument registers to the end of the list so that they are known live
1743  // into the call.
1744  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1745    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1746                                  RegsToPass[i].second.getValueType()));
1747
1748  // Add a register mask operand representing the call-preserved registers.
1749  const uint32_t *Mask;
1750  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
1751  const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
1752  if (isThisReturn) {
1753    // For 'this' returns, use the R0-preserving mask if applicable
1754    Mask = ARI->getThisReturnPreservedMask(CallConv);
1755    if (!Mask) {
1756      // Set isThisReturn to false if the calling convention is not one that
1757      // allows 'returned' to be modeled in this way, so LowerCallResult does
1758      // not try to pass 'this' straight through
1759      isThisReturn = false;
1760      Mask = ARI->getCallPreservedMask(CallConv);
1761    }
1762  } else
1763    Mask = ARI->getCallPreservedMask(CallConv);
1764
1765  assert(Mask && "Missing call preserved mask for calling convention");
1766  Ops.push_back(DAG.getRegisterMask(Mask));
1767
1768  if (InFlag.getNode())
1769    Ops.push_back(InFlag);
1770
1771  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1772  if (isTailCall)
1773    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
1774
1775  // Returns a chain and a flag for retval copy to use.
1776  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
1777  InFlag = Chain.getValue(1);
1778
1779  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1780                             DAG.getIntPtrConstant(0, true), InFlag, dl);
1781  if (!Ins.empty())
1782    InFlag = Chain.getValue(1);
1783
1784  // Handle result values, copying them out of physregs into vregs that we
1785  // return.
1786  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
1787                         InVals, isThisReturn,
1788                         isThisReturn ? OutVals[0] : SDValue());
1789}
1790
1791/// HandleByVal - Every parameter *after* a byval parameter is passed
1792/// on the stack.  Remember the next parameter register to allocate,
1793/// and then confiscate the rest of the parameter registers to ensure
1794/// this.
1795void
1796ARMTargetLowering::HandleByVal(
1797    CCState *State, unsigned &size, unsigned Align) const {
1798  unsigned reg = State->AllocateReg(GPRArgRegs, 4);
1799  assert((State->getCallOrPrologue() == Prologue ||
1800          State->getCallOrPrologue() == Call) &&
1801         "unhandled ParmContext");
1802
1803  // For in-prologue parameter handling, we also introduce a stack offset
1804  // for byval registers: see CallingConvLower.cpp, CCState::HandleByVal.
1805  // This behaviour goes beyond the AAPCS rules (5.5 Parameter Passing) for how
1806  // the NSAA should be evaluated (NSAA means "next stacked argument address").
1807  // So: NextStackOffset = NSAAOffset + SizeOfByValParamsStoredInRegs.
1808  // Then: NSAAOffset = NextStackOffset - SizeOfByValParamsStoredInRegs.
1809  unsigned NSAAOffset = State->getNextStackOffset();
1810  if (State->getCallOrPrologue() != Call) {
1811    for (unsigned i = 0, e = State->getInRegsParamsCount(); i != e; ++i) {
1812      unsigned RB, RE;
1813      State->getInRegsParamInfo(i, RB, RE);
1814      assert(NSAAOffset >= (RE-RB)*4 &&
1815             "Stack offset for byval regs should cover their in-register size");
1816      NSAAOffset -= (RE-RB)*4;
1817    }
1818  }
1819  if ((ARM::R0 <= reg) && (reg <= ARM::R3)) {
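        // Under AAPCS, a byval argument aligned to more than 4 bytes must start
        // in a GPR that satisfies that alignment (e.g. an even-numbered register
        // for 8-byte alignment), so skip ("waste") registers until it does.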
1820    if (Subtarget->isAAPCS_ABI() && Align > 4) {
1821      unsigned AlignInRegs = Align / 4;
1822      unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
1823      for (unsigned i = 0; i < Waste; ++i)
1824        reg = State->AllocateReg(GPRArgRegs, 4);
1825    }
1826    if (reg != 0) {
1827      unsigned excess = 4 * (ARM::R4 - reg);
1828
1829      // Special case when NSAA != SP and the parameter size is greater than
1830      // the size of all remaining GPR regs. In that case we can't split the
1831      // parameter; we must send it to the stack. We also must set the NCRN to
1832      // R4, so all remaining registers are wasted.
1833      if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) {
1834        while (State->AllocateReg(GPRArgRegs, 4))
1835          ;
1836        return;
1837      }
1838
1839      // The first register for the byval parameter is the first register that
1840      // wasn't allocated before this method call, so it is "reg".
1841      // If the parameter is small enough to fit in the range [reg, r4), then
1842      // the end (one past the last) register is reg + param-size-in-regs;
1843      // otherwise the parameter is split between registers and stack, and
1844      // the end register is r4.
1845      unsigned ByValRegBegin = reg;
1846      unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4;
1847      State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
1848      // Note, the first register was already allocated at the beginning of this
1849      // method; allocate the remaining registers we need.
1850      for (unsigned i = reg+1; i != ByValRegEnd; ++i)
1851        State->AllocateReg(GPRArgRegs, 4);
1852      // At a call site, a byval parameter that is split between
1853      // registers and memory needs its size truncated here.  In a
1854      // function prologue, such byval parameters are reassembled in
1855      // memory, and are not truncated.
1856      if (State->getCallOrPrologue() == Call) {
1857        // Set the remaining size to 0 when the whole structure
1858        // can be stored in registers.
1859        if (size < excess)
1860          size = 0;
1861        else
1862          size -= excess;
1863      }
1864    }
1865  }
1866}
1867
1868/// MatchingStackOffset - Return true if the given stack call argument is
1869/// already available in the same relative position in the caller's
1870/// incoming argument stack.
1871static
1872bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
1873                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
1874                         const TargetInstrInfo *TII) {
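      // A stack argument only "matches" if it is a plain reload of the caller's
      // own incoming argument from the same fixed frame slot with the same size,
      // so no store is needed for the sibling call.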
1875  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
1876  int FI = INT_MAX;
1877  if (Arg.getOpcode() == ISD::CopyFromReg) {
1878    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
1879    if (!TargetRegisterInfo::isVirtualRegister(VR))
1880      return false;
1881    MachineInstr *Def = MRI->getVRegDef(VR);
1882    if (!Def)
1883      return false;
1884    if (!Flags.isByVal()) {
1885      if (!TII->isLoadFromStackSlot(Def, FI))
1886        return false;
1887    } else {
1888      return false;
1889    }
1890  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
1891    if (Flags.isByVal())
1892      // ByVal argument is passed in as a pointer but it's now being
1893      // dereferenced. e.g.
1894      // define @foo(%struct.X* %A) {
1895      //   tail call @bar(%struct.X* byval %A)
1896      // }
1897      return false;
1898    SDValue Ptr = Ld->getBasePtr();
1899    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
1900    if (!FINode)
1901      return false;
1902    FI = FINode->getIndex();
1903  } else
1904    return false;
1905
1906  assert(FI != INT_MAX);
1907  if (!MFI->isFixedObjectIndex(FI))
1908    return false;
1909  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
1910}
1911
1912/// IsEligibleForTailCallOptimization - Check whether the call is eligible
1913/// for tail call optimization. Targets which want to do tail call
1914/// optimization should implement this function.
1915bool
1916ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
1917                                                     CallingConv::ID CalleeCC,
1918                                                     bool isVarArg,
1919                                                     bool isCalleeStructRet,
1920                                                     bool isCallerStructRet,
1921                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
1922                                    const SmallVectorImpl<SDValue> &OutVals,
1923                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1924                                                     SelectionDAG& DAG) const {
1925  const Function *CallerF = DAG.getMachineFunction().getFunction();
1926  CallingConv::ID CallerCC = CallerF->getCallingConv();
1927  bool CCMatch = CallerCC == CalleeCC;
1928
1929  // Look for obvious safe cases to perform tail call optimization that do not
1930  // require ABI changes. This is what gcc calls sibcall.
1931
1932  // Do not sibcall optimize vararg calls unless the call site is not passing
1933  // any arguments.
1934  if (isVarArg && !Outs.empty())
1935    return false;
1936
1937  // Also avoid sibcall optimization if either caller or callee uses struct
1938  // return semantics.
1939  if (isCalleeStructRet || isCallerStructRet)
1940    return false;
1941
1942  // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
1943  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
1944  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
1945  // support in the assembler and linker to be used. This would need to be
1946  // fixed to fully support tail calls in Thumb1.
1947  //
1948  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
1949  // LR.  This means if we need to reload LR, it takes an extra instruction,
1950  // which outweighs the value of the tail call; but here we don't know yet
1951  // whether LR is going to be used.  Probably the right approach is to
1952  // generate the tail call here and turn it back into CALL/RET in
1953  // emitEpilogue if LR is used.
1954
1955  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
1956  // but we need to make sure there are enough registers; the only valid
1957  // registers are the 4 used for parameters.  We don't currently do this
1958  // case.
1959  if (Subtarget->isThumb1Only())
1960    return false;
1961
1962  // If the calling conventions do not match, then we'd better make sure the
1963  // results are returned in the same way as what the caller expects.
1964  if (!CCMatch) {
1965    SmallVector<CCValAssign, 16> RVLocs1;
1966    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
1967                       getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
1968    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
1969
1970    SmallVector<CCValAssign, 16> RVLocs2;
1971    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
1972                       getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
1973    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
1974
1975    if (RVLocs1.size() != RVLocs2.size())
1976      return false;
1977    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
1978      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
1979        return false;
1980      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
1981        return false;
1982      if (RVLocs1[i].isRegLoc()) {
1983        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
1984          return false;
1985      } else {
1986        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
1987          return false;
1988      }
1989    }
1990  }
1991
1992  // If Caller's vararg or byval argument has been split between registers and
1993  // stack, do not perform tail call, since part of the argument is in caller's
1994  // local frame.
1995  const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
1996                                      getInfo<ARMFunctionInfo>();
1997  if (AFI_Caller->getArgRegsSaveSize())
1998    return false;
1999
2000  // If the callee takes no arguments then go on to check the results of the
2001  // call.
2002  if (!Outs.empty()) {
2003    // Check if stack adjustment is needed. For now, do not do this if any
2004    // argument is passed on the stack.
2005    SmallVector<CCValAssign, 16> ArgLocs;
2006    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
2007                      getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
2008    CCInfo.AnalyzeCallOperands(Outs,
2009                               CCAssignFnForNode(CalleeCC, false, isVarArg));
2010    if (CCInfo.getNextStackOffset()) {
2011      MachineFunction &MF = DAG.getMachineFunction();
2012
2013      // Check if the arguments are already laid out in the right way as
2014      // the caller's fixed stack objects.
2015      MachineFrameInfo *MFI = MF.getFrameInfo();
2016      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2017      const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
2018      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2019           i != e;
2020           ++i, ++realArgIdx) {
2021        CCValAssign &VA = ArgLocs[i];
2022        EVT RegVT = VA.getLocVT();
2023        SDValue Arg = OutVals[realArgIdx];
2024        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2025        if (VA.getLocInfo() == CCValAssign::Indirect)
2026          return false;
2027        if (VA.needsCustom()) {
2028          // f64 and vector types are split into multiple registers or
2029          // register/stack-slot combinations.  The types will not match
2030          // the registers; give up on memory f64 refs until we figure
2031          // out what to do about this.
2032          if (!VA.isRegLoc())
2033            return false;
2034          if (!ArgLocs[++i].isRegLoc())
2035            return false;
2036          if (RegVT == MVT::v2f64) {
2037            if (!ArgLocs[++i].isRegLoc())
2038              return false;
2039            if (!ArgLocs[++i].isRegLoc())
2040              return false;
2041          }
2042        } else if (!VA.isRegLoc()) {
2043          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2044                                   MFI, MRI, TII))
2045            return false;
2046        }
2047      }
2048    }
2049  }
2050
2051  return true;
2052}
2053
2054bool
2055ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2056                                  MachineFunction &MF, bool isVarArg,
2057                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
2058                                  LLVMContext &Context) const {
2059  SmallVector<CCValAssign, 16> RVLocs;
2060  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
2061  return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
2062                                                    isVarArg));
2063}
2064
2065SDValue
2066ARMTargetLowering::LowerReturn(SDValue Chain,
2067                               CallingConv::ID CallConv, bool isVarArg,
2068                               const SmallVectorImpl<ISD::OutputArg> &Outs,
2069                               const SmallVectorImpl<SDValue> &OutVals,
2070                               SDLoc dl, SelectionDAG &DAG) const {
2071
2072  // CCValAssign - represent the assignment of the return value to a location.
2073  SmallVector<CCValAssign, 16> RVLocs;
2074
2075  // CCState - Info about the registers and stack slots.
2076  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
2077                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
2078
2079  // Analyze outgoing return values.
2080  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
2081                                               isVarArg));
2082
2083  SDValue Flag;
2084  SmallVector<SDValue, 4> RetOps;
2085  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2086
2087  // Copy the result values into the output registers.
2088  for (unsigned i = 0, realRVLocIdx = 0;
2089       i != RVLocs.size();
2090       ++i, ++realRVLocIdx) {
2091    CCValAssign &VA = RVLocs[i];
2092    assert(VA.isRegLoc() && "Can only return in registers!");
2093
2094    SDValue Arg = OutVals[realRVLocIdx];
2095
2096    switch (VA.getLocInfo()) {
2097    default: llvm_unreachable("Unknown loc info!");
2098    case CCValAssign::Full: break;
2099    case CCValAssign::BCvt:
2100      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2101      break;
2102    }
2103
2104    if (VA.needsCustom()) {
2105      if (VA.getLocVT() == MVT::v2f64) {
2106        // Extract the first half and return it in two registers.
2107        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2108                                   DAG.getConstant(0, MVT::i32));
2109        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
2110                                       DAG.getVTList(MVT::i32, MVT::i32), Half);
2111
2112        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
2113        Flag = Chain.getValue(1);
2114        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2115        VA = RVLocs[++i]; // skip ahead to next loc
2116        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2117                                 HalfGPRs.getValue(1), Flag);
2118        Flag = Chain.getValue(1);
2119        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2120        VA = RVLocs[++i]; // skip ahead to next loc
2121
2122        // Extract the 2nd half and fall through to handle it as an f64 value.
2123        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2124                          DAG.getConstant(1, MVT::i32));
2125      }
2126      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
2127      // available.
2128      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2129                                  DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
2130      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
2131      Flag = Chain.getValue(1);
2132      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2133      VA = RVLocs[++i]; // skip ahead to next loc
2134      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
2135                               Flag);
2136    } else
2137      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
2138
2139    // Guarantee that all emitted copies are
2140    // stuck together, avoiding something bad.
2141    Flag = Chain.getValue(1);
2142    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2143  }
2144
2145  // Update chain and glue.
2146  RetOps[0] = Chain;
2147  if (Flag.getNode())
2148    RetOps.push_back(Flag);
2149
2150  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
2151                     RetOps.data(), RetOps.size());
2152}
2153
2154bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2155  if (N->getNumValues() != 1)
2156    return false;
2157  if (!N->hasNUsesOfValue(1, 0))
2158    return false;
2159
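      // Walk from N through the copies that feed the return value; only succeed
      // if every such path ends in an ARMISD::RET_FLAG node.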
2160  SDValue TCChain = Chain;
2161  SDNode *Copy = *N->use_begin();
2162  if (Copy->getOpcode() == ISD::CopyToReg) {
2163    // If the copy has a glue operand, we conservatively assume it isn't safe to
2164    // perform a tail call.
2165    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2166      return false;
2167    TCChain = Copy->getOperand(0);
2168  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
2169    SDNode *VMov = Copy;
2170    // f64 returned in a pair of GPRs.
2171    SmallPtrSet<SDNode*, 2> Copies;
2172    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2173         UI != UE; ++UI) {
2174      if (UI->getOpcode() != ISD::CopyToReg)
2175        return false;
2176      Copies.insert(*UI);
2177    }
2178    if (Copies.size() > 2)
2179      return false;
2180
2181    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2182         UI != UE; ++UI) {
2183      SDValue UseChain = UI->getOperand(0);
2184      if (Copies.count(UseChain.getNode()))
2185        // Second CopyToReg
2186        Copy = *UI;
2187      else
2188        // First CopyToReg
2189        TCChain = UseChain;
2190    }
2191  } else if (Copy->getOpcode() == ISD::BITCAST) {
2192    // f32 returned in a single GPR.
2193    if (!Copy->hasOneUse())
2194      return false;
2195    Copy = *Copy->use_begin();
2196    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
2197      return false;
2198    TCChain = Copy->getOperand(0);
2199  } else {
2200    return false;
2201  }
2202
2203  bool HasRet = false;
2204  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2205       UI != UE; ++UI) {
2206    if (UI->getOpcode() != ARMISD::RET_FLAG)
2207      return false;
2208    HasRet = true;
2209  }
2210
2211  if (!HasRet)
2212    return false;
2213
2214  Chain = TCChain;
2215  return true;
2216}
2217
2218bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2219  if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
2220    return false;
2221
2222  if (!CI->isTailCall())
2223    return false;
2224
2225  return !Subtarget->isThumb1Only();
2226}
2227
2228// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
2229// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
2230// one of the above-mentioned nodes. It has to be wrapped because otherwise
2231// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
2232// be used to form an addressing mode. These wrapped nodes will be selected
2233// into MOVi.
2234static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
2235  EVT PtrVT = Op.getValueType();
2236  // FIXME there is no actual debug info here
2237  SDLoc dl(Op);
2238  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2239  SDValue Res;
2240  if (CP->isMachineConstantPoolEntry())
2241    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2242                                    CP->getAlignment());
2243  else
2244    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2245                                    CP->getAlignment());
2246  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2247}
2248
2249unsigned ARMTargetLowering::getJumpTableEncoding() const {
2250  return MachineJumpTableInfo::EK_Inline;
2251}
2252
2253SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2254                                             SelectionDAG &DAG) const {
2255  MachineFunction &MF = DAG.getMachineFunction();
2256  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2257  unsigned ARMPCLabelIndex = 0;
2258  SDLoc DL(Op);
2259  EVT PtrVT = getPointerTy();
2260  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2261  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2262  SDValue CPAddr;
2263  if (RelocM == Reloc::Static) {
2264    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2265  } else {
2266    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2267    ARMPCLabelIndex = AFI->createPICLabelUId();
2268    ARMConstantPoolValue *CPV =
2269      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2270                                      ARMCP::CPBlockAddress, PCAdj);
2271    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2272  }
2273  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2274  SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
2275                               MachinePointerInfo::getConstantPool(),
2276                               false, false, false, 0);
2277  if (RelocM == Reloc::Static)
2278    return Result;
2279  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2280  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2281}
2282
2283// Lower ISD::GlobalTLSAddress using the "general dynamic" model
2284SDValue
2285ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
2286                                                 SelectionDAG &DAG) const {
2287  SDLoc dl(GA);
2288  EVT PtrVT = getPointerTy();
2289  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2290  MachineFunction &MF = DAG.getMachineFunction();
2291  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2292  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2293  ARMConstantPoolValue *CPV =
2294    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2295                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
2296  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2297  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
2298  Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
2299                         MachinePointerInfo::getConstantPool(),
2300                         false, false, false, 0);
2301  SDValue Chain = Argument.getValue(1);
2302
2303  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2304  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
2305
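      // Argument now holds the address of the GD TLS entry for the global;
      // __tls_get_addr maps it to the variable's address in the current thread.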
2306  // call __tls_get_addr.
2307  ArgListTy Args;
2308  ArgListEntry Entry;
2309  Entry.Node = Argument;
2310  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
2311  Args.push_back(Entry);
2312  // FIXME: is there useful debug info available here?
2313  TargetLowering::CallLoweringInfo CLI(Chain,
2314                (Type *) Type::getInt32Ty(*DAG.getContext()),
2315                false, false, false, false,
2316                0, CallingConv::C, /*isTailCall=*/false,
2317                /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
2318                DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
2319  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2320  return CallResult.first;
2321}
2322
2323// Lower ISD::GlobalTLSAddress using the "initial exec" or
2324// "local exec" model.
2325SDValue
2326ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
2327                                        SelectionDAG &DAG,
2328                                        TLSModel::Model model) const {
2329  const GlobalValue *GV = GA->getGlobal();
2330  SDLoc dl(GA);
2331  SDValue Offset;
2332  SDValue Chain = DAG.getEntryNode();
2333  EVT PtrVT = getPointerTy();
2334  // Get the Thread Pointer
2335  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2336
2337  if (model == TLSModel::InitialExec) {
2338    MachineFunction &MF = DAG.getMachineFunction();
2339    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2340    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2341    // Initial exec model.
2342    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2343    ARMConstantPoolValue *CPV =
2344      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2345                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
2346                                      true);
2347    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2348    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2349    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
2350                         MachinePointerInfo::getConstantPool(),
2351                         false, false, false, 0);
2352    Chain = Offset.getValue(1);
2353
2354    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2355    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
2356
2357    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
2358                         MachinePointerInfo::getConstantPool(),
2359                         false, false, false, 0);
2360  } else {
2361    // local exec model
2362    assert(model == TLSModel::LocalExec);
2363    ARMConstantPoolValue *CPV =
2364      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
2365    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2366    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2367    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
2368                         MachinePointerInfo::getConstantPool(),
2369                         false, false, false, 0);
2370  }
2371
2372  // The address of the thread local variable is the add of the thread
2373  // pointer with the offset of the variable.
2374  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
2375}
2376
2377SDValue
2378ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
2379  // TODO: implement the "local dynamic" model
2380  assert(Subtarget->isTargetELF() &&
2381         "TLS not implemented for non-ELF targets");
2382  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2383
2384  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
2385
2386  switch (model) {
2387    case TLSModel::GeneralDynamic:
2388    case TLSModel::LocalDynamic:
2389      return LowerToTLSGeneralDynamicModel(GA, DAG);
2390    case TLSModel::InitialExec:
2391    case TLSModel::LocalExec:
2392      return LowerToTLSExecModels(GA, DAG, model);
2393  }
2394  llvm_unreachable("bogus TLS model");
2395}
2396
2397SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
2398                                                 SelectionDAG &DAG) const {
2399  EVT PtrVT = getPointerTy();
2400  SDLoc dl(Op);
2401  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2402  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2403    bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
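        // Symbols local to this module can be addressed GOT-relative (GOTOFF)
        // without an extra load; otherwise the address is loaded from its GOT
        // entry below.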
2404    ARMConstantPoolValue *CPV =
2405      ARMConstantPoolConstant::Create(GV,
2406                                      UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
2407    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2408    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2409    SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
2410                                 CPAddr,
2411                                 MachinePointerInfo::getConstantPool(),
2412                                 false, false, false, 0);
2413    SDValue Chain = Result.getValue(1);
2414    SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
2415    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
2416    if (!UseGOTOFF)
2417      Result = DAG.getLoad(PtrVT, dl, Chain, Result,
2418                           MachinePointerInfo::getGOT(),
2419                           false, false, false, 0);
2420    return Result;
2421  }
2422
2423  // If movw/movt are available, materialize the address directly with a
2424  // movw/movt pair. This is always cheaper.
2425  if (Subtarget->useMovt()) {
2426    ++NumMovwMovt;
2427    // FIXME: Once remat is capable of dealing with instructions with register
2428    // operands, expand this into two nodes.
2429    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
2430                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2431  } else {
2432    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
2433    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2434    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2435                       MachinePointerInfo::getConstantPool(),
2436                       false, false, false, 0);
2437  }
2438}
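// Rough illustration (the assembly is schematic, not verified output): with
// movw/movt available the non-PIC path materializes the address as
//   movw r0, :lower16:sym
//   movt r0, :upper16:sym
// while the PIC path loads sym(GOTOFF) or sym(GOT) from the literal pool and
// adds the GLOBAL_OFFSET_TABLE base, with an extra GOT load for non-local
// symbols.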
2439
2440SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
2441                                                    SelectionDAG &DAG) const {
2442  EVT PtrVT = getPointerTy();
2443  SDLoc dl(Op);
2444  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2445  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2446
2447  // FIXME: Enable this for static codegen when tool issues are fixed.  Also
2448  // update ARMFastISel::ARMMaterializeGV.
2449  if (Subtarget->useMovt() && RelocM != Reloc::Static) {
2450    ++NumMovwMovt;
2451    // FIXME: Once remat is capable of dealing with instructions with register
2452    // operands, expand this into two nodes.
2453    if (RelocM == Reloc::Static)
2454      return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
2455                                 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2456
2457    unsigned Wrapper = (RelocM == Reloc::PIC_)
2458      ? ARMISD::WrapperPIC : ARMISD::WrapperDYN;
2459    SDValue Result = DAG.getNode(Wrapper, dl, PtrVT,
2460                                 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2461    if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
2462      Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
2463                           MachinePointerInfo::getGOT(),
2464                           false, false, false, 0);
2465    return Result;
2466  }
2467
2468  unsigned ARMPCLabelIndex = 0;
2469  SDValue CPAddr;
2470  if (RelocM == Reloc::Static) {
2471    CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
2472  } else {
2473    ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2474    ARMPCLabelIndex = AFI->createPICLabelUId();
2475    unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
2476    ARMConstantPoolValue *CPV =
2477      ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue,
2478                                      PCAdj);
2479    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2480  }
2481  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2482
2483  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2484                               MachinePointerInfo::getConstantPool(),
2485                               false, false, false, 0);
2486  SDValue Chain = Result.getValue(1);
2487
2488  if (RelocM == Reloc::PIC_) {
2489    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2490    Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2491  }
2492
2493  if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
2494    Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(),
2495                         false, false, false, 0);
2496
2497  return Result;
2498}
2499
2500SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
2501                                                    SelectionDAG &DAG) const {
2502  assert(Subtarget->isTargetELF() &&
2503         "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
2504  MachineFunction &MF = DAG.getMachineFunction();
2505  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2506  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2507  EVT PtrVT = getPointerTy();
2508  SDLoc dl(Op);
2509  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2510  ARMConstantPoolValue *CPV =
2511    ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
2512                                  ARMPCLabelIndex, PCAdj);
2513  SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2514  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2515  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2516                               MachinePointerInfo::getConstantPool(),
2517                               false, false, false, 0);
2518  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2519  return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2520}
2521
2522SDValue
2523ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
2524  SDLoc dl(Op);
2525  SDValue Val = DAG.getConstant(0, MVT::i32);
2526  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
2527                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
2528                     Op.getOperand(1), Val);
2529}
2530
2531SDValue
2532ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
2533  SDLoc dl(Op);
2534  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
2535                     Op.getOperand(1), DAG.getConstant(0, MVT::i32));
2536}
2537
2538SDValue
2539ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
2540                                          const ARMSubtarget *Subtarget) const {
2541  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2542  SDLoc dl(Op);
2543  switch (IntNo) {
2544  default: return SDValue();    // Don't custom lower most intrinsics.
2545  case Intrinsic::arm_thread_pointer: {
2546    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2547    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2548  }
2549  case Intrinsic::eh_sjlj_lsda: {
2550    MachineFunction &MF = DAG.getMachineFunction();
2551    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2552    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2553    EVT PtrVT = getPointerTy();
2554    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2555    SDValue CPAddr;
2556    unsigned PCAdj = (RelocM != Reloc::PIC_)
2557      ? 0 : (Subtarget->isThumb() ? 4 : 8);
2558    ARMConstantPoolValue *CPV =
2559      ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
2560                                      ARMCP::CPLSDA, PCAdj);
2561    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2562    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2563    SDValue Result =
2564      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2565                  MachinePointerInfo::getConstantPool(),
2566                  false, false, false, 0);
2567
2568    if (RelocM == Reloc::PIC_) {
2569      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2570      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2571    }
2572    return Result;
2573  }
2574  case Intrinsic::arm_neon_vmulls:
2575  case Intrinsic::arm_neon_vmullu: {
2576    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
2577      ? ARMISD::VMULLs : ARMISD::VMULLu;
2578    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
2579                       Op.getOperand(1), Op.getOperand(2));
2580  }
2581  }
2582}
2583
2584static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
2585                                 const ARMSubtarget *Subtarget) {
2586  // FIXME: handle "fence singlethread" more efficiently.
2587  SDLoc dl(Op);
2588  if (!Subtarget->hasDataBarrier()) {
2589    // Some ARMv6 CPUs can support data barriers with an mcr instruction.
2590    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
2591    // here.
2592    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
2593           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
2594    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
2595                       DAG.getConstant(0, MVT::i32));
2596  }
2597
2598  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
2599  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
2600  unsigned Domain = ARM_MB::ISH;
2601  if (Subtarget->isSwift() && Ord == Release) {
2602    // Swift happens to implement ISHST barriers in a way that's compatible with
2603    // Release semantics but weaker than ISH so we'd be fools not to use
2604    // it. Beware: other processors probably don't!
2605    Domain = ARM_MB::ISHST;
2606  }
2607
2608  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
2609                     DAG.getConstant(Domain, MVT::i32));
2610}
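// Sketch of the mapping implemented above (mnemonics are illustrative):
//   fence (default)                -> dmb ish
//   fence release (Swift only)     -> dmb ishst
//   no DMB available (ARMv6)       -> mcr p15, 0, <zero>, c7, c10, 5
// The exact instruction forms are chosen later during instruction selection.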
2611
2612static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
2613                             const ARMSubtarget *Subtarget) {
2614  // ARM before v5TE and Thumb1 do not have preload instructions.
2615  if (!(Subtarget->isThumb2() ||
2616        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
2617    // Just preserve the chain.
2618    return Op.getOperand(0);
2619
2620  SDLoc dl(Op);
2621  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
2622  if (!isRead &&
2623      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
2624    // Only ARMv7 with the MP extension has PLDW; just preserve the chain.
2625    return Op.getOperand(0);
2626
2627  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2628  if (Subtarget->isThumb()) {
2629    // Invert the bits.
2630    isRead = ~isRead & 1;
2631    isData = ~isData & 1;
2632  }
2633
2634  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
2635                     Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
2636                     DAG.getConstant(isData, MVT::i32));
2637}
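// Illustrative mapping (not exhaustive): a read prefetch such as
//   llvm.prefetch(ptr, /*rw=*/0, locality, /*data=*/1)
// becomes a PLD of the address, a write prefetch needs ARMv7 with the MP
// extension to become PLDW (and is dropped otherwise), and an instruction
// prefetch on a suitable core may become PLI.  In Thumb mode the isRead and
// isData bits are inverted, as done above, to match the encoding expected by
// the PRELOAD node's patterns.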
2638
2639static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
2640  MachineFunction &MF = DAG.getMachineFunction();
2641  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
2642
2643  // vastart just stores the address of the VarArgsFrameIndex slot into the
2644  // memory location argument.
2645  SDLoc dl(Op);
2646  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2647  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
2648  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2649  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
2650                      MachinePointerInfo(SV), false, false, 0);
2651}
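// For example (a sketch for a hypothetical variadic function):
//   %ap = alloca i8*
//   %ap1 = bitcast i8** %ap to i8*
//   call void @llvm.va_start(i8* %ap1)
// is lowered to a single store of the VarArgsFrameIndex frame address into
// %ap; va_arg itself is expanded elsewhere.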
2652
2653SDValue
2654ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
2655                                        SDValue &Root, SelectionDAG &DAG,
2656                                        SDLoc dl) const {
2657  MachineFunction &MF = DAG.getMachineFunction();
2658  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2659
2660  const TargetRegisterClass *RC;
2661  if (AFI->isThumb1OnlyFunction())
2662    RC = &ARM::tGPRRegClass;
2663  else
2664    RC = &ARM::GPRRegClass;
2665
2666  // Transform the arguments stored in physical registers into virtual ones.
2667  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2668  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
2669
2670  SDValue ArgValue2;
2671  if (NextVA.isMemLoc()) {
2672    MachineFrameInfo *MFI = MF.getFrameInfo();
2673    int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);
2674
2675    // Create load node to retrieve arguments from the stack.
2676    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2677    ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
2678                            MachinePointerInfo::getFixedStack(FI),
2679                            false, false, false, 0);
2680  } else {
2681    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2682    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
2683  }
2684
2685  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
2686}
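// Illustrative case (the register assignment is hypothetical): an f64 formal
// argument split across r2 and r3 is rebuilt as
//   VMOVDRR dN, r2, r3
// and when only one register is available the second half is instead loaded
// from the fixed stack object created above before the VMOVDRR.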
2687
2688void
2689ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
2690                                  unsigned InRegsParamRecordIdx,
2691                                  unsigned ArgSize,
2692                                  unsigned &ArgRegsSize,
2693                                  unsigned &ArgRegsSaveSize)
2694  const {
2695  unsigned NumGPRs;
2696  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
2697    unsigned RBegin, REnd;
2698    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
2699    NumGPRs = REnd - RBegin;
2700  } else {
2701    unsigned int firstUnalloced;
2702    firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
2703                                                sizeof(GPRArgRegs) /
2704                                                sizeof(GPRArgRegs[0]));
2705    NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
2706  }
2707
2708  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
2709  ArgRegsSize = NumGPRs * 4;
2710
2711  // If parameter is split between stack and GPRs...
2712  if (NumGPRs && Align == 8 &&
2713      (ArgRegsSize < ArgSize ||
2714        InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) {
2715    // Pad the part of the parameter that is recovered from GPRs so that
2716    // its last byte ends at an address of the form K*8 - 1.  This is
2717    // needed because the remaining (stack) part of the parameter is
2718    // 8-byte aligned, and the "GPRs head" must attach to it without a
2719    // gap:
2720    // Stack:
2721    // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes...
2722    // [ [padding] [GPRs head] ] [        Tail passed via stack       ....
2723    //
2724    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2725    unsigned Padding =
2726        ((ArgRegsSize + AFI->getArgRegsSaveSize() + Align - 1) & ~(Align-1)) -
2727        (ArgRegsSize + AFI->getArgRegsSaveSize());
2728    ArgRegsSaveSize = ArgRegsSize + Padding;
2729  } else
2730    // There is no need to extend the register save area for byval
2731    // parameters that are passed entirely in GPRs.
2732    ArgRegsSaveSize = ArgRegsSize;
2733}
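// Worked example (illustrative numbers): a 16-byte byval argument whose first
// word lands in r3 gives NumGPRs = 1, so ArgRegsSize = 4.  With an 8-byte
// stack alignment and no previously saved registers the padding is
//   ((4 + 0 + 7) & ~7) - (4 + 0) = 4
// so ArgRegsSaveSize = 8 and the saved r3 ends on an 8-byte boundary, right
// below the on-stack tail of the argument.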
2734
2735// The remaining GPRs hold either the beginning of variable-argument
2736// data, or the beginning of an aggregate passed by value (usually
2737// byval).  Either way, we allocate stack slots adjacent to the data
2738// provided by our caller, and store the unallocated registers there.
2739// If this is a variadic function, the va_list pointer will begin with
2740// these values; otherwise, this reassembles a (byval) structure that
2741// was split between registers and memory.
2742// Return: the frame index that the registers were stored into.
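// For instance (a schematic layout, not tied to a particular test case): if a
// byval aggregate is passed partly in r2..r3 and partly on the stack, the two
// registers are stored into a fixed stack object placed immediately below the
// caller-provided tail, so the whole aggregate is addressable contiguously.
// For a variadic function the same store loop spills the unused GPRs so the
// va_list can walk register and stack arguments uniformly.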
2743int
2744ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
2745                                  SDLoc dl, SDValue &Chain,
2746                                  const Value *OrigArg,
2747                                  unsigned InRegsParamRecordIdx,
2748                                  unsigned OffsetFromOrigArg,
2749                                  unsigned ArgOffset,
2750                                  unsigned ArgSize,
2751                                  bool ForceMutable) const {
2752
2753  // Two use cases are currently possible:
2754  // Case #1. Non-varargs function with a byval parameter:
2755  //          set up the first unallocated register as the first byval
2756  //          register and claim all remaining registers
2757  //          (both actions are performed by the HandleByVal method).
2758  //          Then, here, we initialize the stack frame with
2759  //          "store-reg" instructions.
2760  // Case #2. Varargs function that contains no byval parameters:
2761  //          likewise, claim all remaining unallocated registers and
2762  //          initialize the stack frame.
2763
2764  MachineFunction &MF = DAG.getMachineFunction();
2765  MachineFrameInfo *MFI = MF.getFrameInfo();
2766  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2767  unsigned firstRegToSaveIndex, lastRegToSaveIndex;
2768  unsigned RBegin, REnd;
2769  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
2770    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
2771    firstRegToSaveIndex = RBegin - ARM::R0;
2772    lastRegToSaveIndex = REnd - ARM::R0;
2773  } else {
2774    firstRegToSaveIndex = CCInfo.getFirstUnallocated
2775      (GPRArgRegs, array_lengthof(GPRArgRegs));
2776    lastRegToSaveIndex = 4;
2777  }
2778
2779  unsigned ArgRegsSize, ArgRegsSaveSize;
2780  computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize,
2781                 ArgRegsSize, ArgRegsSaveSize);
2782
2783  // Store any byval regs to their spots on the stack so that they may be
2784  // loaded by dereferencing the result of the formal parameter pointer or
2785  // va_next.  Note: once the stack area for byval/varargs registers has
2786  // been initialized, it cannot be initialized again.
2787  if (ArgRegsSaveSize) {
2788
2789    unsigned Padding = ArgRegsSaveSize - ArgRegsSize;
2790
2791    if (Padding) {
2792      assert(AFI->getStoredByValParamsPadding() == 0 &&
2793             "The only parameter may be padded.");
2794      AFI->setStoredByValParamsPadding(Padding);
2795    }
2796
2797    int FrameIndex = MFI->CreateFixedObject(
2798                      ArgRegsSaveSize,
2799                      Padding + ArgOffset,
2800                      false);
2801    SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
2802
2803    SmallVector<SDValue, 4> MemOps;
2804    for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex;
2805         ++firstRegToSaveIndex, ++i) {
2806      const TargetRegisterClass *RC;
2807      if (AFI->isThumb1OnlyFunction())
2808        RC = &ARM::tGPRRegClass;
2809      else
2810        RC = &ARM::GPRRegClass;
2811
2812      unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC);
2813      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
2814      SDValue Store =
2815        DAG.getStore(Val.getValue(1), dl, Val, FIN,
2816                     MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i),
2817                     false, false, 0);
2818      MemOps.push_back(Store);
2819      FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
2820                        DAG.getConstant(4, getPointerTy()));
2821    }
2822
2823    AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize());
2824
2825    if (!MemOps.empty())
2826      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2827                          &MemOps[0], MemOps.size());
2828    return FrameIndex;
2829  } else
2830    // This will point to the next argument passed via stack.
2831    return MFI->CreateFixedObject(
2832        4, AFI->getStoredByValParamsPadding() + ArgOffset, !ForceMutable);
2833}
2834
2835// Set up the stack frame that the va_list pointer will start from.
2836void
2837ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
2838                                        SDLoc dl, SDValue &Chain,
2839                                        unsigned ArgOffset,
2840                                        bool ForceMutable) const {
2841  MachineFunction &MF = DAG.getMachineFunction();
2842  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2843
2844  // Try to store any remaining integer argument regs to their spots on
2845  // the stack so that they may be loaded by dereferencing the result of
2846  // va_next.
2847  // If there are no regs to be stored, just point at the address after
2848  // the last argument passed via the stack.
2849  int FrameIndex =
2850    StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(),
2851                   0, ArgOffset, 0, ForceMutable);
2852
2853  AFI->setVarArgsFrameIndex(FrameIndex);
2854}
2855
2856SDValue
2857ARMTargetLowering::LowerFormalArguments(SDValue Chain,
2858                                        CallingConv::ID CallConv, bool isVarArg,
2859                                        const SmallVectorImpl<ISD::InputArg>
2860                                          &Ins,
2861                                        SDLoc dl, SelectionDAG &DAG,
2862                                        SmallVectorImpl<SDValue> &InVals)
2863                                          const {
2864  MachineFunction &MF = DAG.getMachineFunction();
2865  MachineFrameInfo *MFI = MF.getFrameInfo();
2866
2867  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2868
2869  // Assign locations to all of the incoming arguments.
2870  SmallVector<CCValAssign, 16> ArgLocs;
2871  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
2872                    getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue);
2873  CCInfo.AnalyzeFormalArguments(Ins,
2874                                CCAssignFnForNode(CallConv, /* Return*/ false,
2875                                                  isVarArg));
2876
2877  SmallVector<SDValue, 16> ArgValues;
2878  int lastInsIndex = -1;
2879  SDValue ArgValue;
2880  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
2881  unsigned CurArgIdx = 0;
2882
2883  // Initially ArgRegsSaveSize is zero.  It is then increased each time a
2884  // byval parameter is encountered, and also when the function takes
2885  // variable arguments.
2886  AFI->setArgRegsSaveSize(0);
2887
2888  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2889    CCValAssign &VA = ArgLocs[i];
2890    std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
2891    CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
2892    // Arguments stored in registers.
2893    if (VA.isRegLoc()) {
2894      EVT RegVT = VA.getLocVT();
2895
2896      if (VA.needsCustom()) {
2897        // f64 and vector types are split up into multiple registers or
2898        // combinations of registers and stack slots.
2899        if (VA.getLocVT() == MVT::v2f64) {
2900          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
2901                                                   Chain, DAG, dl);
2902          VA = ArgLocs[++i]; // skip ahead to next loc
2903          SDValue ArgValue2;
2904          if (VA.isMemLoc()) {
2905            int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
2906            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2907            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
2908                                    MachinePointerInfo::getFixedStack(FI),
2909                                    false, false, false, 0);
2910          } else {
2911            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
2912                                             Chain, DAG, dl);
2913          }
2914          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2915          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
2916                                 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
2917          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
2918                                 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
2919        } else
2920          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
2921
2922      } else {
2923        const TargetRegisterClass *RC;
2924
2925        if (RegVT == MVT::f32)
2926          RC = &ARM::SPRRegClass;
2927        else if (RegVT == MVT::f64)
2928          RC = &ARM::DPRRegClass;
2929        else if (RegVT == MVT::v2f64)
2930          RC = &ARM::QPRRegClass;
2931        else if (RegVT == MVT::i32)
2932          RC = AFI->isThumb1OnlyFunction() ?
2933            (const TargetRegisterClass*)&ARM::tGPRRegClass :
2934            (const TargetRegisterClass*)&ARM::GPRRegClass;
2935        else
2936          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
2937
2938        // Transform the arguments in physical registers into virtual ones.
2939        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2940        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2941      }
2942
2943      // If this is an 8 or 16-bit value, it is really passed promoted
2944      // to 32 bits.  Insert an assert[sz]ext to capture this, then
2945      // truncate to the right size.
2946      switch (VA.getLocInfo()) {
2947      default: llvm_unreachable("Unknown loc info!");
2948      case CCValAssign::Full: break;
2949      case CCValAssign::BCvt:
2950        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2951        break;
2952      case CCValAssign::SExt:
2953        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2954                               DAG.getValueType(VA.getValVT()));
2955        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2956        break;
2957      case CCValAssign::ZExt:
2958        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2959                               DAG.getValueType(VA.getValVT()));
2960        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2961        break;
2962      }
2963
2964      InVals.push_back(ArgValue);
2965
2966    } else { // VA.isRegLoc()
2967
2968      // sanity check
2969      assert(VA.isMemLoc());
2970      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
2971
2972      int index = ArgLocs[i].getValNo();
2973
2974      // Some Ins[] entries become multiple ArgLoc[] entries.
2975      // Process them only once.
2976      if (index != lastInsIndex)
2977        {
2978          ISD::ArgFlagsTy Flags = Ins[index].Flags;
2979          // FIXME: For now, all byval parameter objects are marked mutable.
2980          // This can be changed with more analysis.
2981          // In case of tail call optimization mark all arguments mutable.
2982          // Since they could be overwritten by lowering of arguments in case of
2983          // a tail call.
2984          if (Flags.isByVal()) {
2985            unsigned CurByValIndex = CCInfo.getInRegsParamsProceed();
2986            int FrameIndex = StoreByValRegs(
2987                CCInfo, DAG, dl, Chain, CurOrigArg,
2988                CurByValIndex,
2989                Ins[VA.getValNo()].PartOffset,
2990                VA.getLocMemOffset(),
2991                Flags.getByValSize(),
2992                true /*force mutable frames*/);
2993            InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy()));
2994            CCInfo.nextInRegsParam();
2995          } else {
2996            unsigned FIOffset = VA.getLocMemOffset() +
2997                                AFI->getStoredByValParamsPadding();
2998            int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
2999                                            FIOffset, true);
3000
3001            // Create load nodes to retrieve arguments from the stack.
3002            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
3003            InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
3004                                         MachinePointerInfo::getFixedStack(FI),
3005                                         false, false, false, 0));
3006          }
3007          lastInsIndex = index;
3008        }
3009    }
3010  }
3011
3012  // varargs
3013  if (isVarArg)
3014    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
3015                         CCInfo.getNextStackOffset());
3016
3017  return Chain;
3018}
3019
3020/// isFloatingPointZero - Return true if this is +0.0.
3021static bool isFloatingPointZero(SDValue Op) {
3022  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
3023    return CFP->getValueAPF().isPosZero();
3024  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
3025    // Maybe this has already been legalized into the constant pool?
3026    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
3027      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
3028      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
3029        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
3030          return CFP->getValueAPF().isPosZero();
3031    }
3032  }
3033  return false;
3034}
3035
3036/// Returns the appropriate ARM CMP (cmp) and the corresponding condition
3037/// code for the given operands.
3038SDValue
3039ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3040                             SDValue &ARMcc, SelectionDAG &DAG,
3041                             SDLoc dl) const {
3042  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3043    unsigned C = RHSC->getZExtValue();
3044    if (!isLegalICmpImmediate(C)) {
3045      // Constant does not fit, try adjusting it by one?
3046      switch (CC) {
3047      default: break;
3048      case ISD::SETLT:
3049      case ISD::SETGE:
3050        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
3051          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3052          RHS = DAG.getConstant(C-1, MVT::i32);
3053        }
3054        break;
3055      case ISD::SETULT:
3056      case ISD::SETUGE:
3057        if (C != 0 && isLegalICmpImmediate(C-1)) {
3058          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3059          RHS = DAG.getConstant(C-1, MVT::i32);
3060        }
3061        break;
3062      case ISD::SETLE:
3063      case ISD::SETGT:
3064        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
3065          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3066          RHS = DAG.getConstant(C+1, MVT::i32);
3067        }
3068        break;
3069      case ISD::SETULE:
3070      case ISD::SETUGT:
3071        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
3072          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3073          RHS = DAG.getConstant(C+1, MVT::i32);
3074        }
3075        break;
3076      }
3077    }
3078  }
3079
3080  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3081  ARMISD::NodeType CompareType;
3082  switch (CondCode) {
3083  default:
3084    CompareType = ARMISD::CMP;
3085    break;
3086  case ARMCC::EQ:
3087  case ARMCC::NE:
3088    // Uses only Z Flag
3089    CompareType = ARMISD::CMPZ;
3090    break;
3091  }
3092  ARMcc = DAG.getConstant(CondCode, MVT::i32);
3093  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
3094}
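// Worked example of the immediate adjustment above (values chosen for
// illustration): for "x < 0x1001" the constant 0x1001 is not a legal ARM
// modified immediate, but 0x1000 is, so the comparison is rewritten as
// "x <= 0x1000" (SETLT -> SETLE with C-1) and a single CMP suffices.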
3095
3096/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
3097SDValue
3098ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
3099                             SDLoc dl) const {
3100  SDValue Cmp;
3101  if (!isFloatingPointZero(RHS))
3102    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
3103  else
3104    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
3105  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
3106}
3107
3108/// duplicateCmp - Glue values can have only one use, so this function
3109/// duplicates a comparison node.
3110SDValue
3111ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
3112  unsigned Opc = Cmp.getOpcode();
3113  SDLoc DL(Cmp);
3114  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
3115    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3116
3117  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
3118  Cmp = Cmp.getOperand(0);
3119  Opc = Cmp.getOpcode();
3120  if (Opc == ARMISD::CMPFP)
3121    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
3122  else {
3123    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
3124    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
3125  }
3126  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
3127}
3128
3129SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
3130  SDValue Cond = Op.getOperand(0);
3131  SDValue SelectTrue = Op.getOperand(1);
3132  SDValue SelectFalse = Op.getOperand(2);
3133  SDLoc dl(Op);
3134
3135  // Convert:
3136  //
3137  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
3138  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
3139  //
3140  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
3141    const ConstantSDNode *CMOVTrue =
3142      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
3143    const ConstantSDNode *CMOVFalse =
3144      dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3145
3146    if (CMOVTrue && CMOVFalse) {
3147      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
3148      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
3149
3150      SDValue True;
3151      SDValue False;
3152      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
3153        True = SelectTrue;
3154        False = SelectFalse;
3155      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
3156        True = SelectFalse;
3157        False = SelectTrue;
3158      }
3159
3160      if (True.getNode() && False.getNode()) {
3161        EVT VT = Op.getValueType();
3162        SDValue ARMcc = Cond.getOperand(2);
3163        SDValue CCR = Cond.getOperand(3);
3164        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
3165        assert(True.getValueType() == VT);
3166        return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
3167      }
3168    }
3169  }
3170
3171  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
3172  // undefined bits before doing a full-word comparison with zero.
3173  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
3174                     DAG.getConstant(1, Cond.getValueType()));
3175
3176  return DAG.getSelectCC(dl, Cond,
3177                         DAG.getConstant(0, Cond.getValueType()),
3178                         SelectTrue, SelectFalse, ISD::SETNE);
3179}
3180
3181SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
3182  EVT VT = Op.getValueType();
3183  SDValue LHS = Op.getOperand(0);
3184  SDValue RHS = Op.getOperand(1);
3185  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
3186  SDValue TrueVal = Op.getOperand(2);
3187  SDValue FalseVal = Op.getOperand(3);
3188  SDLoc dl(Op);
3189
3190  if (LHS.getValueType() == MVT::i32) {
3191    SDValue ARMcc;
3192    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3193    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3194    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
3195  }
3196
3197  ARMCC::CondCodes CondCode, CondCode2;
3198  FPCCToARMCC(CC, CondCode, CondCode2);
3199
3200  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
3201  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
3202  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3203  SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
3204                               ARMcc, CCR, Cmp);
3205  if (CondCode2 != ARMCC::AL) {
3206    SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
3207    // FIXME: Needs another CMP because flag can have but one use.
3208    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
3209    Result = DAG.getNode(ARMISD::CMOV, dl, VT,
3210                         Result, TrueVal, ARMcc2, CCR, Cmp2);
3211  }
3212  return Result;
3213}
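// Example of the two-condition case (the condition codes are as produced by
// FPCCToARMCC, shown here informally): an f32 SETUEQ select needs both EQ and
// VS, so the first CMOV handles EQ and the second CMOV above re-tests VS with
// a fresh comparison, since a glued flag value can only be used once.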
3214
3215/// canChangeToInt - Given an fp compare operand, return true if it is
3216/// suitable for morphing into an integer compare sequence.
3217static bool canChangeToInt(SDValue Op, bool &SeenZero,
3218                           const ARMSubtarget *Subtarget) {
3219  SDNode *N = Op.getNode();
3220  if (!N->hasOneUse())
3221    // Otherwise it requires moving the value from fp to integer registers.
3222    return false;
3223  if (!N->getNumValues())
3224    return false;
3225  EVT VT = Op.getValueType();
3226  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
3227    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
3228    // vmrs are very slow, e.g. cortex-a8.
3229    return false;
3230
3231  if (isFloatingPointZero(Op)) {
3232    SeenZero = true;
3233    return true;
3234  }
3235  return ISD::isNormalLoad(N);
3236}
3237
3238static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
3239  if (isFloatingPointZero(Op))
3240    return DAG.getConstant(0, MVT::i32);
3241
3242  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
3243    return DAG.getLoad(MVT::i32, SDLoc(Op),
3244                       Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
3245                       Ld->isVolatile(), Ld->isNonTemporal(),
3246                       Ld->isInvariant(), Ld->getAlignment());
3247
3248  llvm_unreachable("Unknown VFP cmp argument!");
3249}
3250
3251static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
3252                           SDValue &RetVal1, SDValue &RetVal2) {
3253  if (isFloatingPointZero(Op)) {
3254    RetVal1 = DAG.getConstant(0, MVT::i32);
3255    RetVal2 = DAG.getConstant(0, MVT::i32);
3256    return;
3257  }
3258
3259  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
3260    SDValue Ptr = Ld->getBasePtr();
3261    RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op),
3262                          Ld->getChain(), Ptr,
3263                          Ld->getPointerInfo(),
3264                          Ld->isVolatile(), Ld->isNonTemporal(),
3265                          Ld->isInvariant(), Ld->getAlignment());
3266
3267    EVT PtrType = Ptr.getValueType();
3268    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
3269    SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op),
3270                                 PtrType, Ptr, DAG.getConstant(4, PtrType));
3271    RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op),
3272                          Ld->getChain(), NewPtr,
3273                          Ld->getPointerInfo().getWithOffset(4),
3274                          Ld->isVolatile(), Ld->isNonTemporal(),
3275                          Ld->isInvariant(), NewAlign);
3276    return;
3277  }
3278
3279  llvm_unreachable("Unknown VFP cmp argument!");
3280}
3281
3282/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
3283/// f32 and even f64 comparisons to integer ones.
3284SDValue
3285ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
3286  SDValue Chain = Op.getOperand(0);
3287  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3288  SDValue LHS = Op.getOperand(2);
3289  SDValue RHS = Op.getOperand(3);
3290  SDValue Dest = Op.getOperand(4);
3291  SDLoc dl(Op);
3292
3293  bool LHSSeenZero = false;
3294  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
3295  bool RHSSeenZero = false;
3296  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
3297  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
3298    // If unsafe fp math optimization is enabled and there are no other uses of
3299    // the CMP operands, and the condition code is EQ or NE, we can optimize it
3300    // to an integer comparison.
3301    if (CC == ISD::SETOEQ)
3302      CC = ISD::SETEQ;
3303    else if (CC == ISD::SETUNE)
3304      CC = ISD::SETNE;
3305
3306    SDValue Mask = DAG.getConstant(0x7fffffff, MVT::i32);
3307    SDValue ARMcc;
3308    if (LHS.getValueType() == MVT::f32) {
3309      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
3310                        bitcastf32Toi32(LHS, DAG), Mask);
3311      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
3312                        bitcastf32Toi32(RHS, DAG), Mask);
3313      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3314      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3315      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
3316                         Chain, Dest, ARMcc, CCR, Cmp);
3317    }
3318
3319    SDValue LHS1, LHS2;
3320    SDValue RHS1, RHS2;
3321    expandf64Toi32(LHS, DAG, LHS1, LHS2);
3322    expandf64Toi32(RHS, DAG, RHS1, RHS2);
3323    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
3324    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
3325    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3326    ARMcc = DAG.getConstant(CondCode, MVT::i32);
3327    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
3328    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
3329    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7);
3330  }
3331
3332  return SDValue();
3333}
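// Sketch of the transformation (assuming unsafe-fp-math and a zero operand):
// a branch on "(float)x == 0.0f" can be rewritten as an integer test of the
// bit pattern with the sign bit cleared, roughly
//   br_cc (and (bitcast x to i32), 0x7fffffff), 0, eq
// which avoids the vcmpe + vmrs sequence entirely; the f64 case compares the
// two halves with the BCC_i64 node built above.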
3334
3335SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
3336  SDValue Chain = Op.getOperand(0);
3337  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3338  SDValue LHS = Op.getOperand(2);
3339  SDValue RHS = Op.getOperand(3);
3340  SDValue Dest = Op.getOperand(4);
3341  SDLoc dl(Op);
3342
3343  if (LHS.getValueType() == MVT::i32) {
3344    SDValue ARMcc;
3345    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3346    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3347    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
3348                       Chain, Dest, ARMcc, CCR, Cmp);
3349  }
3350
3351  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
3352
3353  if (getTargetMachine().Options.UnsafeFPMath &&
3354      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
3355       CC == ISD::SETNE || CC == ISD::SETUNE)) {
3356    SDValue Result = OptimizeVFPBrcond(Op, DAG);
3357    if (Result.getNode())
3358      return Result;
3359  }
3360
3361  ARMCC::CondCodes CondCode, CondCode2;
3362  FPCCToARMCC(CC, CondCode, CondCode2);
3363
3364  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
3365  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
3366  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3367  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
3368  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
3369  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
3370  if (CondCode2 != ARMCC::AL) {
3371    ARMcc = DAG.getConstant(CondCode2, MVT::i32);
3372    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
3373    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
3374  }
3375  return Res;
3376}
3377
3378SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3379  SDValue Chain = Op.getOperand(0);
3380  SDValue Table = Op.getOperand(1);
3381  SDValue Index = Op.getOperand(2);
3382  SDLoc dl(Op);
3383
3384  EVT PTy = getPointerTy();
3385  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
3386  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3387  SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
3388  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
3389  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
3390  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
3391  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
3392  if (Subtarget->isThumb2()) {
3393    // Thumb2 uses a two-level jump. That is, it jumps into the jump table
3394    // which does another jump to the destination. This also makes it easier
3395    // to translate it to TBB / TBH later.
3396    // FIXME: This might not work if the function is extremely large.
3397    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
3398                       Addr, Op.getOperand(2), JTI, UId);
3399  }
3400  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
3401    Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
3402                       MachinePointerInfo::getJumpTable(),
3403                       false, false, false, 0);
3404    Chain = Addr.getValue(1);
3405    Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
3406    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
3407  } else {
3408    Addr = DAG.getLoad(PTy, dl, Chain, Addr,
3409                       MachinePointerInfo::getJumpTable(),
3410                       false, false, false, 0);
3411    Chain = Addr.getValue(1);
3412    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
3413  }
3414}
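// Rough shape of the generated code (schematic): in Thumb2 the BR2_JT node is
// later turned into a TBB/TBH-style two-level jump through the table, while
// in ARM PIC mode the sequence is approximately
//   load the table entry at (table + index*4)
//   add the table base to the (table-relative) entry
//   branch to the result
// and non-PIC simply loads the absolute destination and branches to it.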
3415
3416static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
3417  EVT VT = Op.getValueType();
3418  SDLoc dl(Op);
3419
3420  if (Op.getValueType().getVectorElementType() == MVT::i32) {
3421    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
3422      return Op;
3423    return DAG.UnrollVectorOp(Op.getNode());
3424  }
3425
3426  assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
3427         "Invalid type for custom lowering!");
3428  if (VT != MVT::v4i16)
3429    return DAG.UnrollVectorOp(Op.getNode());
3430
3431  Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
3432  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
3433}
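// For example (a sketch): "fptosi <4 x float> %v to <4 x i16>" is lowered as
// a v4f32 -> v4i32 conversion (a single VCVT) followed by an ISD::TRUNCATE to
// v4i16, which NEON can handle with a narrowing move; other element-type
// combinations fall back to UnrollVectorOp above.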
3434
3435static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
3436  EVT VT = Op.getValueType();
3437  if (VT.isVector())
3438    return LowerVectorFP_TO_INT(Op, DAG);
3439
3440  SDLoc dl(Op);
3441  unsigned Opc;
3442
3443  switch (Op.getOpcode()) {
3444  default: llvm_unreachable("Invalid opcode!");
3445  case ISD::FP_TO_SINT:
3446    Opc = ARMISD::FTOSI;
3447    break;
3448  case ISD::FP_TO_UINT:
3449    Opc = ARMISD::FTOUI;
3450    break;
3451  }
3452  Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
3453  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
3454}
3455
3456static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
3457  EVT VT = Op.getValueType();
3458  SDLoc dl(Op);
3459
3460  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
3461    if (VT.getVectorElementType() == MVT::f32)
3462      return Op;
3463    return DAG.UnrollVectorOp(Op.getNode());
3464  }
3465
3466  assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
3467         "Invalid type for custom lowering!");
3468  if (VT != MVT::v4f32)
3469    return DAG.UnrollVectorOp(Op.getNode());
3470
3471  unsigned CastOpc;
3472  unsigned Opc;
3473  switch (Op.getOpcode()) {
3474  default: llvm_unreachable("Invalid opcode!");
3475  case ISD::SINT_TO_FP:
3476    CastOpc = ISD::SIGN_EXTEND;
3477    Opc = ISD::SINT_TO_FP;
3478    break;
3479  case ISD::UINT_TO_FP:
3480    CastOpc = ISD::ZERO_EXTEND;
3481    Opc = ISD::UINT_TO_FP;
3482    break;
3483  }
3484
3485  Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
3486  return DAG.getNode(Opc, dl, VT, Op);
3487}
3488
3489static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
3490  EVT VT = Op.getValueType();
3491  if (VT.isVector())
3492    return LowerVectorINT_TO_FP(Op, DAG);
3493
3494  SDLoc dl(Op);
3495  unsigned Opc;
3496
3497  switch (Op.getOpcode()) {
3498  default: llvm_unreachable("Invalid opcode!");
3499  case ISD::SINT_TO_FP:
3500    Opc = ARMISD::SITOF;
3501    break;
3502  case ISD::UINT_TO_FP:
3503    Opc = ARMISD::UITOF;
3504    break;
3505  }
3506
3507  Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
3508  return DAG.getNode(Opc, dl, VT, Op);
3509}
3510
3511SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
3512  // Implement fcopysign with a fabs and a conditional fneg.
3513  SDValue Tmp0 = Op.getOperand(0);
3514  SDValue Tmp1 = Op.getOperand(1);
3515  SDLoc dl(Op);
3516  EVT VT = Op.getValueType();
3517  EVT SrcVT = Tmp1.getValueType();
3518  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
3519    Tmp0.getOpcode() == ARMISD::VMOVDRR;
3520  bool UseNEON = !InGPR && Subtarget->hasNEON();
3521
3522  if (UseNEON) {
3523    // Use VBSL to copy the sign bit.
3524    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
3525    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
3526                               DAG.getTargetConstant(EncodedVal, MVT::i32));
3527    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
3528    if (VT == MVT::f64)
3529      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
3530                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
3531                         DAG.getConstant(32, MVT::i32));
3532    else /*if (VT == MVT::f32)*/
3533      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
3534    if (SrcVT == MVT::f32) {
3535      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
3536      if (VT == MVT::f64)
3537        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
3538                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
3539                           DAG.getConstant(32, MVT::i32));
3540    } else if (VT == MVT::f32)
3541      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
3542                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
3543                         DAG.getConstant(32, MVT::i32));
3544    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
3545    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
3546
3547    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
3548                                            MVT::i32);
3549    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
3550    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
3551                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
3552
3553    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
3554                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
3555                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
3556    if (VT == MVT::f32) {
3557      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
3558      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
3559                        DAG.getConstant(0, MVT::i32));
3560    } else {
3561      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
3562    }
3563
3564    return Res;
3565  }
3566
3567  // Bitcast operand 1 to i32.
3568  if (SrcVT == MVT::f64)
3569    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
3570                       &Tmp1, 1).getValue(1);
3571  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
3572
3573  // Or in the signbit with integer operations.
3574  SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
3575  SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
3576  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
3577  if (VT == MVT::f32) {
3578    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
3579                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
3580    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
3581                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
3582  }
3583
3584  // f64: Or the high part with signbit and then combine two parts.
3585  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
3586                     &Tmp0, 1);
3587  SDValue Lo = Tmp0.getValue(0);
3588  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
3589  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
3590  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
3591}
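// Worked integer-path example (f32, schematic): copysign(a, b) becomes
//   t1 = bits(b) & 0x80000000      // sign of b
//   t0 = bits(a) & 0x7fffffff      // magnitude of a
//   result = bitcast (t0 | t1) to f32
// which is the Mask1/Mask2 sequence above; the NEON path instead builds a
// sign-bit mask with VMOVIMM and merges the operands with AND/OR (a
// VBSL-style select).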
3592
3593SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
3594  MachineFunction &MF = DAG.getMachineFunction();
3595  MachineFrameInfo *MFI = MF.getFrameInfo();
3596  MFI->setReturnAddressIsTaken(true);
3597
3598  EVT VT = Op.getValueType();
3599  SDLoc dl(Op);
3600  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3601  if (Depth) {
3602    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
3603    SDValue Offset = DAG.getConstant(4, MVT::i32);
3604    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
3605                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
3606                       MachinePointerInfo(), false, false, false, 0);
3607  }
3608
3609  // Return LR, which contains the return address. Mark it an implicit live-in.
3610  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3611  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
3612}
3613
3614SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
3615  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3616  MFI->setFrameAddressIsTaken(true);
3617
3618  EVT VT = Op.getValueType();
3619  SDLoc dl(Op);  // FIXME probably not meaningful
3620  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3621  unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
3622    ? ARM::R7 : ARM::R11;
3623  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
3624  while (Depth--)
3625    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
3626                            MachinePointerInfo(),
3627                            false, false, false, 0);
3628  return FrameAddr;
3629}
3630
3631/// Custom-expand long vector extensions, where size(DestVec) > 2*size(SrcVec)
3632/// and size(DestVec) > 128 bits.
3633/// This is achieved by performing one extension of the SrcVec, splitting the
3634/// result, extending these parts, and then concatenating them into the
3635/// destination.
3636static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
3637  SDValue Op = N->getOperand(0);
3638  EVT SrcVT = Op.getValueType();
3639  EVT DestVT = N->getValueType(0);
3640
3641  assert(DestVT.getSizeInBits() > 128 &&
3642         "Custom sext/zext expansion needs >128-bit vector.");
3643  // If this is a normal length extension, use the default expansion.
3644  if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
3645      SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
3646    return SDValue();
3647
3648  SDLoc dl(N);
3649  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
3650  unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
3651  unsigned NumElts = SrcVT.getVectorNumElements();
3652  LLVMContext &Ctx = *DAG.getContext();
3653  SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
3654
3655  EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
3656                               NumElts);
3657  EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
3658                                 NumElts/2);
3659  EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
3660                               NumElts/2);
3661
3662  Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
3663  SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
3664                        DAG.getIntPtrConstant(0));
3665  SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
3666                        DAG.getIntPtrConstant(NumElts/2));
3667  ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
3668  ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
3669  return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
3670}
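// For example (a sketch): "sext <8 x i8> %v to <8 x i32>" is expanded as
//   mid    = sext v8i8 -> v8i16
//   lo, hi = split mid into two v4i16 halves
//   lo'    = sext v4i16 -> v4i32,  hi' = sext v4i16 -> v4i32
//   result = concat_vectors lo', hi'
// matching the MidVT/SplitVT/ExtVT steps above.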
3671
3672/// ExpandBITCAST - If the target supports VFP, this function is called to
3673/// expand a bit convert where either the source or destination type is i64 to
3674/// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
3675/// operand type is illegal (e.g., v2f32 for a target that doesn't support
3676/// vectors), since the legalizer won't know what to do with that.
3677static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
3678  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3679  SDLoc dl(N);
3680  SDValue Op = N->getOperand(0);
3681
3682  // This function is only supposed to be called for i64 types, either as the
3683  // source or destination of the bit convert.
3684  EVT SrcVT = Op.getValueType();
3685  EVT DstVT = N->getValueType(0);
3686  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
3687         "ExpandBITCAST called for non-i64 type");
3688
3689  // Turn i64->f64 into VMOVDRR.
3690  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
3691    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
3692                             DAG.getConstant(0, MVT::i32));
3693    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
3694                             DAG.getConstant(1, MVT::i32));
3695    return DAG.getNode(ISD::BITCAST, dl, DstVT,
3696                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
3697  }
3698
3699  // Turn f64->i64 into VMOVRRD.
3700  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
3701    SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
3702                              DAG.getVTList(MVT::i32, MVT::i32), &Op, 1);
3703    // Merge the pieces into a single i64 value.
3704    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
3705  }
3706
3707  return SDValue();
3708}
3709
3710/// getZeroVector - Returns a vector of specified type with all zero elements.
3711/// Zero vectors are used to represent vector negation and in those cases
3712/// will be implemented with the NEON VNEG instruction.  However, VNEG does
3713/// not support i64 elements, so sometimes the zero vectors will need to be
3714/// explicitly constructed.  Regardless, use a canonical VMOV to create the
3715/// zero vector.
3716static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
3717  assert(VT.isVector() && "Expected a vector type");
3718  // The canonical modified immediate encoding of a zero vector is....0!
3719  SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
3720  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
3721  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
3722  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
3723}
3724
3725/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
3726/// i32 values and take a 2 x i32 value to shift plus a shift amount.
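/// Roughly, for 32-bit parts and shift amount Amt:
///   Hi = ShOpHi >> Amt            (arithmetic for SRA_PARTS, logical for SRL)
///   Lo = Amt >= 32 ? ShOpHi >> (Amt - 32)
///                  : (ShOpLo >>u Amt) | (ShOpHi << (32 - Amt))
/// with the Lo selection done via an ARMISD::CMOV on (Amt - 32) >= 0.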
3727SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
3728                                                SelectionDAG &DAG) const {
3729  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3730  EVT VT = Op.getValueType();
3731  unsigned VTBits = VT.getSizeInBits();
3732  SDLoc dl(Op);
3733  SDValue ShOpLo = Op.getOperand(0);
3734  SDValue ShOpHi = Op.getOperand(1);
3735  SDValue ShAmt  = Op.getOperand(2);
3736  SDValue ARMcc;
3737  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
3738
3739  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
3740
3741  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
3742                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
3743  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
3744  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
3745                                   DAG.getConstant(VTBits, MVT::i32));
3746  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
3747  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3748  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
3749
3750  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3751  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
3752                          ARMcc, DAG, dl);
3753  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
3754  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
3755                           CCR, Cmp);
3756
3757  SDValue Ops[2] = { Lo, Hi };
3758  return DAG.getMergeValues(Ops, 2, dl);
3759}
3760
3761/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
3762/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
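/// Roughly, for 32-bit parts and shift amount Amt:
///   Lo = ShOpLo << Amt
///   Hi = Amt >= 32 ? ShOpLo << (Amt - 32)
///                  : (ShOpHi << Amt) | (ShOpLo >>u (32 - Amt))
/// with the Hi selection done via an ARMISD::CMOV on (Amt - 32) >= 0.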
3763SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
3764                                               SelectionDAG &DAG) const {
3765  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3766  EVT VT = Op.getValueType();
3767  unsigned VTBits = VT.getSizeInBits();
3768  SDLoc dl(Op);
3769  SDValue ShOpLo = Op.getOperand(0);
3770  SDValue ShOpHi = Op.getOperand(1);
3771  SDValue ShAmt  = Op.getOperand(2);
3772  SDValue ARMcc;
3773
3774  assert(Op.getOpcode() == ISD::SHL_PARTS);
3775  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
3776                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
3777  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
3778  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
3779                                   DAG.getConstant(VTBits, MVT::i32));
3780  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
3781  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
3782
3783  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3784  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3785  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
3786                          ARMcc, DAG, dl);
3787  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
3788  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
3789                           CCR, Cmp);
3790
3791  SDValue Ops[2] = { Lo, Hi };
3792  return DAG.getMergeValues(Ops, 2, dl);
3793}
3794
3795SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3796                                            SelectionDAG &DAG) const {
3797  // The rounding mode is in bits 23:22 of the FPSCR.
3798  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
3799  // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
3800  // so that the shift and the AND get folded into a bitfield extract.
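  // For reference, the FPSCR RMode encoding is 0 = nearest, 1 = toward +inf,
  // 2 = toward -inf, 3 = toward zero, while FLT_ROUNDS uses 0 = toward zero,
  // 1 = nearest, 2 = toward +inf, 3 = toward -inf; adding 1 (mod 4) converts
  // between the two, which is what the +(1 << 22) before the extract achieves.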
3801  SDLoc dl(Op);
3802  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
3803                              DAG.getConstant(Intrinsic::arm_get_fpscr,
3804                                              MVT::i32));
3805  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
3806                                  DAG.getConstant(1U << 22, MVT::i32));
3807  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
3808                              DAG.getConstant(22, MVT::i32));
3809  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
3810                     DAG.getConstant(3, MVT::i32));
3811}
3812
3813static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
3814                         const ARMSubtarget *ST) {
3815  EVT VT = N->getValueType(0);
3816  SDLoc dl(N);
3817
3818  if (!ST->hasV6T2Ops())
3819    return SDValue();
3820
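  // cttz(x) == ctlz(bit-reverse(x)), and RBIT (available from ARMv6T2 on)
  // gives us the bit reversal directly.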
3821  SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
3822  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
3823}
3824
3825/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
3826/// for each 16-bit element from operand, repeated.  The basic idea is to
3827/// leverage vcnt to get the 8-bit counts, gather and add the results.
3828///
3829/// Trace for v4i16:
3830/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
3831/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
3832/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
3833/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
3834///            [b0 b1 b2 b3 b4 b5 b6 b7]
3835///           +[b1 b0 b3 b2 b5 b4 b7 b6]
3836/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
3837/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
3838static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
3839  EVT VT = N->getValueType(0);
3840  SDLoc DL(N);
3841
3842  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
3843  SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
3844  SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
3845  SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
3846  SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
3847  return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
3848}
3849
3850/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
3851/// bit-count for each 16-bit element from the operand.  We need slightly
3852/// different sequencing for v4i16 and v8i16 to stay within NEON's available
3853/// 64/128-bit registers.
3854///
3855/// Trace for v4i16:
3856/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
3857/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
3858/// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
3859/// v4i16:Extracted = [k0    k1    k2    k3    ]
3860static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
3861  EVT VT = N->getValueType(0);
3862  SDLoc DL(N);
3863
3864  SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
3865  if (VT.is64BitVector()) {
3866    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
3867    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
3868                       DAG.getIntPtrConstant(0));
3869  } else {
3870    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
3871                                    BitCounts, DAG.getIntPtrConstant(0));
3872    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
3873  }
3874}
3875
3876/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
3877/// bit-count for each 32-bit element from the operand.  The idea here is
3878/// to split the vector into 16-bit elements, leverage the 16-bit count
3879/// routine, and then combine the results.
3880///
3881/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
3882/// input    = [v0    v1    ] (vi: 32-bit elements)
3883/// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
3884/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
3885/// vrev: N0 = [k1 k0 k3 k2 ]
3886///            [k0 k1 k2 k3 ]
3887///       N1 =+[k1 k0 k3 k2 ]
3888///            [k0 k2 k1 k3 ]
3889///       N2 =+[k1 k3 k0 k2 ]
3890///            [k0    k2    k1    k3    ]
3891/// Extended =+[k1    k3    k0    k2    ]
3892///            [k0    k2    ]
3893/// Extracted=+[k1    k3    ]
3894///
3895static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
3896  EVT VT = N->getValueType(0);
3897  SDLoc DL(N);
3898
3899  EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
3900
3901  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
3902  SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
3903  SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
3904  SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
3905  SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
3906
3907  if (VT.is64BitVector()) {
3908    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
3909    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
3910                       DAG.getIntPtrConstant(0));
3911  } else {
3912    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
3913                                    DAG.getIntPtrConstant(0));
3914    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
3915  }
3916}
3917
3918static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
3919                          const ARMSubtarget *ST) {
3920  EVT VT = N->getValueType(0);
3921
3922  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
3923  assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
3924          VT == MVT::v4i16 || VT == MVT::v8i16) &&
3925         "Unexpected type for custom ctpop lowering");
3926
3927  if (VT.getVectorElementType() == MVT::i32)
3928    return lowerCTPOP32BitElements(N, DAG);
3929  else
3930    return lowerCTPOP16BitElements(N, DAG);
3931}
3932
3933static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
3934                          const ARMSubtarget *ST) {
3935  EVT VT = N->getValueType(0);
3936  SDLoc dl(N);
3937
3938  if (!VT.isVector())
3939    return SDValue();
3940
3941  // Lower vector shifts on NEON to use VSHL.
3942  assert(ST->hasNEON() && "unexpected vector shift");
3943
3944  // Left shifts translate directly to the vshiftu intrinsic.
3945  if (N->getOpcode() == ISD::SHL)
3946    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
3947                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
3948                       N->getOperand(0), N->getOperand(1));
3949
3950  assert((N->getOpcode() == ISD::SRA ||
3951          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
3952
3953  // NEON uses the same intrinsics for both left and right shifts.  For
3954  // right shifts, the shift amounts are negative, so negate the vector of
3955  // shift amounts.
3956  EVT ShiftVT = N->getOperand(1).getValueType();
3957  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
3958                                     getZeroVector(ShiftVT, DAG, dl),
3959                                     N->getOperand(1));
3960  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
3961                             Intrinsic::arm_neon_vshifts :
3962                             Intrinsic::arm_neon_vshiftu);
3963  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
3964                     DAG.getConstant(vshiftInt, MVT::i32),
3965                     N->getOperand(0), NegatedCount);
3966}
3967
3968static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
3969                                const ARMSubtarget *ST) {
3970  EVT VT = N->getValueType(0);
3971  SDLoc dl(N);
3972
3973  // We can get here for a node like i32 = ISD::SHL i32, i64
3974  if (VT != MVT::i64)
3975    return SDValue();
3976
3977  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
3978         "Unknown shift to lower!");
3979
3980  // We only lower SRA and SRL by 1 here; all others use the generic lowering.
3981  if (!isa<ConstantSDNode>(N->getOperand(1)) ||
3982      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
3983    return SDValue();
3984
3985  // If we are in thumb mode, we don't have RRX.
3986  if (ST->isThumb1Only()) return SDValue();
3987
3988  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
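  // For example, for an unsigned x >> 1 this computes Hi' = Hi >> 1 (with the
  // low bit of Hi saved in the carry flag) and Lo' = RRX(Lo), i.e.
  // (carry << 31) | (Lo >> 1).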
3989  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
3990                           DAG.getConstant(0, MVT::i32));
3991  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
3992                           DAG.getConstant(1, MVT::i32));
3993
3994  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
3995  // captures the shifted-out bit in the carry flag.
3996  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
3997  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1);
3998
3999  // The low part is an ARMISD::RRX operand, which shifts the carry in.
4000  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
4001
4002  // Merge the pieces into a single i64 value.
4003  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
4004}
4005
4006static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
4007  SDValue TmpOp0, TmpOp1;
4008  bool Invert = false;
4009  bool Swap = false;
4010  unsigned Opc = 0;
4011
4012  SDValue Op0 = Op.getOperand(0);
4013  SDValue Op1 = Op.getOperand(1);
4014  SDValue CC = Op.getOperand(2);
4015  EVT VT = Op.getValueType();
4016  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
4017  SDLoc dl(Op);
4018
4019  if (Op.getOperand(1).getValueType().isFloatingPoint()) {
4020    switch (SetCCOpcode) {
4021    default: llvm_unreachable("Illegal FP comparison");
4022    case ISD::SETUNE:
4023    case ISD::SETNE:  Invert = true; // Fallthrough
4024    case ISD::SETOEQ:
4025    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
4026    case ISD::SETOLT:
4027    case ISD::SETLT: Swap = true; // Fallthrough
4028    case ISD::SETOGT:
4029    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
4030    case ISD::SETOLE:
4031    case ISD::SETLE:  Swap = true; // Fallthrough
4032    case ISD::SETOGE:
4033    case ISD::SETGE: Opc = ARMISD::VCGE; break;
4034    case ISD::SETUGE: Swap = true; // Fallthrough
4035    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
4036    case ISD::SETUGT: Swap = true; // Fallthrough
4037    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
4038    case ISD::SETUEQ: Invert = true; // Fallthrough
4039    case ISD::SETONE:
4040      // Expand this to (OLT | OGT).
4041      TmpOp0 = Op0;
4042      TmpOp1 = Op1;
4043      Opc = ISD::OR;
4044      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
4045      Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
4046      break;
4047    case ISD::SETUO: Invert = true; // Fallthrough
4048    case ISD::SETO:
4049      // Expand this to (OLT | OGE).
4050      TmpOp0 = Op0;
4051      TmpOp1 = Op1;
4052      Opc = ISD::OR;
4053      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
4054      Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
4055      break;
4056    }
4057  } else {
4058    // Integer comparisons.
4059    switch (SetCCOpcode) {
4060    default: llvm_unreachable("Illegal integer comparison");
4061    case ISD::SETNE:  Invert = true; // Fallthrough
4062    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
4063    case ISD::SETLT:  Swap = true; // Fallthrough
4064    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
4065    case ISD::SETLE:  Swap = true; // Fallthrough
4066    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
4067    case ISD::SETULT: Swap = true; // Fallthrough
4068    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
4069    case ISD::SETULE: Swap = true; // Fallthrough
4070    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
4071    }
4072
4073    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
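    // VTST sets a lane to all ones when (op0 & op1) is nonzero in that lane,
    // which is the negation of the VCEQ-against-zero we would otherwise emit,
    // so the pending Invert flag is flipped below to compensate.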
4074    if (Opc == ARMISD::VCEQ) {
4075
4076      SDValue AndOp;
4077      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
4078        AndOp = Op0;
4079      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
4080        AndOp = Op1;
4081
4082      // Ignore bitconvert.
4083      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
4084        AndOp = AndOp.getOperand(0);
4085
4086      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
4087        Opc = ARMISD::VTST;
4088        Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
4089        Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
4090        Invert = !Invert;
4091      }
4092    }
4093  }
4094
4095  if (Swap)
4096    std::swap(Op0, Op1);
4097
4098  // If one of the operands is a constant vector zero, attempt to fold the
4099  // comparison to a specialized compare-against-zero form.
4100  SDValue SingleOp;
4101  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
4102    SingleOp = Op0;
4103  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
4104    if (Opc == ARMISD::VCGE)
4105      Opc = ARMISD::VCLEZ;
4106    else if (Opc == ARMISD::VCGT)
4107      Opc = ARMISD::VCLTZ;
4108    SingleOp = Op1;
4109  }
4110
4111  SDValue Result;
4112  if (SingleOp.getNode()) {
4113    switch (Opc) {
4114    case ARMISD::VCEQ:
4115      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
4116    case ARMISD::VCGE:
4117      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
4118    case ARMISD::VCLEZ:
4119      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
4120    case ARMISD::VCGT:
4121      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
4122    case ARMISD::VCLTZ:
4123      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
4124    default:
4125      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
4126    }
4127  } else {
4128    Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
4129  }
4130
4131  if (Invert)
4132    Result = DAG.getNOT(dl, Result, VT);
4133
4134  return Result;
4135}
4136
4137/// isNEONModifiedImm - Check if the specified splat value corresponds to a
4138/// valid vector constant for a NEON instruction with a "modified immediate"
4139/// operand (e.g., VMOV).  If so, return the encoded value.
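/// For example, a 32-bit splat of 0x0000ff00 is encodable (Cmode=001x with
/// Imm=0xff), whereas an arbitrary pattern such as 0x12345678 matches none of
/// the cases below and falls back to other lowering.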
4140static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
4141                                 unsigned SplatBitSize, SelectionDAG &DAG,
4142                                 EVT &VT, bool is128Bits, NEONModImmType type) {
4143  unsigned OpCmode, Imm;
4144
4145  // SplatBitSize is set to the smallest size that splats the vector, so a
4146  // zero vector will always have SplatBitSize == 8.  However, NEON modified
4147  // immediate instructions other than VMOV do not support the 8-bit encoding
4148  // of a zero vector, and the default encoding of zero is supposed to be the
4149  // 32-bit version.
4150  if (SplatBits == 0)
4151    SplatBitSize = 32;
4152
4153  switch (SplatBitSize) {
4154  case 8:
4155    if (type != VMOVModImm)
4156      return SDValue();
4157    // Any 1-byte value is OK.  Op=0, Cmode=1110.
4158    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
4159    OpCmode = 0xe;
4160    Imm = SplatBits;
4161    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
4162    break;
4163
4164  case 16:
4165    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
4166    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
4167    if ((SplatBits & ~0xff) == 0) {
4168      // Value = 0x00nn: Op=x, Cmode=100x.
4169      OpCmode = 0x8;
4170      Imm = SplatBits;
4171      break;
4172    }
4173    if ((SplatBits & ~0xff00) == 0) {
4174      // Value = 0xnn00: Op=x, Cmode=101x.
4175      OpCmode = 0xa;
4176      Imm = SplatBits >> 8;
4177      break;
4178    }
4179    return SDValue();
4180
4181  case 32:
4182    // NEON's 32-bit VMOV supports splat values where:
4183    // * only one byte is nonzero, or
4184    // * the least significant byte is 0xff and the second byte is nonzero, or
4185    // * the least significant 2 bytes are 0xff and the third is nonzero.
4186    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
4187    if ((SplatBits & ~0xff) == 0) {
4188      // Value = 0x000000nn: Op=x, Cmode=000x.
4189      OpCmode = 0;
4190      Imm = SplatBits;
4191      break;
4192    }
4193    if ((SplatBits & ~0xff00) == 0) {
4194      // Value = 0x0000nn00: Op=x, Cmode=001x.
4195      OpCmode = 0x2;
4196      Imm = SplatBits >> 8;
4197      break;
4198    }
4199    if ((SplatBits & ~0xff0000) == 0) {
4200      // Value = 0x00nn0000: Op=x, Cmode=010x.
4201      OpCmode = 0x4;
4202      Imm = SplatBits >> 16;
4203      break;
4204    }
4205    if ((SplatBits & ~0xff000000) == 0) {
4206      // Value = 0xnn000000: Op=x, Cmode=011x.
4207      OpCmode = 0x6;
4208      Imm = SplatBits >> 24;
4209      break;
4210    }
4211
4212    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
4213    if (type == OtherModImm) return SDValue();
4214
4215    if ((SplatBits & ~0xffff) == 0 &&
4216        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
4217      // Value = 0x0000nnff: Op=x, Cmode=1100.
4218      OpCmode = 0xc;
4219      Imm = SplatBits >> 8;
4220      SplatBits |= 0xff;
4221      break;
4222    }
4223
4224    if ((SplatBits & ~0xffffff) == 0 &&
4225        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
4226      // Value = 0x00nnffff: Op=x, Cmode=1101.
4227      OpCmode = 0xd;
4228      Imm = SplatBits >> 16;
4229      SplatBits |= 0xffff;
4230      break;
4231    }
4232
4233    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
4234    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
4235    // VMOV.I32.  A (very) minor optimization would be to replicate the value
4236    // and fall through here to test for a valid 64-bit splat.  But, then the
4237    // caller would also need to check and handle the change in size.
4238    return SDValue();
4239
4240  case 64: {
4241    if (type != VMOVModImm)
4242      return SDValue();
4243    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
4244    uint64_t BitMask = 0xff;
4245    uint64_t Val = 0;
4246    unsigned ImmMask = 1;
4247    Imm = 0;
4248    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
4249      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
4250        Val |= BitMask;
4251        Imm |= ImmMask;
4252      } else if ((SplatBits & BitMask) != 0) {
4253        return SDValue();
4254      }
4255      BitMask <<= 8;
4256      ImmMask <<= 1;
4257    }
4258    // Op=1, Cmode=1110.
4259    OpCmode = 0x1e;
4260    SplatBits = Val;
4261    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
4262    break;
4263  }
4264
4265  default:
4266    llvm_unreachable("unexpected size for isNEONModifiedImm");
4267  }
4268
4269  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
4270  return DAG.getTargetConstant(EncodedVal, MVT::i32);
4271}
4272
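// LowerConstantFP - Materialize an FP constant without a load where possible:
// first try the VFP3 8-bit fp immediate form (possibly routed through a NEON
// splat + extract for f32 when NEON is preferred for single precision), and
// otherwise try building the bit pattern with an immediate VMOV.i32 or
// VMVN.i32 and bitcasting the result.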
4273SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
4274                                           const ARMSubtarget *ST) const {
4275  if (!ST->hasVFP3())
4276    return SDValue();
4277
4278  bool IsDouble = Op.getValueType() == MVT::f64;
4279  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
4280
4281  // Try splatting with a VMOV.f32...
4282  APFloat FPVal = CFP->getValueAPF();
4283  int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
4284
4285  if (ImmVal != -1) {
4286    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
4287      // We have code in place to select a valid ConstantFP already, no need to
4288      // do any mangling.
4289      return Op;
4290    }
4291
4292    // It's a float and we are trying to use NEON operations where
4293    // possible. Lower it to a splat followed by an extract.
4294    SDLoc DL(Op);
4295    SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
4296    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
4297                                      NewVal);
4298    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
4299                       DAG.getConstant(0, MVT::i32));
4300  }
4301
4302  // The rest of our options are NEON only, make sure that's allowed before
4303  // proceeding.
4304  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
4305    return SDValue();
4306
4307  EVT VMovVT;
4308  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
4309
4310  // For doubles this is only worthwhile when both 32-bit halves of the bit
4311  // pattern are identical, which covers the one really important value that
4312  // does match: 0.0.  Bail out for anything else.
4313  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
4314    return SDValue();
4315
4316  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
4317  SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
4318                                     false, VMOVModImm);
4319  if (NewVal != SDValue()) {
4320    SDLoc DL(Op);
4321    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
4322                                      NewVal);
4323    if (IsDouble)
4324      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
4325
4326    // It's a float: cast and extract a vector element.
4327    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
4328                                       VecConstant);
4329    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
4330                       DAG.getConstant(0, MVT::i32));
4331  }
4332
4333  // Finally, try a VMVN.i32
4334  NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
4335                             false, VMVNModImm);
4336  if (NewVal != SDValue()) {
4337    SDLoc DL(Op);
4338    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
4339
4340    if (IsDouble)
4341      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
4342
4343    // It's a float: cast and extract a vector element.
4344    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
4345                                       VecConstant);
4346    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
4347                       DAG.getConstant(0, MVT::i32));
4348  }
4349
4350  return SDValue();
4351}
4352
4353// Check if a VEXT instruction can handle the shuffle mask when the vector
4354// sources of the shuffle are the same.
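// For example, on v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> (indices rotating
// through a single source) corresponds to a VEXT with Imm = 3.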
4355static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
4356  unsigned NumElts = VT.getVectorNumElements();
4357
4358  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
4359  if (M[0] < 0)
4360    return false;
4361
4362  Imm = M[0];
4363
4364  // If this is a VEXT shuffle, the immediate value is the index of the first
4365  // element.  The other shuffle indices must be the successive elements after
4366  // the first one.
4367  unsigned ExpectedElt = Imm;
4368  for (unsigned i = 1; i < NumElts; ++i) {
4369    // Increment the expected index.  If it wraps around, just follow it
4370    // back to index zero and keep going.
4371    ++ExpectedElt;
4372    if (ExpectedElt == NumElts)
4373      ExpectedElt = 0;
4374
4375    if (M[i] < 0) continue; // ignore UNDEF indices
4376    if (ExpectedElt != static_cast<unsigned>(M[i]))
4377      return false;
4378  }
4379
4380  return true;
4381}
4382
4383
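// Check if a two-source VEXT can handle the shuffle mask.  For example, on
// v8i8 the mask <2, 3, 4, 5, 6, 7, 8, 9> is VEXT(V1, V2, #2); if the run of
// indices starts in the second source and wraps past 2*NumElts, the sources
// must be swapped and Imm is adjusted (ReverseVEXT).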
4384static bool isVEXTMask(ArrayRef<int> M, EVT VT,
4385                       bool &ReverseVEXT, unsigned &Imm) {
4386  unsigned NumElts = VT.getVectorNumElements();
4387  ReverseVEXT = false;
4388
4389  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
4390  if (M[0] < 0)
4391    return false;
4392
4393  Imm = M[0];
4394
4395  // If this is a VEXT shuffle, the immediate value is the index of the first
4396  // element.  The other shuffle indices must be the successive elements after
4397  // the first one.
4398  unsigned ExpectedElt = Imm;
4399  for (unsigned i = 1; i < NumElts; ++i) {
4400    // Increment the expected index.  If it wraps around, it may still be
4401    // a VEXT but the source vectors must be swapped.
4402    ExpectedElt += 1;
4403    if (ExpectedElt == NumElts * 2) {
4404      ExpectedElt = 0;
4405      ReverseVEXT = true;
4406    }
4407
4408    if (M[i] < 0) continue; // ignore UNDEF indices
4409    if (ExpectedElt != static_cast<unsigned>(M[i]))
4410      return false;
4411  }
4412
4413  // Adjust the index value if the source operands will be swapped.
4414  if (ReverseVEXT)
4415    Imm -= NumElts;
4416
4417  return true;
4418}
4419
4420/// isVREVMask - Check if a vector shuffle corresponds to a VREV
4421/// instruction with the specified blocksize.  (The order of the elements
4422/// within each block of the vector is reversed.)
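/// For example, on v4i16 the mask <1, 0, 3, 2> is a VREV32 (16-bit elements
/// reversed within each 32-bit block) and <3, 2, 1, 0> is a VREV64.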
4423static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
4424  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
4425         "Only possible block sizes for VREV are: 16, 32, 64");
4426
4427  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4428  if (EltSz == 64)
4429    return false;
4430
4431  unsigned NumElts = VT.getVectorNumElements();
4432  unsigned BlockElts = M[0] + 1;
4433  // If the first shuffle index is UNDEF, be optimistic.
4434  if (M[0] < 0)
4435    BlockElts = BlockSize / EltSz;
4436
4437  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
4438    return false;
4439
4440  for (unsigned i = 0; i < NumElts; ++i) {
4441    if (M[i] < 0) continue; // ignore UNDEF indices
4442    if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
4443      return false;
4444  }
4445
4446  return true;
4447}
4448
4449static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
4450  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
4451  // range, then 0 is placed into the resulting vector, so any mask of
4452  // 8 elements is acceptable here.
4453  return VT == MVT::v8i8 && M.size() == 8;
4454}
4455
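// isVTRNMask - Check for a VTRN (transpose) shuffle.  For a v4iXX type the
// two results use masks <0, 4, 2, 6> (WhichResult = 0) and <1, 5, 3, 7>
// (WhichResult = 1).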
4456static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4457  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4458  if (EltSz == 64)
4459    return false;
4460
4461  unsigned NumElts = VT.getVectorNumElements();
4462  WhichResult = (M[0] == 0 ? 0 : 1);
4463  for (unsigned i = 0; i < NumElts; i += 2) {
4464    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
4465        (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult))
4466      return false;
4467  }
4468  return true;
4469}
4470
4471/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
4472/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4473/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
4474static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
4475  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4476  if (EltSz == 64)
4477    return false;
4478
4479  unsigned NumElts = VT.getVectorNumElements();
4480  WhichResult = (M[0] == 0 ? 0 : 1);
4481  for (unsigned i = 0; i < NumElts; i += 2) {
4482    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
4483        (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult))
4484      return false;
4485  }
4486  return true;
4487}
4488
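// isVUZPMask - Check for a VUZP (unzip) shuffle.  For a v4iXX type the two
// results use masks <0, 2, 4, 6> (WhichResult = 0) and <1, 3, 5, 7>
// (WhichResult = 1).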
4489static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4490  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4491  if (EltSz == 64)
4492    return false;
4493
4494  unsigned NumElts = VT.getVectorNumElements();
4495  WhichResult = (M[0] == 0 ? 0 : 1);
4496  for (unsigned i = 0; i != NumElts; ++i) {
4497    if (M[i] < 0) continue; // ignore UNDEF indices
4498    if ((unsigned) M[i] != 2 * i + WhichResult)
4499      return false;
4500  }
4501
4502  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4503  if (VT.is64BitVector() && EltSz == 32)
4504    return false;
4505
4506  return true;
4507}
4508
4509/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
4510/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4511/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
4512static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
4513  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4514  if (EltSz == 64)
4515    return false;
4516
4517  unsigned Half = VT.getVectorNumElements() / 2;
4518  WhichResult = (M[0] == 0 ? 0 : 1);
4519  for (unsigned j = 0; j != 2; ++j) {
4520    unsigned Idx = WhichResult;
4521    for (unsigned i = 0; i != Half; ++i) {
4522      int MIdx = M[i + j * Half];
4523      if (MIdx >= 0 && (unsigned) MIdx != Idx)
4524        return false;
4525      Idx += 2;
4526    }
4527  }
4528
4529  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4530  if (VT.is64BitVector() && EltSz == 32)
4531    return false;
4532
4533  return true;
4534}
4535
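// isVZIPMask - Check for a VZIP (interleave) shuffle.  For a v4iXX type the
// two results use masks <0, 4, 1, 5> (WhichResult = 0) and <2, 6, 3, 7>
// (WhichResult = 1).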
4536static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4537  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4538  if (EltSz == 64)
4539    return false;
4540
4541  unsigned NumElts = VT.getVectorNumElements();
4542  WhichResult = (M[0] == 0 ? 0 : 1);
4543  unsigned Idx = WhichResult * NumElts / 2;
4544  for (unsigned i = 0; i != NumElts; i += 2) {
4545    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
4546        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts))
4547      return false;
4548    Idx += 1;
4549  }
4550
4551  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4552  if (VT.is64BitVector() && EltSz == 32)
4553    return false;
4554
4555  return true;
4556}
4557
4558/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
4559/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4560/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
4561static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
4562  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4563  if (EltSz == 64)
4564    return false;
4565
4566  unsigned NumElts = VT.getVectorNumElements();
4567  WhichResult = (M[0] == 0 ? 0 : 1);
4568  unsigned Idx = WhichResult * NumElts / 2;
4569  for (unsigned i = 0; i != NumElts; i += 2) {
4570    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
4571        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx))
4572      return false;
4573    Idx += 1;
4574  }
4575
4576  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4577  if (VT.is64BitVector() && EltSz == 32)
4578    return false;
4579
4580  return true;
4581}
4582
4583/// \return true if this is a reverse operation on an vector.
4584static bool isReverseMask(ArrayRef<int> M, EVT VT) {
4585  unsigned NumElts = VT.getVectorNumElements();
4586  // Make sure the mask has the right size.
4587  if (NumElts != M.size())
4588      return false;
4589
4590  // Look for <15, ..., 3, -1, 1, 0>.
4591  for (unsigned i = 0; i != NumElts; ++i)
4592    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
4593      return false;
4594
4595  return true;
4596}
4597
4598// If N is an integer constant that can be moved into a register in one
4599// instruction, return an SDValue of such a constant (will become a MOV
4600// instruction).  Otherwise return null.
4601static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
4602                                     const ARMSubtarget *ST, SDLoc dl) {
4603  uint64_t Val;
4604  if (!isa<ConstantSDNode>(N))
4605    return SDValue();
4606  Val = cast<ConstantSDNode>(N)->getZExtValue();
4607
4608  if (ST->isThumb1Only()) {
4609    if (Val <= 255 || ~Val <= 255)
4610      return DAG.getConstant(Val, MVT::i32);
4611  } else {
4612    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
4613      return DAG.getConstant(Val, MVT::i32);
4614  }
4615  return SDValue();
4616}
4617
4618// If this is a case we can't handle, return null and let the default
4619// expansion code take care of it.
4620SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
4621                                             const ARMSubtarget *ST) const {
4622  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
4623  SDLoc dl(Op);
4624  EVT VT = Op.getValueType();
4625
4626  APInt SplatBits, SplatUndef;
4627  unsigned SplatBitSize;
4628  bool HasAnyUndefs;
4629  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
4630    if (SplatBitSize <= 64) {
4631      // Check if an immediate VMOV works.
4632      EVT VmovVT;
4633      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
4634                                      SplatUndef.getZExtValue(), SplatBitSize,
4635                                      DAG, VmovVT, VT.is128BitVector(),
4636                                      VMOVModImm);
4637      if (Val.getNode()) {
4638        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
4639        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4640      }
4641
4642      // Try an immediate VMVN.
4643      uint64_t NegatedImm = (~SplatBits).getZExtValue();
4644      Val = isNEONModifiedImm(NegatedImm,
4645                                      SplatUndef.getZExtValue(), SplatBitSize,
4646                                      DAG, VmovVT, VT.is128BitVector(),
4647                                      VMVNModImm);
4648      if (Val.getNode()) {
4649        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
4650        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4651      }
4652
4653      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
4654      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
4655        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
4656        if (ImmVal != -1) {
4657          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
4658          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
4659        }
4660      }
4661    }
4662  }
4663
4664  // Scan through the operands to see if only one value is used.
4665  //
4666  // As an optimisation, even if more than one value is used it may be more
4667  // profitable to splat with one value then change some lanes.
4668  //
4669  // Heuristically we decide to do this if the vector has a "dominant" value,
4670  // defined as splatted to more than half of the lanes.
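  // For example, a non-constant <x, x, y, x> is lowered to a VDUP of x
  // followed by an INSERT_VECTOR_ELT of y into lane 2.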
4671  unsigned NumElts = VT.getVectorNumElements();
4672  bool isOnlyLowElement = true;
4673  bool usesOnlyOneValue = true;
4674  bool hasDominantValue = false;
4675  bool isConstant = true;
4676
4677  // Map of the number of times a particular SDValue appears in the
4678  // element list.
4679  DenseMap<SDValue, unsigned> ValueCounts;
4680  SDValue Value;
4681  for (unsigned i = 0; i < NumElts; ++i) {
4682    SDValue V = Op.getOperand(i);
4683    if (V.getOpcode() == ISD::UNDEF)
4684      continue;
4685    if (i > 0)
4686      isOnlyLowElement = false;
4687    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
4688      isConstant = false;
4689
4690    ValueCounts.insert(std::make_pair(V, 0));
4691    unsigned &Count = ValueCounts[V];
4692
4693    // Is this value dominant? (takes up more than half of the lanes)
4694    if (++Count > (NumElts / 2)) {
4695      hasDominantValue = true;
4696      Value = V;
4697    }
4698  }
4699  if (ValueCounts.size() != 1)
4700    usesOnlyOneValue = false;
4701  if (!Value.getNode() && ValueCounts.size() > 0)
4702    Value = ValueCounts.begin()->first;
4703
4704  if (ValueCounts.size() == 0)
4705    return DAG.getUNDEF(VT);
4706
4707  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
4708  // Keep going (rather than returning early) when we hit this case.
4709  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
4710    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
4711
4712  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4713
4714  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
4715  // i32 and try again.
4716  if (hasDominantValue && EltSize <= 32) {
4717    if (!isConstant) {
4718      SDValue N;
4719
4720      // If we are VDUPing a value that comes directly from a vector, that will
4721      // cause an unnecessary move to and from a GPR, where instead we could
4722      // just use VDUPLANE. We can only do this if the lane being extracted
4723      // is at a constant index, as the VDUP from lane instructions only have
4724      // constant-index forms.
4725      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
4726          isa<ConstantSDNode>(Value->getOperand(1))) {
4727        // We need to create a new undef vector to use for the VDUPLANE if the
4728        // size of the vector from which we get the value is different than the
4729        // size of the vector that we need to create. We will insert the element
4730        // such that the register coalescer will remove unnecessary copies.
4731        if (VT != Value->getOperand(0).getValueType()) {
4732          ConstantSDNode *constIndex;
4733          constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
4734          assert(constIndex && "The index is not a constant!");
4735          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
4736                             VT.getVectorNumElements();
4737          N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
4738                 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
4739                        Value, DAG.getConstant(index, MVT::i32)),
4740                           DAG.getConstant(index, MVT::i32));
4741        } else
4742          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
4743                        Value->getOperand(0), Value->getOperand(1));
4744      } else
4745        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
4746
4747      if (!usesOnlyOneValue) {
4748        // The dominant value was splatted as 'N', but we now have to insert
4749        // all differing elements.
4750        for (unsigned I = 0; I < NumElts; ++I) {
4751          if (Op.getOperand(I) == Value)
4752            continue;
4753          SmallVector<SDValue, 3> Ops;
4754          Ops.push_back(N);
4755          Ops.push_back(Op.getOperand(I));
4756          Ops.push_back(DAG.getConstant(I, MVT::i32));
4757          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
4758        }
4759      }
4760      return N;
4761    }
4762    if (VT.getVectorElementType().isFloatingPoint()) {
4763      SmallVector<SDValue, 8> Ops;
4764      for (unsigned i = 0; i < NumElts; ++i)
4765        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
4766                                  Op.getOperand(i)));
4767      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
4768      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
4769      Val = LowerBUILD_VECTOR(Val, DAG, ST);
4770      if (Val.getNode())
4771        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
4772    }
4773    if (usesOnlyOneValue) {
4774      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
4775      if (isConstant && Val.getNode())
4776        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
4777    }
4778  }
4779
4780  // If all elements are constants and the case above didn't get hit, fall back
4781  // to the default expansion, which will generate a load from the constant
4782  // pool.
4783  if (isConstant)
4784    return SDValue();
4785
4786  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
4787  if (NumElts >= 4) {
4788    SDValue shuffle = ReconstructShuffle(Op, DAG);
4789    if (shuffle != SDValue())
4790      return shuffle;
4791  }
4792
4793  // Vectors with 32- or 64-bit elements can be built by directly assigning
4794  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
4795  // will be legalized.
4796  if (EltSize >= 32) {
4797    // Do the expansion with floating-point types, since that is what the VFP
4798    // registers are defined to use, and since i64 is not legal.
4799    EVT EltVT = EVT::getFloatingPointVT(EltSize);
4800    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
4801    SmallVector<SDValue, 8> Ops;
4802    for (unsigned i = 0; i < NumElts; ++i)
4803      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
4804    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
4805    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
4806  }
4807
4808  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
4809  // know the default expansion would otherwise fall back on something even
4810  // worse: for a vector with one or two non-undef values the default is
4811  // scalar_to_vector for the elements followed by a shuffle (provided the
4812  // shuffle is valid for the target), and for everything else it is
4813  // element-by-element materialization on the stack followed by a load.
4814  if (!isConstant && !usesOnlyOneValue) {
4815    SDValue Vec = DAG.getUNDEF(VT);
4816    for (unsigned i = 0 ; i < NumElts; ++i) {
4817      SDValue V = Op.getOperand(i);
4818      if (V.getOpcode() == ISD::UNDEF)
4819        continue;
4820      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
4821      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
4822    }
4823    return Vec;
4824  }
4825
4826  return SDValue();
4827}
4828
4829// Gather data to see if the operation can be modelled as a
4830// shuffle in combination with VEXTs.
4831SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
4832                                              SelectionDAG &DAG) const {
4833  SDLoc dl(Op);
4834  EVT VT = Op.getValueType();
4835  unsigned NumElts = VT.getVectorNumElements();
4836
4837  SmallVector<SDValue, 2> SourceVecs;
4838  SmallVector<unsigned, 2> MinElts;
4839  SmallVector<unsigned, 2> MaxElts;
4840
4841  for (unsigned i = 0; i < NumElts; ++i) {
4842    SDValue V = Op.getOperand(i);
4843    if (V.getOpcode() == ISD::UNDEF)
4844      continue;
4845    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
4846      // A shuffle can only come from building a vector from various
4847      // elements of other vectors.
4848      return SDValue();
4849    } else if (V.getOperand(0).getValueType().getVectorElementType() !=
4850               VT.getVectorElementType()) {
4851      // This code doesn't know how to handle shuffles where the vector
4852      // element types do not match (this happens because type legalization
4853      // promotes the return type of EXTRACT_VECTOR_ELT).
4854      // FIXME: It might be appropriate to extend this code to handle
4855      // mismatched types.
4856      return SDValue();
4857    }
4858
4859    // Record this extraction against the appropriate vector if possible...
4860    SDValue SourceVec = V.getOperand(0);
4861    // If the element number isn't a constant, we can't effectively
4862    // analyze what's going on.
4863    if (!isa<ConstantSDNode>(V.getOperand(1)))
4864      return SDValue();
4865    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
4866    bool FoundSource = false;
4867    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
4868      if (SourceVecs[j] == SourceVec) {
4869        if (MinElts[j] > EltNo)
4870          MinElts[j] = EltNo;
4871        if (MaxElts[j] < EltNo)
4872          MaxElts[j] = EltNo;
4873        FoundSource = true;
4874        break;
4875      }
4876    }
4877
4878    // Or record a new source if not...
4879    if (!FoundSource) {
4880      SourceVecs.push_back(SourceVec);
4881      MinElts.push_back(EltNo);
4882      MaxElts.push_back(EltNo);
4883    }
4884  }
4885
4886  // Currently we only do something sensible when at most two source vectors
4887  // are involved.
4888  if (SourceVecs.size() > 2)
4889    return SDValue();
4890
4891  SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
4892  int VEXTOffsets[2] = {0, 0};
4893
4894  // This loop extracts the usage patterns of the source vectors
4895  // and prepares appropriate SDValues for a shuffle if possible.
4896  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
4897    if (SourceVecs[i].getValueType() == VT) {
4898      // No VEXT necessary
4899      ShuffleSrcs[i] = SourceVecs[i];
4900      VEXTOffsets[i] = 0;
4901      continue;
4902    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
4903      // It probably isn't worth padding out a smaller vector just to
4904      // break it down again in a shuffle.
4905      return SDValue();
4906    }
4907
4908    // Since only 64-bit and 128-bit vectors are legal on ARM and
4909    // we've eliminated the other cases...
4910    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
4911           "unexpected vector sizes in ReconstructShuffle");
4912
4913    if (MaxElts[i] - MinElts[i] >= NumElts) {
4914      // Span too large for a VEXT to cope
4915      return SDValue();
4916    }
4917
4918    if (MinElts[i] >= NumElts) {
4919      // The extraction can just take the second half
4920      VEXTOffsets[i] = NumElts;
4921      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
4922                                   SourceVecs[i],
4923                                   DAG.getIntPtrConstant(NumElts));
4924    } else if (MaxElts[i] < NumElts) {
4925      // The extraction can just take the first half
4926      VEXTOffsets[i] = 0;
4927      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
4928                                   SourceVecs[i],
4929                                   DAG.getIntPtrConstant(0));
4930    } else {
4931      // An actual VEXT is needed
4932      VEXTOffsets[i] = MinElts[i];
4933      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
4934                                     SourceVecs[i],
4935                                     DAG.getIntPtrConstant(0));
4936      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
4937                                     SourceVecs[i],
4938                                     DAG.getIntPtrConstant(NumElts));
4939      ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
4940                                   DAG.getConstant(VEXTOffsets[i], MVT::i32));
4941    }
4942  }
4943
4944  SmallVector<int, 8> Mask;
4945
4946  for (unsigned i = 0; i < NumElts; ++i) {
4947    SDValue Entry = Op.getOperand(i);
4948    if (Entry.getOpcode() == ISD::UNDEF) {
4949      Mask.push_back(-1);
4950      continue;
4951    }
4952
4953    SDValue ExtractVec = Entry.getOperand(0);
4954    int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
4955                                          .getOperand(1))->getSExtValue();
4956    if (ExtractVec == SourceVecs[0]) {
4957      Mask.push_back(ExtractElt - VEXTOffsets[0]);
4958    } else {
4959      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
4960    }
4961  }
4962
4963  // Final check before we try to produce nonsense...
4964  if (isShuffleMaskLegal(Mask, VT))
4965    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
4966                                &Mask[0]);
4967
4968  return SDValue();
4969}
4970
4971/// isShuffleMaskLegal - Targets can use this to indicate that they only
4972/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
4973/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
4974/// are assumed to be legal.
4975bool
4976ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
4977                                      EVT VT) const {
4978  if (VT.getVectorNumElements() == 4 &&
4979      (VT.is128BitVector() || VT.is64BitVector())) {
4980    unsigned PFIndexes[4];
4981    for (unsigned i = 0; i != 4; ++i) {
4982      if (M[i] < 0)
4983        PFIndexes[i] = 8;
4984      else
4985        PFIndexes[i] = M[i];
4986    }
4987
4988    // Compute the index in the perfect shuffle table.
4989    unsigned PFTableIndex =
4990      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
4991    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
4992    unsigned Cost = (PFEntry >> 30);
4993
4994    if (Cost <= 4)
4995      return true;
4996  }
4997
4998  bool ReverseVEXT;
4999  unsigned Imm, WhichResult;
5000
5001  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5002  return (EltSize >= 32 ||
5003          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
5004          isVREVMask(M, VT, 64) ||
5005          isVREVMask(M, VT, 32) ||
5006          isVREVMask(M, VT, 16) ||
5007          isVEXTMask(M, VT, ReverseVEXT, Imm) ||
5008          isVTBLMask(M, VT) ||
5009          isVTRNMask(M, VT, WhichResult) ||
5010          isVUZPMask(M, VT, WhichResult) ||
5011          isVZIPMask(M, VT, WhichResult) ||
5012          isVTRN_v_undef_Mask(M, VT, WhichResult) ||
5013          isVUZP_v_undef_Mask(M, VT, WhichResult) ||
5014          isVZIP_v_undef_Mask(M, VT, WhichResult) ||
5015          ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
5016}
5017
5018/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
5019/// the specified operations to build the shuffle.
5020static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
5021                                      SDValue RHS, SelectionDAG &DAG,
5022                                      SDLoc dl) {
5023  unsigned OpNum = (PFEntry >> 26) & 0x0F;
5024  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
5025  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
5026
5027  enum {
5028    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
5029    OP_VREV,
5030    OP_VDUP0,
5031    OP_VDUP1,
5032    OP_VDUP2,
5033    OP_VDUP3,
5034    OP_VEXT1,
5035    OP_VEXT2,
5036    OP_VEXT3,
5037    OP_VUZPL, // VUZP, left result
5038    OP_VUZPR, // VUZP, right result
5039    OP_VZIPL, // VZIP, left result
5040    OP_VZIPR, // VZIP, right result
5041    OP_VTRNL, // VTRN, left result
5042    OP_VTRNR  // VTRN, right result
5043  };
5044
5045  if (OpNum == OP_COPY) {
5046    if (LHSID == (1*9+2)*9+3) return LHS;
5047    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
5048    return RHS;
5049  }
5050
5051  SDValue OpLHS, OpRHS;
5052  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
5053  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
5054  EVT VT = OpLHS.getValueType();
5055
5056  switch (OpNum) {
5057  default: llvm_unreachable("Unknown shuffle opcode!");
5058  case OP_VREV:
5059    // VREV divides the vector in half and swaps within the half.
5060    if (VT.getVectorElementType() == MVT::i32 ||
5061        VT.getVectorElementType() == MVT::f32)
5062      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
5063    // vrev <4 x i16> -> VREV32
5064    if (VT.getVectorElementType() == MVT::i16)
5065      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
5066    // vrev <4 x i8> -> VREV16
5067    assert(VT.getVectorElementType() == MVT::i8);
5068    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
5069  case OP_VDUP0:
5070  case OP_VDUP1:
5071  case OP_VDUP2:
5072  case OP_VDUP3:
5073    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
5074                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
5075  case OP_VEXT1:
5076  case OP_VEXT2:
5077  case OP_VEXT3:
5078    return DAG.getNode(ARMISD::VEXT, dl, VT,
5079                       OpLHS, OpRHS,
5080                       DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
5081  case OP_VUZPL:
5082  case OP_VUZPR:
5083    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
5084                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
5085  case OP_VZIPL:
5086  case OP_VZIPR:
5087    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
5088                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
5089  case OP_VTRNL:
5090  case OP_VTRNR:
5091    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
5092                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
5093  }
5094}
5095
5096static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
5097                                       ArrayRef<int> ShuffleMask,
5098                                       SelectionDAG &DAG) {
5099  // Check to see if we can use the VTBL instruction.
5100  SDValue V1 = Op.getOperand(0);
5101  SDValue V2 = Op.getOperand(1);
5102  SDLoc DL(Op);
5103
5104  SmallVector<SDValue, 8> VTBLMask;
5105  for (ArrayRef<int>::iterator
5106         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
5107    VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
5108
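      // VTBL reads one byte per output lane from a table of D registers using
      // the index vector built above. With V2 undef a one-register table
      // (VTBL1) suffices; otherwise V1 and V2 form a two-register table (VTBL2).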
5109  if (V2.getNode()->getOpcode() == ISD::UNDEF)
5110    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
5111                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
5112                                   &VTBLMask[0], 8));
5113
5114  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
5115                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
5116                                 &VTBLMask[0], 8));
5117}
5118
5119static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
5120                                                      SelectionDAG &DAG) {
5121  SDLoc DL(Op);
5122  SDValue OpLHS = Op.getOperand(0);
5123  EVT VT = OpLHS.getValueType();
5124
5125  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
5126         "Expect a v8i16/v16i8 type");
5127  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
5128  // For a v16i8 type: After the VREV, we have <7, ..., 0, 15, ..., 8>. Now,
5129  // extract the first 8 bytes into the top double word and the last 8 bytes
5130  // into the bottom double word. The v8i16 case is similar.
5131  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
5132  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
5133                     DAG.getConstant(ExtractNum, MVT::i32));
5134}
5135
5136static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
5137  SDValue V1 = Op.getOperand(0);
5138  SDValue V2 = Op.getOperand(1);
5139  SDLoc dl(Op);
5140  EVT VT = Op.getValueType();
5141  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
5142
5143  // Convert shuffles that are directly supported on NEON to target-specific
5144  // DAG nodes, instead of keeping them as shuffles and matching them again
5145  // during code selection.  This is more efficient and avoids the possibility
5146  // of inconsistencies between legalization and selection.
5147  // FIXME: floating-point vectors should be canonicalized to integer vectors
5148  // of the same size so that they get CSEd properly.
5149  ArrayRef<int> ShuffleMask = SVN->getMask();
5150
5151  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5152  if (EltSize <= 32) {
5153    if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
5154      int Lane = SVN->getSplatIndex();
5155      // If the splat index is undef, treat it as a splat of lane 0.
5156      if (Lane == -1) Lane = 0;
5157
5158      // Test if V1 is a SCALAR_TO_VECTOR.
5159      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5160        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
5161      }
5162      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
5163      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
5164      // reaches it).
5165      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
5166          !isa<ConstantSDNode>(V1.getOperand(0))) {
5167        bool IsScalarToVector = true;
5168        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
5169          if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
5170            IsScalarToVector = false;
5171            break;
5172          }
5173        if (IsScalarToVector)
5174          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
5175      }
5176      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
5177                         DAG.getConstant(Lane, MVT::i32));
5178    }
5179
5180    bool ReverseVEXT;
5181    unsigned Imm;
5182    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
5183      if (ReverseVEXT)
5184        std::swap(V1, V2);
5185      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
5186                         DAG.getConstant(Imm, MVT::i32));
5187    }
5188
5189    if (isVREVMask(ShuffleMask, VT, 64))
5190      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
5191    if (isVREVMask(ShuffleMask, VT, 32))
5192      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
5193    if (isVREVMask(ShuffleMask, VT, 16))
5194      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
5195
5196    if (V2->getOpcode() == ISD::UNDEF &&
5197        isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
5198      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
5199                         DAG.getConstant(Imm, MVT::i32));
5200    }
5201
5202    // Check for Neon shuffles that modify both input vectors in place.
5203    // If both results are used, i.e., if there are two shuffles with the same
5204    // source operands and with masks corresponding to both results of one of
5205    // these operations, DAG memoization will ensure that a single node is
5206    // used for both shuffles.
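        // For example, on v4i16 the two results of VTRN correspond to the masks
        // <0,4,2,6> and <1,5,3,7>, VUZP to <0,2,4,6> and <1,3,5,7>, and VZIP to
        // <0,4,1,5> and <2,6,3,7>; WhichResult selects between the two.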
5207    unsigned WhichResult;
5208    if (isVTRNMask(ShuffleMask, VT, WhichResult))
5209      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
5210                         V1, V2).getValue(WhichResult);
5211    if (isVUZPMask(ShuffleMask, VT, WhichResult))
5212      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
5213                         V1, V2).getValue(WhichResult);
5214    if (isVZIPMask(ShuffleMask, VT, WhichResult))
5215      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
5216                         V1, V2).getValue(WhichResult);
5217
5218    if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
5219      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
5220                         V1, V1).getValue(WhichResult);
5221    if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5222      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
5223                         V1, V1).getValue(WhichResult);
5224    if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
5225      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
5226                         V1, V1).getValue(WhichResult);
5227  }
5228
5229  // If the shuffle is not directly supported and it has 4 elements, use
5230  // the PerfectShuffle-generated table to synthesize it from other shuffles.
5231  unsigned NumElts = VT.getVectorNumElements();
5232  if (NumElts == 4) {
5233    unsigned PFIndexes[4];
5234    for (unsigned i = 0; i != 4; ++i) {
5235      if (ShuffleMask[i] < 0)
5236        PFIndexes[i] = 8;
5237      else
5238        PFIndexes[i] = ShuffleMask[i];
5239    }
5240
5241    // Compute the index in the perfect shuffle table.
5242    unsigned PFTableIndex =
5243      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
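        // For example, the identity mask <0,1,2,3> gives the index
        // 0*729 + 1*81 + 2*9 + 3 = 102, matching the (1*9+2)*9+3 OP_COPY ID
        // used in GeneratePerfectShuffle.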
5244    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
5245    unsigned Cost = (PFEntry >> 30);
5246
5247    if (Cost <= 4)
5248      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
5249  }
5250
5251  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
5252  if (EltSize >= 32) {
5253    // Do the expansion with floating-point types, since that is what the VFP
5254    // registers are defined to use, and since i64 is not legal.
5255    EVT EltVT = EVT::getFloatingPointVT(EltSize);
5256    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
5257    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
5258    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
5259    SmallVector<SDValue, 8> Ops;
5260    for (unsigned i = 0; i < NumElts; ++i) {
5261      if (ShuffleMask[i] < 0)
5262        Ops.push_back(DAG.getUNDEF(EltVT));
5263      else
5264        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
5265                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
5266                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
5267                                                  MVT::i32)));
5268    }
5269    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
5270    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
5271  }
5272
5273  if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
5274    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
5275
5276  if (VT == MVT::v8i8) {
5277    SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
5278    if (NewOp.getNode())
5279      return NewOp;
5280  }
5281
5282  return SDValue();
5283}
5284
5285static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
5286  // INSERT_VECTOR_ELT is legal only for immediate indexes.
5287  SDValue Lane = Op.getOperand(2);
5288  if (!isa<ConstantSDNode>(Lane))
5289    return SDValue();
5290
5291  return Op;
5292}
5293
5294static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
5295  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
5296  SDValue Lane = Op.getOperand(1);
5297  if (!isa<ConstantSDNode>(Lane))
5298    return SDValue();
5299
5300  SDValue Vec = Op.getOperand(0);
5301  if (Op.getValueType() == MVT::i32 &&
5302      Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
5303    SDLoc dl(Op);
5304    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
5305  }
5306
5307  return Op;
5308}
5309
5310static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
5311  // The only time a CONCAT_VECTORS operation can have legal types is when
5312  // two 64-bit vectors are concatenated to a 128-bit vector.
5313  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
5314         "unexpected CONCAT_VECTORS");
5315  SDLoc dl(Op);
5316  SDValue Val = DAG.getUNDEF(MVT::v2f64);
5317  SDValue Op0 = Op.getOperand(0);
5318  SDValue Op1 = Op.getOperand(1);
5319  if (Op0.getOpcode() != ISD::UNDEF)
5320    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
5321                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
5322                      DAG.getIntPtrConstant(0));
5323  if (Op1.getOpcode() != ISD::UNDEF)
5324    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
5325                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
5326                      DAG.getIntPtrConstant(1));
5327  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
5328}
5329
5330/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
5331/// element has been zero/sign-extended, depending on the isSigned parameter,
5332/// from an integer type half its size.
5333static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
5334                                   bool isSigned) {
5335  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
5336  EVT VT = N->getValueType(0);
5337  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
5338    SDNode *BVN = N->getOperand(0).getNode();
5339    if (BVN->getValueType(0) != MVT::v4i32 ||
5340        BVN->getOpcode() != ISD::BUILD_VECTOR)
5341      return false;
5342    unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
5343    unsigned HiElt = 1 - LoElt;
5344    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
5345    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
5346    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
5347    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
5348    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
5349      return false;
5350    if (isSigned) {
5351      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
5352          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
5353        return true;
5354    } else {
5355      if (Hi0->isNullValue() && Hi1->isNullValue())
5356        return true;
5357    }
5358    return false;
5359  }
5360
5361  if (N->getOpcode() != ISD::BUILD_VECTOR)
5362    return false;
5363
5364  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
5365    SDNode *Elt = N->getOperand(i).getNode();
5366    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5367      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
5368      unsigned HalfSize = EltSize / 2;
5369      if (isSigned) {
5370        if (!isIntN(HalfSize, C->getSExtValue()))
5371          return false;
5372      } else {
5373        if (!isUIntN(HalfSize, C->getZExtValue()))
5374          return false;
5375      }
5376      continue;
5377    }
5378    return false;
5379  }
5380
5381  return true;
5382}
5383
5384/// isSignExtended - Check if a node is a vector value that is sign-extended
5385/// or a constant BUILD_VECTOR with sign-extended elements.
5386static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
5387  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
5388    return true;
5389  if (isExtendedBUILD_VECTOR(N, DAG, true))
5390    return true;
5391  return false;
5392}
5393
5394/// isZeroExtended - Check if a node is a vector value that is zero-extended
5395/// or a constant BUILD_VECTOR with zero-extended elements.
5396static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
5397  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
5398    return true;
5399  if (isExtendedBUILD_VECTOR(N, DAG, false))
5400    return true;
5401  return false;
5402}
5403
5404static EVT getExtensionTo64Bits(const EVT &OrigVT) {
5405  if (OrigVT.getSizeInBits() >= 64)
5406    return OrigVT;
5407
5408  assert(OrigVT.isSimple() && "Expecting a simple value type");
5409
5410  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
5411  switch (OrigSimpleTy) {
5412  default: llvm_unreachable("Unexpected Vector Type");
5413  case MVT::v2i8:
5414  case MVT::v2i16:
5415    return MVT::v2i32;
5416  case MVT::v4i8:
5417    return MVT::v4i16;
5418  }
5419}
5420
5421/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
5422/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
5423/// We insert the required extension here to get the vector to fill a D register.
5424static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
5425                                            const EVT &OrigTy,
5426                                            const EVT &ExtTy,
5427                                            unsigned ExtOpcode) {
5428  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
5429  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
5430  // 64-bits we need to insert a new extension so that it will be 64-bits.
5431  assert(ExtTy.is128BitVector() && "Unexpected extension size");
5432  if (OrigTy.getSizeInBits() >= 64)
5433    return N;
5434
5435  // Must extend size to at least 64 bits to be used as an operand for VMULL.
5436  EVT NewVT = getExtensionTo64Bits(OrigTy);
5437
5438  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
5439}
5440
5441/// SkipLoadExtensionForVMULL - return a load of the original vector size that
5442/// does not do any sign/zero extension. If the original vector is less
5443/// than 64 bits, an appropriate extension will be added after the load to
5444/// reach a total size of 64 bits. We have to add the extension separately
5445/// because ARM does not have a sign/zero extending load for vectors.
5446static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
5447  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
5448
5449  // The load already has the right type.
5450  if (ExtendedTy == LD->getMemoryVT())
5451    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
5452                LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
5453                LD->isNonTemporal(), LD->isInvariant(),
5454                LD->getAlignment());
5455
5456  // We need to create a zextload/sextload. We cannot just create a load
5457  // followed by a zext/sext node because LowerMUL is also run during normal
5458  // operation legalization where we can't create illegal types.
5459  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
5460                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
5461                        LD->getMemoryVT(), LD->isVolatile(),
5462                        LD->isNonTemporal(), LD->getAlignment());
5463}
5464
5465/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
5466/// extending load, or BUILD_VECTOR with extended elements, return the
5467/// unextended value. The unextended vector should be 64 bits so that it can
5468/// be used as an operand to a VMULL instruction. If the original vector size
5469/// before extension is less than 64 bits we add an extension to resize
5470/// the vector to 64 bits.
5471static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
5472  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
5473    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
5474                                        N->getOperand(0)->getValueType(0),
5475                                        N->getValueType(0),
5476                                        N->getOpcode());
5477
5478  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
5479    return SkipLoadExtensionForVMULL(LD, DAG);
5480
5481  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
5482  // have been legalized as a BITCAST from v4i32.
5483  if (N->getOpcode() == ISD::BITCAST) {
5484    SDNode *BVN = N->getOperand(0).getNode();
5485    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
5486           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
5487    unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
5488    return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
5489                       BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
5490  }
5491  // Construct a new BUILD_VECTOR with elements truncated to half the size.
5492  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
5493  EVT VT = N->getValueType(0);
5494  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
5495  unsigned NumElts = VT.getVectorNumElements();
5496  MVT TruncVT = MVT::getIntegerVT(EltSize);
5497  SmallVector<SDValue, 8> Ops;
5498  for (unsigned i = 0; i != NumElts; ++i) {
5499    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
5500    const APInt &CInt = C->getAPIntValue();
5501    // Element types smaller than 32 bits are not legal, so use i32 elements.
5502    // The values are implicitly truncated so sext vs. zext doesn't matter.
5503    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
5504  }
5505  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
5506                     MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
5507}
5508
5509static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
5510  unsigned Opcode = N->getOpcode();
5511  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5512    SDNode *N0 = N->getOperand(0).getNode();
5513    SDNode *N1 = N->getOperand(1).getNode();
5514    return N0->hasOneUse() && N1->hasOneUse() &&
5515      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5516  }
5517  return false;
5518}
5519
5520static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
5521  unsigned Opcode = N->getOpcode();
5522  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5523    SDNode *N0 = N->getOperand(0).getNode();
5524    SDNode *N1 = N->getOperand(1).getNode();
5525    return N0->hasOneUse() && N1->hasOneUse() &&
5526      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5527  }
5528  return false;
5529}
5530
5531static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
5532  // Multiplications are only custom-lowered for 128-bit vectors so that
5533  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
5534  EVT VT = Op.getValueType();
5535  assert(VT.is128BitVector() && VT.isInteger() &&
5536         "unexpected type for custom-lowering ISD::MUL");
5537  SDNode *N0 = Op.getOperand(0).getNode();
5538  SDNode *N1 = Op.getOperand(1).getNode();
5539  unsigned NewOpc = 0;
5540  bool isMLA = false;
5541  bool isN0SExt = isSignExtended(N0, DAG);
5542  bool isN1SExt = isSignExtended(N1, DAG);
5543  if (isN0SExt && isN1SExt)
5544    NewOpc = ARMISD::VMULLs;
5545  else {
5546    bool isN0ZExt = isZeroExtended(N0, DAG);
5547    bool isN1ZExt = isZeroExtended(N1, DAG);
5548    if (isN0ZExt && isN1ZExt)
5549      NewOpc = ARMISD::VMULLu;
5550    else if (isN1SExt || isN1ZExt) {
5551      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5552      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5553      if (isN1SExt && isAddSubSExt(N0, DAG)) {
5554        NewOpc = ARMISD::VMULLs;
5555        isMLA = true;
5556      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
5557        NewOpc = ARMISD::VMULLu;
5558        isMLA = true;
5559      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
5560        std::swap(N0, N1);
5561        NewOpc = ARMISD::VMULLu;
5562        isMLA = true;
5563      }
5564    }
5565
5566    if (!NewOpc) {
5567      if (VT == MVT::v2i64)
5568        // Fall through to expand this.  It is not legal.
5569        return SDValue();
5570      else
5571        // Other vector multiplications are legal.
5572        return Op;
5573    }
5574  }
5575
5576  // Legalize to a VMULL instruction.
5577  SDLoc DL(Op);
5578  SDValue Op0;
5579  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
5580  if (!isMLA) {
5581    Op0 = SkipExtensionForVMULL(N0, DAG);
5582    assert(Op0.getValueType().is64BitVector() &&
5583           Op1.getValueType().is64BitVector() &&
5584           "unexpected types for extended operands to VMULL");
5585    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
5586  }
5587
5588  // Optimize (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
5589  // isel lowering to take advantage of no-stall back-to-back vmul + vmla.
5590  //   vmull q0, d4, d6
5591  //   vmlal q0, d5, d6
5592  // is faster than
5593  //   vaddl q0, d4, d5
5594  //   vmovl q1, d6
5595  //   vmul  q0, q0, q1
5596  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
5597  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
5598  EVT Op1VT = Op1.getValueType();
5599  return DAG.getNode(N0->getOpcode(), DL, VT,
5600                     DAG.getNode(NewOpc, DL, VT,
5601                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5602                     DAG.getNode(NewOpc, DL, VT,
5603                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
5604}
5605
5606static SDValue
5607LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
5608  // Convert to float
5609  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
5610  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
5611  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
5612  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
5613  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
5614  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
5615  // Get reciprocal estimate.
5616  // float4 recip = vrecpeq_f32(yf);
5617  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5618                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
5619  // Because char has a smaller range than uchar, we can actually get away
5620  // without any newton steps.  This requires that we use a weird bias
5621  // of 0xb000, however (again, this has been exhaustively tested).
5622  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
5623  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
5624  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
5625  Y = DAG.getConstant(0xb000, MVT::i32);
5626  Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
5627  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
5628  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
5629  // Convert back to short.
5630  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
5631  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
5632  return X;
5633}
5634
5635static SDValue
5636LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
5637  SDValue N2;
5638  // Convert to float.
5639  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
5640  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
5641  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
5642  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
5643  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
5644  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
5645
5646  // Use reciprocal estimate and one refinement step.
5647  // float4 recip = vrecpeq_f32(yf);
5648  // recip *= vrecpsq_f32(yf, recip);
5649  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5650                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
5651  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5652                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
5653                   N1, N2);
5654  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
5655  // Because short has a smaller range than ushort, we can actually get away
5656  // with only a single newton step.  This requires that we use a weird bias
5657  // of 0x89, however (again, this has been exhaustively tested).
5658  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
5659  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
5660  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
5661  N1 = DAG.getConstant(0x89, MVT::i32);
5662  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
5663  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
5664  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
5665  // Convert back to integer and return.
5666  // return vmovn_s32(vcvt_s32_f32(result));
5667  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
5668  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
5669  return N0;
5670}
5671
5672static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
5673  EVT VT = Op.getValueType();
5674  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
5675         "unexpected type for custom-lowering ISD::SDIV");
5676
5677  SDLoc dl(Op);
5678  SDValue N0 = Op.getOperand(0);
5679  SDValue N1 = Op.getOperand(1);
5680  SDValue N2, N3;
5681
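      // For v8i8, widen both operands to v8i16, split them into low and high
      // v4i16 halves, divide each half, then concatenate and truncate the
      // result back to v8i8.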
5682  if (VT == MVT::v8i8) {
5683    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
5684    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
5685
5686    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5687                     DAG.getIntPtrConstant(4));
5688    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5689                     DAG.getIntPtrConstant(4));
5690    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5691                     DAG.getIntPtrConstant(0));
5692    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5693                     DAG.getIntPtrConstant(0));
5694
5695    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
5696    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
5697
5698    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
5699    N0 = LowerCONCAT_VECTORS(N0, DAG);
5700
5701    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
5702    return N0;
5703  }
5704  return LowerSDIV_v4i16(N0, N1, dl, DAG);
5705}
5706
5707static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
5708  EVT VT = Op.getValueType();
5709  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
5710         "unexpected type for custom-lowering ISD::UDIV");
5711
5712  SDLoc dl(Op);
5713  SDValue N0 = Op.getOperand(0);
5714  SDValue N1 = Op.getOperand(1);
5715  SDValue N2, N3;
5716
5717  if (VT == MVT::v8i8) {
5718    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
5719    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
5720
5721    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5722                     DAG.getIntPtrConstant(4));
5723    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5724                     DAG.getIntPtrConstant(4));
5725    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5726                     DAG.getIntPtrConstant(0));
5727    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5728                     DAG.getIntPtrConstant(0));
5729
5730    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
5731    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
5732
5733    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
5734    N0 = LowerCONCAT_VECTORS(N0, DAG);
5735
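        // Narrow the concatenated v8i16 quotients back to v8i8 with a
        // saturating signed-to-unsigned narrow (VQMOVUN).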
5736    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
5737                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
5738                     N0);
5739    return N0;
5740  }
5741
5742  // v4i16 udiv ... Convert to float.
5743  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
5744  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
5745  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
5746  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
5747  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
5748  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
5749
5750  // Use reciprocal estimate and two refinement steps.
5751  // float4 recip = vrecpeq_f32(yf);
5752  // recip *= vrecpsq_f32(yf, recip);
5753  // recip *= vrecpsq_f32(yf, recip);
5754  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5755                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1);
5756  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5757                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
5758                   BN1, N2);
5759  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
5760  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5761                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
5762                   BN1, N2);
5763  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
5764  // Simply multiplying by the reciprocal estimate can leave us a few ulps
5765  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
5766  // and that it will never cause us to return an answer too large).
5767  // float4 result = as_float4(as_int4(xf*recip) + 2);
5768  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
5769  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
5770  N1 = DAG.getConstant(2, MVT::i32);
5771  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
5772  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
5773  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
5774  // Convert back to integer and return.
5775  // return vmovn_u32(vcvt_s32_f32(result));
5776  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
5777  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
5778  return N0;
5779}
5780
5781static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
5782  EVT VT = Op.getNode()->getValueType(0);
5783  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
5784
5785  unsigned Opc;
5786  bool ExtraOp = false;
5787  switch (Op.getOpcode()) {
5788  default: llvm_unreachable("Invalid code");
5789  case ISD::ADDC: Opc = ARMISD::ADDC; break;
5790  case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
5791  case ISD::SUBC: Opc = ARMISD::SUBC; break;
5792  case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
5793  }
5794
5795  if (!ExtraOp)
5796    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
5797                       Op.getOperand(1));
5798  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
5799                     Op.getOperand(1), Op.getOperand(2));
5800}
5801
5802static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
5803  // Monotonic load/store is legal for all targets
5804  if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
5805    return Op;
5806
5807  // Acquire/Release load/store is not legal for targets without a
5808  // dmb or equivalent available.
5809  return SDValue();
5810}
5811
5812static void
5813ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
5814                    SelectionDAG &DAG, unsigned NewOp) {
5815  SDLoc dl(Node);
5816  assert(Node->getValueType(0) == MVT::i64 &&
5817         "Only know how to expand i64 atomics");
5818
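      // Split each i64 value operand into its low and high i32 halves, emit a
      // target memory-intrinsic node that produces two i32 results plus a
      // chain, and rebuild the i64 result with BUILD_PAIR below.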
5819  SmallVector<SDValue, 6> Ops;
5820  Ops.push_back(Node->getOperand(0)); // Chain
5821  Ops.push_back(Node->getOperand(1)); // Ptr
5822  // Low part of Val1
5823  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5824                            Node->getOperand(2), DAG.getIntPtrConstant(0)));
5825  // High part of Val1
5826  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5827                            Node->getOperand(2), DAG.getIntPtrConstant(1)));
5828  if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) {
5829    // Low part of Val2
5830    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5831                              Node->getOperand(3), DAG.getIntPtrConstant(0)));
5832    // High part of Val2
5833    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5834                              Node->getOperand(3), DAG.getIntPtrConstant(1)));
5835  }
5836  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
5837  SDValue Result =
5838    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64,
5839                            cast<MemSDNode>(Node)->getMemOperand());
5840  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
5841  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
5842  Results.push_back(Result.getValue(2));
5843}
5844
5845static void ReplaceREADCYCLECOUNTER(SDNode *N,
5846                                    SmallVectorImpl<SDValue> &Results,
5847                                    SelectionDAG &DAG,
5848                                    const ARMSubtarget *Subtarget) {
5849  SDLoc DL(N);
5850  SDValue Cycles32, OutChain;
5851
5852  if (Subtarget->hasPerfMon()) {
5853    // Under Power Management extensions, the cycle-count is:
5854    //    mrc p15, #0, <Rt>, c9, c13, #0
5855    SDValue Ops[] = { N->getOperand(0), // Chain
5856                      DAG.getConstant(Intrinsic::arm_mrc, MVT::i32),
5857                      DAG.getConstant(15, MVT::i32),
5858                      DAG.getConstant(0, MVT::i32),
5859                      DAG.getConstant(9, MVT::i32),
5860                      DAG.getConstant(13, MVT::i32),
5861                      DAG.getConstant(0, MVT::i32)
5862    };
5863
5864    Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
5865                           DAG.getVTList(MVT::i32, MVT::Other), &Ops[0],
5866                           array_lengthof(Ops));
5867    OutChain = Cycles32.getValue(1);
5868  } else {
5869    // Intrinsic is defined to return 0 on unsupported platforms. Technically
5870    // there are older ARM CPUs that have implementation-specific ways of
5871    // obtaining this information (FIXME!).
5872    Cycles32 = DAG.getConstant(0, MVT::i32);
5873    OutChain = DAG.getEntryNode();
5874  }
5875
5876
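      // Only the 32-bit cycle counter (PMCCNTR) is read, so the i64 result is
      // built with a zero high word.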
5877  SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
5878                                 Cycles32, DAG.getConstant(0, MVT::i32));
5879  Results.push_back(Cycles64);
5880  Results.push_back(OutChain);
5881}
5882
5883SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5884  switch (Op.getOpcode()) {
5885  default: llvm_unreachable("Don't know how to custom lower this!");
5886  case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
5887  case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
5888  case ISD::GlobalAddress:
5889    return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
5890      LowerGlobalAddressELF(Op, DAG);
5891  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
5892  case ISD::SELECT:        return LowerSELECT(Op, DAG);
5893  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
5894  case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
5895  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
5896  case ISD::VASTART:       return LowerVASTART(Op, DAG);
5897  case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
5898  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
5899  case ISD::SINT_TO_FP:
5900  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
5901  case ISD::FP_TO_SINT:
5902  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
5903  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
5904  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
5905  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
5906  case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
5907  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
5908  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
5909  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
5910                                                               Subtarget);
5911  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
5912  case ISD::SHL:
5913  case ISD::SRL:
5914  case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
5915  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
5916  case ISD::SRL_PARTS:
5917  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
5918  case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
5919  case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
5920  case ISD::SETCC:         return LowerVSETCC(Op, DAG);
5921  case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
5922  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
5923  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
5924  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
5925  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
5926  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
5927  case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
5928  case ISD::MUL:           return LowerMUL(Op, DAG);
5929  case ISD::SDIV:          return LowerSDIV(Op, DAG);
5930  case ISD::UDIV:          return LowerUDIV(Op, DAG);
5931  case ISD::ADDC:
5932  case ISD::ADDE:
5933  case ISD::SUBC:
5934  case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
5935  case ISD::ATOMIC_LOAD:
5936  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
5937  case ISD::SDIVREM:
5938  case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
5939  }
5940}
5941
5942/// ReplaceNodeResults - Replace the results of node with an illegal result
5943/// type with new values built out of custom code.
5944void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
5945                                           SmallVectorImpl<SDValue>&Results,
5946                                           SelectionDAG &DAG) const {
5947  SDValue Res;
5948  switch (N->getOpcode()) {
5949  default:
5950    llvm_unreachable("Don't know how to custom expand this!");
5951  case ISD::BITCAST:
5952    Res = ExpandBITCAST(N, DAG);
5953    break;
5954  case ISD::SIGN_EXTEND:
5955  case ISD::ZERO_EXTEND:
5956    Res = ExpandVectorExtension(N, DAG);
5957    break;
5958  case ISD::SRL:
5959  case ISD::SRA:
5960    Res = Expand64BitShift(N, DAG, Subtarget);
5961    break;
5962  case ISD::READCYCLECOUNTER:
5963    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
5964    return;
5965  case ISD::ATOMIC_LOAD_ADD:
5966    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
5967    return;
5968  case ISD::ATOMIC_LOAD_AND:
5969    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
5970    return;
5971  case ISD::ATOMIC_LOAD_NAND:
5972    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
5973    return;
5974  case ISD::ATOMIC_LOAD_OR:
5975    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
5976    return;
5977  case ISD::ATOMIC_LOAD_SUB:
5978    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
5979    return;
5980  case ISD::ATOMIC_LOAD_XOR:
5981    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
5982    return;
5983  case ISD::ATOMIC_SWAP:
5984    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
5985    return;
5986  case ISD::ATOMIC_CMP_SWAP:
5987    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
5988    return;
5989  case ISD::ATOMIC_LOAD_MIN:
5990    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
5991    return;
5992  case ISD::ATOMIC_LOAD_UMIN:
5993    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
5994    return;
5995  case ISD::ATOMIC_LOAD_MAX:
5996    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
5997    return;
5998  case ISD::ATOMIC_LOAD_UMAX:
5999    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
6000    return;
6001  }
6002  if (Res.getNode())
6003    Results.push_back(Res);
6004}
6005
6006//===----------------------------------------------------------------------===//
6007//                           ARM Scheduler Hooks
6008//===----------------------------------------------------------------------===//
6009
6010MachineBasicBlock *
6011ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
6012                                     MachineBasicBlock *BB,
6013                                     unsigned Size) const {
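      // Expand the pseudo compare-and-swap into an LDREX/STREX loop: load
      // exclusive, exit if the loaded value differs from oldval, otherwise
      // store newval exclusively and retry if the store-exclusive fails.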
6014  unsigned dest    = MI->getOperand(0).getReg();
6015  unsigned ptr     = MI->getOperand(1).getReg();
6016  unsigned oldval  = MI->getOperand(2).getReg();
6017  unsigned newval  = MI->getOperand(3).getReg();
6018  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6019  DebugLoc dl = MI->getDebugLoc();
6020  bool isThumb2 = Subtarget->isThumb2();
6021
6022  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6023  unsigned scratch = MRI.createVirtualRegister(isThumb2 ?
6024    (const TargetRegisterClass*)&ARM::rGPRRegClass :
6025    (const TargetRegisterClass*)&ARM::GPRRegClass);
6026
6027  if (isThumb2) {
6028    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
6029    MRI.constrainRegClass(oldval, &ARM::rGPRRegClass);
6030    MRI.constrainRegClass(newval, &ARM::rGPRRegClass);
6031  }
6032
6033  unsigned ldrOpc, strOpc;
6034  switch (Size) {
6035  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
6036  case 1:
6037    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
6038    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
6039    break;
6040  case 2:
6041    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
6042    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
6043    break;
6044  case 4:
6045    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
6046    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
6047    break;
6048  }
6049
6050  MachineFunction *MF = BB->getParent();
6051  const BasicBlock *LLVM_BB = BB->getBasicBlock();
6052  MachineFunction::iterator It = BB;
6053  ++It; // insert the new blocks after the current block
6054
6055  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
6056  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
6057  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6058  MF->insert(It, loop1MBB);
6059  MF->insert(It, loop2MBB);
6060  MF->insert(It, exitMBB);
6061
6062  // Transfer the remainder of BB and its successor edges to exitMBB.
6063  exitMBB->splice(exitMBB->begin(), BB,
6064                  llvm::next(MachineBasicBlock::iterator(MI)),
6065                  BB->end());
6066  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
6067
6068  //  thisMBB:
6069  //   ...
6070  //   fallthrough --> loop1MBB
6071  BB->addSuccessor(loop1MBB);
6072
6073  // loop1MBB:
6074  //   ldrex dest, [ptr]
6075  //   cmp dest, oldval
6076  //   bne exitMBB
6077  BB = loop1MBB;
6078  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
6079  if (ldrOpc == ARM::t2LDREX)
6080    MIB.addImm(0);
6081  AddDefaultPred(MIB);
6082  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
6083                 .addReg(dest).addReg(oldval));
6084  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6085    .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6086  BB->addSuccessor(loop2MBB);
6087  BB->addSuccessor(exitMBB);
6088
6089  // loop2MBB:
6090  //   strex scratch, newval, [ptr]
6091  //   cmp scratch, #0
6092  //   bne loop1MBB
6093  BB = loop2MBB;
6094  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
6095  if (strOpc == ARM::t2STREX)
6096    MIB.addImm(0);
6097  AddDefaultPred(MIB);
6098  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6099                 .addReg(scratch).addImm(0));
6100  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6101    .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6102  BB->addSuccessor(loop1MBB);
6103  BB->addSuccessor(exitMBB);
6104
6105  //  exitMBB:
6106  //   ...
6107  BB = exitMBB;
6108
6109  MI->eraseFromParent();   // The instruction is gone now.
6110
6111  return BB;
6112}
6113
6114MachineBasicBlock *
6115ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
6116                                    unsigned Size, unsigned BinOpcode) const {
6117  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
6118  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6119
6120  const BasicBlock *LLVM_BB = BB->getBasicBlock();
6121  MachineFunction *MF = BB->getParent();
6122  MachineFunction::iterator It = BB;
6123  ++It;
6124
6125  unsigned dest = MI->getOperand(0).getReg();
6126  unsigned ptr = MI->getOperand(1).getReg();
6127  unsigned incr = MI->getOperand(2).getReg();
6128  DebugLoc dl = MI->getDebugLoc();
6129  bool isThumb2 = Subtarget->isThumb2();
6130
6131  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6132  if (isThumb2) {
6133    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
6134    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
6135  }
6136
6137  unsigned ldrOpc, strOpc;
6138  switch (Size) {
6139  default: llvm_unreachable("unsupported size for AtomicBinary!");
6140  case 1:
6141    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
6142    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
6143    break;
6144  case 2:
6145    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
6146    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
6147    break;
6148  case 4:
6149    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
6150    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
6151    break;
6152  }
6153
6154  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6155  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6156  MF->insert(It, loopMBB);
6157  MF->insert(It, exitMBB);
6158
6159  // Transfer the remainder of BB and its successor edges to exitMBB.
6160  exitMBB->splice(exitMBB->begin(), BB,
6161                  llvm::next(MachineBasicBlock::iterator(MI)),
6162                  BB->end());
6163  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
6164
6165  const TargetRegisterClass *TRC = isThumb2 ?
6166    (const TargetRegisterClass*)&ARM::rGPRRegClass :
6167    (const TargetRegisterClass*)&ARM::GPRRegClass;
6168  unsigned scratch = MRI.createVirtualRegister(TRC);
6169  unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
6170
6171  //  thisMBB:
6172  //   ...
6173  //   fallthrough --> loopMBB
6174  BB->addSuccessor(loopMBB);
6175
6176  //  loopMBB:
6177  //   ldrex dest, ptr
6178  //   <binop> scratch2, dest, incr
6179  //   strex scratch, scratch2, ptr
6180  //   cmp scratch, #0
6181  //   bne- loopMBB
6182  //   fallthrough --> exitMBB
6183  BB = loopMBB;
6184  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
6185  if (ldrOpc == ARM::t2LDREX)
6186    MIB.addImm(0);
6187  AddDefaultPred(MIB);
6188  if (BinOpcode) {
6189    // operand order needs to go the other way for NAND
6190    if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
6191      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
6192                     addReg(incr).addReg(dest)).addReg(0);
6193    else
6194      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
6195                     addReg(dest).addReg(incr)).addReg(0);
6196  }
6197
6198  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
6199  if (strOpc == ARM::t2STREX)
6200    MIB.addImm(0);
6201  AddDefaultPred(MIB);
6202  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6203                 .addReg(scratch).addImm(0));
6204  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6205    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6206
6207  BB->addSuccessor(loopMBB);
6208  BB->addSuccessor(exitMBB);
6209
6210  //  exitMBB:
6211  //   ...
6212  BB = exitMBB;
6213
6214  MI->eraseFromParent();   // The instruction is gone now.
6215
6216  return BB;
6217}
6218
6219MachineBasicBlock *
6220ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
6221                                          MachineBasicBlock *BB,
6222                                          unsigned Size,
6223                                          bool signExtend,
6224                                          ARMCC::CondCodes Cond) const {
6225  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6226
6227  const BasicBlock *LLVM_BB = BB->getBasicBlock();
6228  MachineFunction *MF = BB->getParent();
6229  MachineFunction::iterator It = BB;
6230  ++It;
6231
6232  unsigned dest = MI->getOperand(0).getReg();
6233  unsigned ptr = MI->getOperand(1).getReg();
6234  unsigned incr = MI->getOperand(2).getReg();
6235  unsigned oldval = dest;
6236  DebugLoc dl = MI->getDebugLoc();
6237  bool isThumb2 = Subtarget->isThumb2();
6238
6239  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6240  if (isThumb2) {
6241    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
6242    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
6243  }
6244
6245  unsigned ldrOpc, strOpc, extendOpc;
6246  switch (Size) {
6247  default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!");
6248  case 1:
6249    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
6250    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
6251    extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
6252    break;
6253  case 2:
6254    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
6255    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
6256    extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
6257    break;
6258  case 4:
6259    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
6260    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
6261    extendOpc = 0;
6262    break;
6263  }
6264
6265  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6266  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6267  MF->insert(It, loopMBB);
6268  MF->insert(It, exitMBB);
6269
6270  // Transfer the remainder of BB and its successor edges to exitMBB.
6271  exitMBB->splice(exitMBB->begin(), BB,
6272                  llvm::next(MachineBasicBlock::iterator(MI)),
6273                  BB->end());
6274  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
6275
6276  const TargetRegisterClass *TRC = isThumb2 ?
6277    (const TargetRegisterClass*)&ARM::rGPRRegClass :
6278    (const TargetRegisterClass*)&ARM::GPRRegClass;
6279  unsigned scratch = MRI.createVirtualRegister(TRC);
6280  unsigned scratch2 = MRI.createVirtualRegister(TRC);
6281
6282  //  thisMBB:
6283  //   ...
6284  //   fallthrough --> loopMBB
6285  BB->addSuccessor(loopMBB);
6286
6287  //  loopMBB:
6288  //   ldrex dest, ptr
6289  //   (sign extend dest, if required)
6290  //   cmp dest, incr
6291  //   cmov.cond scratch2, incr, dest
6292  //   strex scratch, scratch2, ptr
6293  //   cmp scratch, #0
6294  //   bne- loopMBB
6295  //   fallthrough --> exitMBB
6296  BB = loopMBB;
6297  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
6298  if (ldrOpc == ARM::t2LDREX)
6299    MIB.addImm(0);
6300  AddDefaultPred(MIB);
6301
6302  // Sign extend the value, if necessary.
6303  if (signExtend && extendOpc) {
6304    oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
6305    AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
6306                     .addReg(dest)
6307                     .addImm(0));
6308  }
6309
6310  // Build compare and cmov instructions.
6311  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
6312                 .addReg(oldval).addReg(incr));
6313  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
6314         .addReg(incr).addReg(oldval).addImm(Cond).addReg(ARM::CPSR);
6315
6316  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
6317  if (strOpc == ARM::t2STREX)
6318    MIB.addImm(0);
6319  AddDefaultPred(MIB);
6320  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6321                 .addReg(scratch).addImm(0));
6322  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6323    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6324
6325  BB->addSuccessor(loopMBB);
6326  BB->addSuccessor(exitMBB);
6327
6328  //  exitMBB:
6329  //   ...
6330  BB = exitMBB;
6331
6332  MI->eraseFromParent();   // The instruction is gone now.
6333
6334  return BB;
6335}
6336
6337MachineBasicBlock *
6338ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
6339                                      unsigned Op1, unsigned Op2,
6340                                      bool NeedsCarry, bool IsCmpxchg,
6341                                      bool IsMinMax, ARMCC::CondCodes CC) const {
6342  // This also handles ATOMIC_SWAP, indicated by Op1==0.
6343  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6344
6345  const BasicBlock *LLVM_BB = BB->getBasicBlock();
6346  MachineFunction *MF = BB->getParent();
6347  MachineFunction::iterator It = BB;
6348  ++It;
6349
6350  unsigned destlo = MI->getOperand(0).getReg();
6351  unsigned desthi = MI->getOperand(1).getReg();
6352  unsigned ptr = MI->getOperand(2).getReg();
6353  unsigned vallo = MI->getOperand(3).getReg();
6354  unsigned valhi = MI->getOperand(4).getReg();
6355  DebugLoc dl = MI->getDebugLoc();
6356  bool isThumb2 = Subtarget->isThumb2();
6357
6358  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
6359  if (isThumb2) {
6360    MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
6361    MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
6362    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
6363  }
6364
6365  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6366  MachineBasicBlock *contBB = 0, *cont2BB = 0;
6367  if (IsCmpxchg || IsMinMax)
6368    contBB = MF->CreateMachineBasicBlock(LLVM_BB);
6369  if (IsCmpxchg)
6370    cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
6371  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6372
6373  MF->insert(It, loopMBB);
6374  if (IsCmpxchg || IsMinMax) MF->insert(It, contBB);
6375  if (IsCmpxchg) MF->insert(It, cont2BB);
6376  MF->insert(It, exitMBB);
6377
6378  // Transfer the remainder of BB and its successor edges to exitMBB.
6379  exitMBB->splice(exitMBB->begin(), BB,
6380                  llvm::next(MachineBasicBlock::iterator(MI)),
6381                  BB->end());
6382  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
6383
6384  const TargetRegisterClass *TRC = isThumb2 ?
6385    (const TargetRegisterClass*)&ARM::tGPRRegClass :
6386    (const TargetRegisterClass*)&ARM::GPRRegClass;
6387  unsigned storesuccess = MRI.createVirtualRegister(TRC);
6388
6389  //  thisMBB:
6390  //   ...
6391  //   fallthrough --> loopMBB
6392  BB->addSuccessor(loopMBB);
6393
6394  //  loopMBB:
6395  //   ldrexd r2, r3, ptr
6396  //   <binopa> r0, r2, incr
6397  //   <binopb> r1, r3, incr
6398  //   strexd storesuccess, r0, r1, ptr
6399  //   cmp storesuccess, #0
6400  //   bne- loopMBB
6401  //   fallthrough --> exitMBB
6402  BB = loopMBB;
6403
6404  // Load
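      // ARM-mode LDREXD/STREXD require an even/odd register pair, modeled here
      // with the GPRPair register class and gsub_0/gsub_1 subregister copies;
      // Thumb-2 t2LDREXD/t2STREXD take two independent registers.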
6405  if (isThumb2) {
6406    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD))
6407                   .addReg(destlo, RegState::Define)
6408                   .addReg(desthi, RegState::Define)
6409                   .addReg(ptr));
6410  } else {
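    // In ARM mode LDREXD writes an even/odd register pair, so load into a
    // GPRPair virtual register and split it with subregister copies.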
6411    unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
6412    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD))
6413                   .addReg(GPRPair0, RegState::Define).addReg(ptr));
6414    // Copy the pair halves into destlo/desthi.  (Normally coalesced away.)
6415    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
6416      .addReg(GPRPair0, 0, ARM::gsub_0);
6417    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
6418      .addReg(GPRPair0, 0, ARM::gsub_1);
6419  }
6420
6421  unsigned StoreLo, StoreHi;
6422  if (IsCmpxchg) {
6423    // Add an early exit: for cmpxchg, if either half of the loaded value
    // differs from the expected value (vallo/valhi), branch to exitMBB
    // without storing.
6424    for (unsigned i = 0; i < 2; i++) {
6425      AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr :
6426                                                         ARM::CMPrr))
6427                     .addReg(i == 0 ? destlo : desthi)
6428                     .addReg(i == 0 ? vallo : valhi));
6429      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6430        .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6431      BB->addSuccessor(exitMBB);
6432      BB->addSuccessor(i == 0 ? contBB : cont2BB);
6433      BB = (i == 0 ? contBB : cont2BB);
6434    }
6435
6436    // For cmpxchg, store the new value passed in as operands 5 and 6.
6437    StoreLo = MI->getOperand(5).getReg();
6438    StoreHi = MI->getOperand(6).getReg();
6439  } else if (Op1) {
6440    // Perform binary operation
6441    unsigned tmpRegLo = MRI.createVirtualRegister(TRC);
6442    AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), tmpRegLo)
6443                   .addReg(destlo).addReg(vallo))
6444        .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
6445    unsigned tmpRegHi = MRI.createVirtualRegister(TRC);
6446    AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi)
6447                   .addReg(desthi).addReg(valhi))
6448        .addReg(IsMinMax ? ARM::CPSR : 0, getDefRegState(IsMinMax));
6449
6450    StoreLo = tmpRegLo;
6451    StoreHi = tmpRegHi;
6452  } else {
6453    // For swap (Op1 == 0), store the incoming value unchanged.
6454    StoreLo = vallo;
6455    StoreHi = valhi;
6456  }
6457  if (IsMinMax) {
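    // The flags from the SUBS/SBCS pair above reflect dest - val.  If CC holds
    // (e.g. LT for signed min, GE for signed max), the value already in memory
    // wins, so branch to exitMBB without storing; otherwise fall through and
    // store val.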
6458    // Compare and branch to exit block.
6459    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6460      .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR);
6461    BB->addSuccessor(exitMBB);
6462    BB->addSuccessor(contBB);
6463    BB = contBB;
6464    StoreLo = vallo;
6465    StoreHi = valhi;
6466  }
6467
6468  // Store
6469  if (isThumb2) {
6470    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess)
6471                   .addReg(StoreLo).addReg(StoreHi).addReg(ptr));
6472  } else {
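    // ARM-mode STREXD likewise needs an even/odd register pair, so build a
    // GPRPair from StoreLo/StoreHi with INSERT_SUBREG before storing.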
6473    // Marshal a pair...
6474    unsigned StorePair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
6475    unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
6476    unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
6477    BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
6478    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
6479      .addReg(UndefPair)
6480      .addReg(StoreLo)
6481      .addImm(ARM::gsub_0);
6482    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), StorePair)
6483      .addReg(r1)
6484      .addReg(StoreHi)
6485      .addImm(ARM::gsub_1);
6486
6487    // ...and store it
6488    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess)
6489                   .addReg(StorePair).addReg(ptr));
6490  }
6491  // Cmp+jump
6492  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6493                 .addReg(storesuccess).addImm(0));
6494  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6495    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6496
6497  BB->addSuccessor(loopMBB);
6498  BB->addSuccessor(exitMBB);
6499
6500  //  exitMBB:
6501  //   ...
6502  BB = exitMBB;
6503
6504  MI->eraseFromParent();   // The instruction is gone now.
6505
6506  return BB;
6507}
6508
6509/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
6510/// registers the function context.
6511void ARMTargetLowering::
6512SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
6513                       MachineBasicBlock *DispatchBB, int FI) const {
6514  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6515  DebugLoc dl = MI->getDebugLoc();
6516  MachineFunction *MF = MBB->getParent();
6517  MachineRegisterInfo *MRI = &MF->getRegInfo();
6518  MachineConstantPool *MCP = MF->getConstantPool();
6519  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
6520  const Function *F = MF->getFunction();
6521
6522  bool isThumb = Subtarget->isThumb();
6523  bool isThumb2 = Subtarget->isThumb2();
6524
6525  unsigned PCLabelId = AFI->createPICLabelUId();
6526  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
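  // The pc-relative adjustment matches how the hardware reads the PC: the
  // instruction address plus 8 in ARM mode, plus 4 in Thumb mode.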
6527  ARMConstantPoolValue *CPV =
6528    ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
6529  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
6530
6531  const TargetRegisterClass *TRC = isThumb ?
6532    (const TargetRegisterClass*)&ARM::tGPRRegClass :
6533    (const TargetRegisterClass*)&ARM::GPRRegClass;
6534
6535  // Grab constant pool and fixed stack memory operands.
6536  MachineMemOperand *CPMMO =
6537    MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(),
6538                             MachineMemOperand::MOLoad, 4, 4);
6539
6540  MachineMemOperand *FIMMOSt =
6541    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
6542                             MachineMemOperand::MOStore, 4, 4);
6543
6544  // Load the address of the dispatch MBB into the jump buffer.
6545  if (isThumb2) {
6546    // Incoming value: jbuf
6547    //   ldr.n  r5, LCPI1_1
6548    //   orr    r5, r5, #1
6549    //   add    r5, pc
6550    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
6551    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6552    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
6553                   .addConstantPoolIndex(CPI)
6554                   .addMemOperand(CPMMO));
6555    // Set the low bit because of thumb mode.
6556    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6557    AddDefaultCC(
6558      AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
6559                     .addReg(NewVReg1, RegState::Kill)
6560                     .addImm(0x01)));
6561    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6562    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
6563      .addReg(NewVReg2, RegState::Kill)
6564      .addImm(PCLabelId);
6565    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
6566                   .addReg(NewVReg3, RegState::Kill)
6567                   .addFrameIndex(FI)
6568                   .addImm(36)  // &jbuf[1] :: pc
6569                   .addMemOperand(FIMMOSt));
6570  } else if (isThumb) {
6571    // Incoming value: jbuf
6572    //   ldr.n  r1, LCPI1_4
6573    //   add    r1, pc
6574    //   mov    r2, #1
6575    //   orrs   r1, r2
6576    //   add    r2, $jbuf, #+4 ; &jbuf[1]
6577    //   str    r1, [r2]
6578    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6579    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
6580                   .addConstantPoolIndex(CPI)
6581                   .addMemOperand(CPMMO));
6582    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6583    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
6584      .addReg(NewVReg1, RegState::Kill)
6585      .addImm(PCLabelId);
6586    // Set the low bit because of thumb mode.
6587    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6588    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
6589                   .addReg(ARM::CPSR, RegState::Define)
6590                   .addImm(1));
6591    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6592    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
6593                   .addReg(ARM::CPSR, RegState::Define)
6594                   .addReg(NewVReg2, RegState::Kill)
6595                   .addReg(NewVReg3, RegState::Kill));
6596    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
6597    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
6598                   .addFrameIndex(FI)
6599                   .addImm(36)); // &jbuf[1] :: pc
6600    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
6601                   .addReg(NewVReg4, RegState::Kill)
6602                   .addReg(NewVReg5, RegState::Kill)
6603                   .addImm(0)
6604                   .addMemOperand(FIMMOSt));
6605  } else {
6606    // Incoming value: jbuf
6607    //   ldr  r1, LCPI1_1
6608    //   add  r1, pc, r1
6609    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
6610    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6611    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
6612                   .addConstantPoolIndex(CPI)
6613                   .addImm(0)
6614                   .addMemOperand(CPMMO));
6615    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6616    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
6617                   .addReg(NewVReg1, RegState::Kill)
6618                   .addImm(PCLabelId));
6619    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
6620                   .addReg(NewVReg2, RegState::Kill)
6621                   .addFrameIndex(FI)
6622                   .addImm(36)  // &jbuf[1] :: pc
6623                   .addMemOperand(FIMMOSt));
6624  }
6625}
6626
6627MachineBasicBlock *ARMTargetLowering::
6628EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
6629  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6630  DebugLoc dl = MI->getDebugLoc();
6631  MachineFunction *MF = MBB->getParent();
6632  MachineRegisterInfo *MRI = &MF->getRegInfo();
6633  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
6634  MachineFrameInfo *MFI = MF->getFrameInfo();
6635  int FI = MFI->getFunctionContextIndex();
6636
6637  const TargetRegisterClass *TRC = Subtarget->isThumb() ?
6638    (const TargetRegisterClass*)&ARM::tGPRRegClass :
6639    (const TargetRegisterClass*)&ARM::GPRnopcRegClass;
6640
6641  // Get a mapping of the call site numbers to all of the landing pads they're
6642  // associated with.
6643  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
6644  unsigned MaxCSNum = 0;
6645  MachineModuleInfo &MMI = MF->getMMI();
6646  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
6647       ++BB) {
6648    if (!BB->isLandingPad()) continue;
6649
6650    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
6651    // pad.
6652    for (MachineBasicBlock::iterator
6653           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
6654      if (!II->isEHLabel()) continue;
6655
6656      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
6657      if (!MMI.hasCallSiteLandingPad(Sym)) continue;
6658
6659      SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
6660      for (SmallVectorImpl<unsigned>::iterator
6661             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
6662           CSI != CSE; ++CSI) {
6663        CallSiteNumToLPad[*CSI].push_back(BB);
6664        MaxCSNum = std::max(MaxCSNum, *CSI);
6665      }
6666      break;
6667    }
6668  }
6669
6670  // Get an ordered list of the machine basic blocks for the jump table.
6671  std::vector<MachineBasicBlock*> LPadList;
6672  SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
6673  LPadList.reserve(CallSiteNumToLPad.size());
6674  for (unsigned I = 1; I <= MaxCSNum; ++I) {
6675    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
6676    for (SmallVectorImpl<MachineBasicBlock*>::iterator
6677           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
6678      LPadList.push_back(*II);
6679      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
6680    }
6681  }
6682
6683  assert(!LPadList.empty() &&
6684         "No landing pad destinations for the dispatch jump table!");
6685
6686  // Create the jump table and associated information.
6687  MachineJumpTableInfo *JTI =
6688    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
6689  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
6690  unsigned UId = AFI->createJumpTableUId();
6691  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
6692
6693  // Create the MBBs for the dispatch code.
6694
6695  // Shove the dispatch's address into the return slot in the function context.
6696  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
6697  DispatchBB->setIsLandingPad();
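  // DispatchBB becomes the only landing pad in the function; the original
  // landing pads are reached from it via the jump table and are unmarked as
  // landing pads at the end of this function.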
6698
6699  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6700  unsigned trap_opcode;
6701  if (Subtarget->isThumb())
6702    trap_opcode = ARM::tTRAP;
6703  else
6704    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
6705
6706  BuildMI(TrapBB, dl, TII->get(trap_opcode));
6707  DispatchBB->addSuccessor(TrapBB);
6708
6709  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
6710  DispatchBB->addSuccessor(DispContBB);
6711
6712  // Insert the MBBs into the function.
6713  MF->insert(MF->end(), DispatchBB);
6714  MF->insert(MF->end(), DispContBB);
6715  MF->insert(MF->end(), TrapBB);
6716
6717  // Insert code into the entry block that creates and registers the function
6718  // context.
6719  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
6720
6721  MachineMemOperand *FIMMOLd =
6722    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
6723                             MachineMemOperand::MOLoad |
6724                             MachineMemOperand::MOVolatile, 4, 4);
6725
6726  MachineInstrBuilder MIB;
6727  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
6728
6729  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
6730  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
6731
6732  // Add a register mask with no preserved registers.  This results in all
6733  // registers being marked as clobbered.
6734  MIB.addRegMask(RI.getNoPreservedMask());
6735
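  // The dispatch code reloads the call site index recorded in the function
  // context (at offset 4 from the context frame index), compares it against
  // the number of landing pads (out-of-range values branch to the trap
  // block), and then uses it to index the landing pad jump table.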
6736  unsigned NumLPads = LPadList.size();
6737  if (Subtarget->isThumb2()) {
6738    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6739    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
6740                   .addFrameIndex(FI)
6741                   .addImm(4)
6742                   .addMemOperand(FIMMOLd));
6743
6744    if (NumLPads < 256) {
6745      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
6746                     .addReg(NewVReg1)
6747                     .addImm(LPadList.size()));
6748    } else {
6749      unsigned VReg1 = MRI->createVirtualRegister(TRC);
6750      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
6751                     .addImm(NumLPads & 0xFFFF));
6752
6753      unsigned VReg2 = VReg1;
6754      if ((NumLPads & 0xFFFF0000) != 0) {
6755        VReg2 = MRI->createVirtualRegister(TRC);
6756        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
6757                       .addReg(VReg1)
6758                       .addImm(NumLPads >> 16));
6759      }
6760
6761      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
6762                     .addReg(NewVReg1)
6763                     .addReg(VReg2));
6764    }
6765
6766    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
6767      .addMBB(TrapBB)
6768      .addImm(ARMCC::HI)
6769      .addReg(ARM::CPSR);
6770
6771    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6772    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
6773                   .addJumpTableIndex(MJTI)
6774                   .addImm(UId));
6775
6776    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6777    AddDefaultCC(
6778      AddDefaultPred(
6779        BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
6780        .addReg(NewVReg3, RegState::Kill)
6781        .addReg(NewVReg1)
6782        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
6783
6784    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
6785      .addReg(NewVReg4, RegState::Kill)
6786      .addReg(NewVReg1)
6787      .addJumpTableIndex(MJTI)
6788      .addImm(UId);
6789  } else if (Subtarget->isThumb()) {
6790    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6791    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
6792                   .addFrameIndex(FI)
6793                   .addImm(1)
6794                   .addMemOperand(FIMMOLd));
6795
6796    if (NumLPads < 256) {
6797      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
6798                     .addReg(NewVReg1)
6799                     .addImm(NumLPads));
6800    } else {
6801      MachineConstantPool *ConstantPool = MF->getConstantPool();
6802      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
6803      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
6804
6805      // MachineConstantPool wants an explicit alignment.
6806      unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
6807      if (Align == 0)
6808        Align = getDataLayout()->getTypeAllocSize(C->getType());
6809      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
6810
6811      unsigned VReg1 = MRI->createVirtualRegister(TRC);
6812      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
6813                     .addReg(VReg1, RegState::Define)
6814                     .addConstantPoolIndex(Idx));
6815      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
6816                     .addReg(NewVReg1)
6817                     .addReg(VReg1));
6818    }
6819
6820    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
6821      .addMBB(TrapBB)
6822      .addImm(ARMCC::HI)
6823      .addReg(ARM::CPSR);
6824
6825    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6826    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
6827                   .addReg(ARM::CPSR, RegState::Define)
6828                   .addReg(NewVReg1)
6829                   .addImm(2));
6830
6831    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6832    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
6833                   .addJumpTableIndex(MJTI)
6834                   .addImm(UId));
6835
6836    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6837    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
6838                   .addReg(ARM::CPSR, RegState::Define)
6839                   .addReg(NewVReg2, RegState::Kill)
6840                   .addReg(NewVReg3));
6841
6842    MachineMemOperand *JTMMOLd =
6843      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
6844                               MachineMemOperand::MOLoad, 4, 4);
6845
6846    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
6847    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
6848                   .addReg(NewVReg4, RegState::Kill)
6849                   .addImm(0)
6850                   .addMemOperand(JTMMOLd));
6851
6852    unsigned NewVReg6 = NewVReg5;
6853    if (RelocM == Reloc::PIC_) {
6854      NewVReg6 = MRI->createVirtualRegister(TRC);
6855      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
6856                     .addReg(ARM::CPSR, RegState::Define)
6857                     .addReg(NewVReg5, RegState::Kill)
6858                     .addReg(NewVReg3));
6859    }
6860
6861    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
6862      .addReg(NewVReg6, RegState::Kill)
6863      .addJumpTableIndex(MJTI)
6864      .addImm(UId);
6865  } else {
6866    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6867    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
6868                   .addFrameIndex(FI)
6869                   .addImm(4)
6870                   .addMemOperand(FIMMOLd));
6871
6872    if (NumLPads < 256) {
6873      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
6874                     .addReg(NewVReg1)
6875                     .addImm(NumLPads));
6876    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
6877      unsigned VReg1 = MRI->createVirtualRegister(TRC);
6878      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
6879                     .addImm(NumLPads & 0xFFFF));
6880
6881      unsigned VReg2 = VReg1;
6882      if ((NumLPads & 0xFFFF0000) != 0) {
6883        VReg2 = MRI->createVirtualRegister(TRC);
6884        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
6885                       .addReg(VReg1)
6886                       .addImm(NumLPads >> 16));
6887      }
6888
6889      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
6890                     .addReg(NewVReg1)
6891                     .addReg(VReg2));
6892    } else {
6893      MachineConstantPool *ConstantPool = MF->getConstantPool();
6894      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
6895      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
6896
6897      // MachineConstantPool wants an explicit alignment.
6898      unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
6899      if (Align == 0)
6900        Align = getDataLayout()->getTypeAllocSize(C->getType());
6901      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
6902
6903      unsigned VReg1 = MRI->createVirtualRegister(TRC);
6904      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
6905                     .addReg(VReg1, RegState::Define)
6906                     .addConstantPoolIndex(Idx)
6907                     .addImm(0));
6908      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
6909                     .addReg(NewVReg1)
6910                     .addReg(VReg1, RegState::Kill));
6911    }
6912
6913    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
6914      .addMBB(TrapBB)
6915      .addImm(ARMCC::HI)
6916      .addReg(ARM::CPSR);
6917
6918    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6919    AddDefaultCC(
6920      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
6921                     .addReg(NewVReg1)
6922                     .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
6923    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6924    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
6925                   .addJumpTableIndex(MJTI)
6926                   .addImm(UId));
6927
6928    MachineMemOperand *JTMMOLd =
6929      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
6930                               MachineMemOperand::MOLoad, 4, 4);
6931    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
6932    AddDefaultPred(
6933      BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
6934      .addReg(NewVReg3, RegState::Kill)
6935      .addReg(NewVReg4)
6936      .addImm(0)
6937      .addMemOperand(JTMMOLd));
6938
6939    if (RelocM == Reloc::PIC_) {
6940      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
6941        .addReg(NewVReg5, RegState::Kill)
6942        .addReg(NewVReg4)
6943        .addJumpTableIndex(MJTI)
6944        .addImm(UId);
6945    } else {
6946      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
6947        .addReg(NewVReg5, RegState::Kill)
6948        .addJumpTableIndex(MJTI)
6949        .addImm(UId);
6950    }
6951  }
6952
6953  // Add the jump table entries as successors to the MBB.
6954  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
6955  for (std::vector<MachineBasicBlock*>::iterator
6956         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
6957    MachineBasicBlock *CurMBB = *I;
6958    if (SeenMBBs.insert(CurMBB))
6959      DispContBB->addSuccessor(CurMBB);
6960  }
6961
6962  // N.B. the order the invoke BBs are processed in doesn't matter here.
6963  const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF);
6964  SmallVector<MachineBasicBlock*, 64> MBBLPads;
6965  for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
6966         I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) {
6967    MachineBasicBlock *BB = *I;
6968
6969    // Remove the landing pad successor from the invoke block and replace it
6970    // with the new dispatch block.
6971    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
6972                                                  BB->succ_end());
6973    while (!Successors.empty()) {
6974      MachineBasicBlock *SMBB = Successors.pop_back_val();
6975      if (SMBB->isLandingPad()) {
6976        BB->removeSuccessor(SMBB);
6977        MBBLPads.push_back(SMBB);
6978      }
6979    }
6980
6981    BB->addSuccessor(DispatchBB);
6982
6983    // Find the invoke call and mark all of the callee-saved registers as
6984    // 'implicit defined' so that they're spilled. This prevents code from
6985    // moving instructions to before the EH block, where they will never be
6986    // executed.
6987    for (MachineBasicBlock::reverse_iterator
6988           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
6989      if (!II->isCall()) continue;
6990
6991      DenseMap<unsigned, bool> DefRegs;
6992      for (MachineInstr::mop_iterator
6993             OI = II->operands_begin(), OE = II->operands_end();
6994           OI != OE; ++OI) {
6995        if (!OI->isReg()) continue;
6996        DefRegs[OI->getReg()] = true;
6997      }
6998
6999      MachineInstrBuilder MIB(*MF, &*II);
7000
7001      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
7002        unsigned Reg = SavedRegs[i];
7003        if (Subtarget->isThumb2() &&
7004            !ARM::tGPRRegClass.contains(Reg) &&
7005            !ARM::hGPRRegClass.contains(Reg))
7006          continue;
7007        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
7008          continue;
7009        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
7010          continue;
7011        if (!DefRegs[Reg])
7012          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
7013      }
7014
7015      break;
7016    }
7017  }
7018
7019  // Mark all former landing pads as non-landing pads. The dispatch is the only
7020  // landing pad now.
7021  for (SmallVectorImpl<MachineBasicBlock*>::iterator
7022         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
7023    (*I)->setIsLandingPad(false);
7024
7025  // The instruction is gone now.
7026  MI->eraseFromParent();
7027
7028  return MBB;
7029}
7030
7031static
7032MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
7033  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
7034       E = MBB->succ_end(); I != E; ++I)
7035    if (*I != Succ)
7036      return *I;
7037  llvm_unreachable("Expecting a BB with two successors!");
7038}
7039
7040MachineBasicBlock *ARMTargetLowering::
7041EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
7042  // This pseudo instruction has 4 operands: dst, src, size, alignment.
7043  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
7044  // Otherwise, we will generate unrolled scalar copies.
7045  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7046  const BasicBlock *LLVM_BB = BB->getBasicBlock();
7047  MachineFunction::iterator It = BB;
7048  ++It;
7049
7050  unsigned dest = MI->getOperand(0).getReg();
7051  unsigned src = MI->getOperand(1).getReg();
7052  unsigned SizeVal = MI->getOperand(2).getImm();
7053  unsigned Align = MI->getOperand(3).getImm();
7054  DebugLoc dl = MI->getDebugLoc();
7055
7056  bool isThumb2 = Subtarget->isThumb2();
7057  MachineFunction *MF = BB->getParent();
7058  MachineRegisterInfo &MRI = MF->getRegInfo();
7059  unsigned ldrOpc, strOpc, UnitSize = 0;
7060
7061  const TargetRegisterClass *TRC = isThumb2 ?
7062    (const TargetRegisterClass*)&ARM::tGPRRegClass :
7063    (const TargetRegisterClass*)&ARM::GPRRegClass;
7064  const TargetRegisterClass *TRC_Vec = 0;
7065
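  // Pick the widest copy unit the alignment allows: bytes for odd alignment,
  // halfwords for 2-byte alignment, otherwise 16- or 8-byte NEON copies when
  // the subtarget and copy size permit, falling back to word copies.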
7066  if (Align & 1) {
7067    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
7068    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
7069    UnitSize = 1;
7070  } else if (Align & 2) {
7071    ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
7072    strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
7073    UnitSize = 2;
7074  } else {
7075    // Check whether we can use NEON instructions.
7076    if (!MF->getFunction()->getAttributes().
7077          hasAttribute(AttributeSet::FunctionIndex,
7078                       Attribute::NoImplicitFloat) &&
7079        Subtarget->hasNEON()) {
7080      if ((Align % 16 == 0) && SizeVal >= 16) {
7081        ldrOpc = ARM::VLD1q32wb_fixed;
7082        strOpc = ARM::VST1q32wb_fixed;
7083        UnitSize = 16;
7084        TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
7085      }
7086      else if ((Align % 8 == 0) && SizeVal >= 8) {
7087        ldrOpc = ARM::VLD1d32wb_fixed;
7088        strOpc = ARM::VST1d32wb_fixed;
7089        UnitSize = 8;
7090        TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
7091      }
7092    }
7093    // Can't use NEON instructions.
7094    if (UnitSize == 0) {
7095      ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
7096      strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
7097      UnitSize = 4;
7098    }
7099  }
7100
7101  unsigned BytesLeft = SizeVal % UnitSize;
7102  unsigned LoopSize = SizeVal - BytesLeft;
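  // For example, SizeVal = 10 with UnitSize = 4 gives LoopSize = 8 (two word
  // copies) and BytesLeft = 2 (copied byte-by-byte afterwards).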
7103
7104  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
7105    // Use LDR and STR to copy.
7106    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
7107    // [destOut] = STR_POST(scratch, destIn, UnitSize)
7108    unsigned srcIn = src;
7109    unsigned destIn = dest;
7110    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
7111      unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
7112      unsigned srcOut = MRI.createVirtualRegister(TRC);
7113      unsigned destOut = MRI.createVirtualRegister(TRC);
7114      if (UnitSize >= 8) {
7115        AddDefaultPred(BuildMI(*BB, MI, dl,
7116          TII->get(ldrOpc), scratch)
7117          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
7118
7119        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7120          .addReg(destIn).addImm(0).addReg(scratch));
7121      } else if (isThumb2) {
7122        AddDefaultPred(BuildMI(*BB, MI, dl,
7123          TII->get(ldrOpc), scratch)
7124          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
7125
7126        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7127          .addReg(scratch).addReg(destIn)
7128          .addImm(UnitSize));
7129      } else {
7130        AddDefaultPred(BuildMI(*BB, MI, dl,
7131          TII->get(ldrOpc), scratch)
7132          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
7133          .addImm(UnitSize));
7134
7135        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7136          .addReg(scratch).addReg(destIn)
7137          .addReg(0).addImm(UnitSize));
7138      }
7139      srcIn = srcOut;
7140      destIn = destOut;
7141    }
7142
7143    // Handle the leftover bytes with LDRB and STRB.
7144    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
7145    // [destOut] = STRB_POST(scratch, destIn, 1)
7146    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
7147    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
7148    for (unsigned i = 0; i < BytesLeft; i++) {
7149      unsigned scratch = MRI.createVirtualRegister(TRC);
7150      unsigned srcOut = MRI.createVirtualRegister(TRC);
7151      unsigned destOut = MRI.createVirtualRegister(TRC);
7152      if (isThumb2) {
7153        AddDefaultPred(BuildMI(*BB, MI, dl,
7154          TII->get(ldrOpc),scratch)
7155          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
7156
7157        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7158          .addReg(scratch).addReg(destIn)
7159          .addReg(0).addImm(1));
7160      } else {
7161        AddDefaultPred(BuildMI(*BB, MI, dl,
7162          TII->get(ldrOpc),scratch)
7163          .addReg(srcOut, RegState::Define).addReg(srcIn)
7164          .addReg(0).addImm(1));
7165
7166        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
7167          .addReg(scratch).addReg(destIn)
7168          .addReg(0).addImm(1));
7169      }
7170      srcIn = srcOut;
7171      destIn = destOut;
7172    }
7173    MI->eraseFromParent();   // The instruction is gone now.
7174    return BB;
7175  }
7176
7177  // Expand the pseudo op to a loop.
7178  // thisMBB:
7179  //   ...
7180  //   movw varEnd, # --> with thumb2
7181  //   movt varEnd, #
7182  //   ldrcp varEnd, idx --> without thumb2
7183  //   fallthrough --> loopMBB
7184  // loopMBB:
7185  //   PHI varPhi, varEnd, varLoop
7186  //   PHI srcPhi, src, srcLoop
7187  //   PHI destPhi, dst, destLoop
7188  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
7189  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
7190  //   subs varLoop, varPhi, #UnitSize
7191  //   bne loopMBB
7192  //   fallthrough --> exitMBB
7193  // exitMBB:
7194  //   epilogue to handle left-over bytes
7195  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
7196  //   [destOut] = STRB_POST(scratch, destLoop, 1)
7197  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
7198  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
7199  MF->insert(It, loopMBB);
7200  MF->insert(It, exitMBB);
7201
7202  // Transfer the remainder of BB and its successor edges to exitMBB.
7203  exitMBB->splice(exitMBB->begin(), BB,
7204                  llvm::next(MachineBasicBlock::iterator(MI)),
7205                  BB->end());
7206  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
7207
7208  // Load an immediate to varEnd.
7209  unsigned varEnd = MRI.createVirtualRegister(TRC);
7210  if (isThumb2) {
7211    unsigned VReg1 = varEnd;
7212    if ((LoopSize & 0xFFFF0000) != 0)
7213      VReg1 = MRI.createVirtualRegister(TRC);
7214    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
7215                   .addImm(LoopSize & 0xFFFF));
7216
7217    if ((LoopSize & 0xFFFF0000) != 0)
7218      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
7219                     .addReg(VReg1)
7220                     .addImm(LoopSize >> 16));
7221  } else {
7222    MachineConstantPool *ConstantPool = MF->getConstantPool();
7223    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
7224    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
7225
7226    // MachineConstantPool wants an explicit alignment.
7227    unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
7228    if (Align == 0)
7229      Align = getDataLayout()->getTypeAllocSize(C->getType());
7230    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
7231
7232    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
7233                   .addReg(varEnd, RegState::Define)
7234                   .addConstantPoolIndex(Idx)
7235                   .addImm(0));
7236  }
7237  BB->addSuccessor(loopMBB);
7238
7239  // Generate the loop body:
7240  //   varPhi = PHI(varLoop, varEnd)
7241  //   srcPhi = PHI(srcLoop, src)
7242  //   destPhi = PHI(destLoop, dst)
7243  MachineBasicBlock *entryBB = BB;
7244  BB = loopMBB;
7245  unsigned varLoop = MRI.createVirtualRegister(TRC);
7246  unsigned varPhi = MRI.createVirtualRegister(TRC);
7247  unsigned srcLoop = MRI.createVirtualRegister(TRC);
7248  unsigned srcPhi = MRI.createVirtualRegister(TRC);
7249  unsigned destLoop = MRI.createVirtualRegister(TRC);
7250  unsigned destPhi = MRI.createVirtualRegister(TRC);
7251
7252  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
7253    .addReg(varLoop).addMBB(loopMBB)
7254    .addReg(varEnd).addMBB(entryBB);
7255  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
7256    .addReg(srcLoop).addMBB(loopMBB)
7257    .addReg(src).addMBB(entryBB);
7258  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
7259    .addReg(destLoop).addMBB(loopMBB)
7260    .addReg(dest).addMBB(entryBB);
7261
7262  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
7263  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
7264  unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
7265  if (UnitSize >= 8) {
7266    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
7267      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
7268
7269    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
7270      .addReg(destPhi).addImm(0).addReg(scratch));
7271  } else if (isThumb2) {
7272    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
7273      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
7274
7275    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
7276      .addReg(scratch).addReg(destPhi)
7277      .addImm(UnitSize));
7278  } else {
7279    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
7280      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
7281      .addImm(UnitSize));
7282
7283    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
7284      .addReg(scratch).addReg(destPhi)
7285      .addReg(0).addImm(UnitSize));
7286  }
7287
7288  // Decrement loop variable by UnitSize.
7289  MachineInstrBuilder MIB = BuildMI(BB, dl,
7290    TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
7291  AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
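  // AddDefaultCC added a dummy cc_out operand (operand 5); rewrite it into a
  // live CPSR def so this SUB sets the flags tested by the Bcc below.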
7292  MIB->getOperand(5).setReg(ARM::CPSR);
7293  MIB->getOperand(5).setIsDef(true);
7294
7295  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
7296    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
7297
7298  // loopMBB can loop back to loopMBB or fall through to exitMBB.
7299  BB->addSuccessor(loopMBB);
7300  BB->addSuccessor(exitMBB);
7301
7302  // Add epilogue to handle BytesLeft.
7303  BB = exitMBB;
7304  MachineInstr *StartOfExit = exitMBB->begin();
7305  ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
7306  strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
7307
7308  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
7309  //   [destOut] = STRB_POST(scratch, destLoop, 1)
7310  unsigned srcIn = srcLoop;
7311  unsigned destIn = destLoop;
7312  for (unsigned i = 0; i < BytesLeft; i++) {
7313    unsigned scratch = MRI.createVirtualRegister(TRC);
7314    unsigned srcOut = MRI.createVirtualRegister(TRC);
7315    unsigned destOut = MRI.createVirtualRegister(TRC);
7316    if (isThumb2) {
7317      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
7318        TII->get(ldrOpc),scratch)
7319        .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
7320
7321      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
7322        .addReg(scratch).addReg(destIn)
7323        .addImm(1));
7324    } else {
7325      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
7326        TII->get(ldrOpc),scratch)
7327        .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
7328
7329      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
7330        .addReg(scratch).addReg(destIn)
7331        .addReg(0).addImm(1));
7332    }
7333    srcIn = srcOut;
7334    destIn = destOut;
7335  }
7336
7337  MI->eraseFromParent();   // The instruction is gone now.
7338  return BB;
7339}
7340
7341MachineBasicBlock *
7342ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
7343                                               MachineBasicBlock *BB) const {
7344  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7345  DebugLoc dl = MI->getDebugLoc();
7346  bool isThumb2 = Subtarget->isThumb2();
7347  switch (MI->getOpcode()) {
7348  default: {
7349    MI->dump();
7350    llvm_unreachable("Unexpected instr type to insert");
7351  }
7352  // The Thumb2 pre-indexed stores have the same MI operands; they are just
7353  // defined differently in the .td files than in the isel patterns, so they
7354  // need pseudos.
7355  case ARM::t2STR_preidx:
7356    MI->setDesc(TII->get(ARM::t2STR_PRE));
7357    return BB;
7358  case ARM::t2STRB_preidx:
7359    MI->setDesc(TII->get(ARM::t2STRB_PRE));
7360    return BB;
7361  case ARM::t2STRH_preidx:
7362    MI->setDesc(TII->get(ARM::t2STRH_PRE));
7363    return BB;
7364
7365  case ARM::STRi_preidx:
7366  case ARM::STRBi_preidx: {
7367    unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
7368      ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
7369    // Decode the offset.
7370    unsigned Offset = MI->getOperand(4).getImm();
7371    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
7372    Offset = ARM_AM::getAM2Offset(Offset);
7373    if (isSub)
7374      Offset = -Offset;
7375
7376    MachineMemOperand *MMO = *MI->memoperands_begin();
7377    BuildMI(*BB, MI, dl, TII->get(NewOpc))
7378      .addOperand(MI->getOperand(0))  // Rn_wb
7379      .addOperand(MI->getOperand(1))  // Rt
7380      .addOperand(MI->getOperand(2))  // Rn
7381      .addImm(Offset)                 // offset (skip GPR==zero_reg)
7382      .addOperand(MI->getOperand(5))  // pred
7383      .addOperand(MI->getOperand(6))
7384      .addMemOperand(MMO);
7385    MI->eraseFromParent();
7386    return BB;
7387  }
7388  case ARM::STRr_preidx:
7389  case ARM::STRBr_preidx:
7390  case ARM::STRH_preidx: {
7391    unsigned NewOpc;
7392    switch (MI->getOpcode()) {
7393    default: llvm_unreachable("unexpected opcode!");
7394    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
7395    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
7396    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
7397    }
7398    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
7399    for (unsigned i = 0; i < MI->getNumOperands(); ++i)
7400      MIB.addOperand(MI->getOperand(i));
7401    MI->eraseFromParent();
7402    return BB;
7403  }
7404  case ARM::ATOMIC_LOAD_ADD_I8:
7405     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
7406  case ARM::ATOMIC_LOAD_ADD_I16:
7407     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
7408  case ARM::ATOMIC_LOAD_ADD_I32:
7409     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
7410
7411  case ARM::ATOMIC_LOAD_AND_I8:
7412     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
7413  case ARM::ATOMIC_LOAD_AND_I16:
7414     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
7415  case ARM::ATOMIC_LOAD_AND_I32:
7416     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
7417
7418  case ARM::ATOMIC_LOAD_OR_I8:
7419     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
7420  case ARM::ATOMIC_LOAD_OR_I16:
7421     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
7422  case ARM::ATOMIC_LOAD_OR_I32:
7423     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
7424
7425  case ARM::ATOMIC_LOAD_XOR_I8:
7426     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
7427  case ARM::ATOMIC_LOAD_XOR_I16:
7428     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
7429  case ARM::ATOMIC_LOAD_XOR_I32:
7430     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
7431
7432  case ARM::ATOMIC_LOAD_NAND_I8:
7433     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
7434  case ARM::ATOMIC_LOAD_NAND_I16:
7435     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
7436  case ARM::ATOMIC_LOAD_NAND_I32:
7437     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
7438
7439  case ARM::ATOMIC_LOAD_SUB_I8:
7440     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
7441  case ARM::ATOMIC_LOAD_SUB_I16:
7442     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
7443  case ARM::ATOMIC_LOAD_SUB_I32:
7444     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
7445
7446  case ARM::ATOMIC_LOAD_MIN_I8:
7447     return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT);
7448  case ARM::ATOMIC_LOAD_MIN_I16:
7449     return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT);
7450  case ARM::ATOMIC_LOAD_MIN_I32:
7451     return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT);
7452
7453  case ARM::ATOMIC_LOAD_MAX_I8:
7454     return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT);
7455  case ARM::ATOMIC_LOAD_MAX_I16:
7456     return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT);
7457  case ARM::ATOMIC_LOAD_MAX_I32:
7458     return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT);
7459
7460  case ARM::ATOMIC_LOAD_UMIN_I8:
7461     return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO);
7462  case ARM::ATOMIC_LOAD_UMIN_I16:
7463     return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO);
7464  case ARM::ATOMIC_LOAD_UMIN_I32:
7465     return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO);
7466
7467  case ARM::ATOMIC_LOAD_UMAX_I8:
7468     return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI);
7469  case ARM::ATOMIC_LOAD_UMAX_I16:
7470     return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI);
7471  case ARM::ATOMIC_LOAD_UMAX_I32:
7472     return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI);
7473
7474  case ARM::ATOMIC_SWAP_I8:  return EmitAtomicBinary(MI, BB, 1, 0);
7475  case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0);
7476  case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0);
7477
7478  case ARM::ATOMIC_CMP_SWAP_I8:  return EmitAtomicCmpSwap(MI, BB, 1);
7479  case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
7480  case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
7481
7482
7483  case ARM::ATOMADD6432:
7484    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
7485                              isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
7486                              /*NeedsCarry*/ true);
7487  case ARM::ATOMSUB6432:
7488    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7489                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7490                              /*NeedsCarry*/ true);
7491  case ARM::ATOMOR6432:
7492    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
7493                              isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
7494  case ARM::ATOMXOR6432:
7495    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
7496                              isThumb2 ? ARM::t2EORrr : ARM::EORrr);
7497  case ARM::ATOMAND6432:
7498    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
7499                              isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
7500  case ARM::ATOMSWAP6432:
7501    return EmitAtomicBinary64(MI, BB, 0, 0, false);
7502  case ARM::ATOMCMPXCHG6432:
7503    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7504                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7505                              /*NeedsCarry*/ false, /*IsCmpxchg*/true);
7506  case ARM::ATOMMIN6432:
7507    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7508                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7509                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
7510                              /*IsMinMax*/ true, ARMCC::LT);
7511  case ARM::ATOMMAX6432:
7512    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7513                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7514                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
7515                              /*IsMinMax*/ true, ARMCC::GE);
7516  case ARM::ATOMUMIN6432:
7517    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7518                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7519                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
7520                              /*IsMinMax*/ true, ARMCC::LO);
7521  case ARM::ATOMUMAX6432:
7522    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
7523                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
7524                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
7525                              /*IsMinMax*/ true, ARMCC::HS);
7526
7527  case ARM::tMOVCCr_pseudo: {
7528    // To "insert" a SELECT_CC instruction, we actually have to insert the
7529    // diamond control-flow pattern.  The incoming instruction knows the
7530    // destination vreg to set, the condition code register to branch on, the
7531    // true/false values to select between, and a branch opcode to use.
7532    const BasicBlock *LLVM_BB = BB->getBasicBlock();
7533    MachineFunction::iterator It = BB;
7534    ++It;
7535
7536    //  thisMBB:
7537    //  ...
7538    //   TrueVal = ...
7539    //   cmpTY ccX, r1, r2
7540    //   bCC copy1MBB
7541    //   fallthrough --> copy0MBB
7542    MachineBasicBlock *thisMBB  = BB;
7543    MachineFunction *F = BB->getParent();
7544    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
7545    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
7546    F->insert(It, copy0MBB);
7547    F->insert(It, sinkMBB);
7548
7549    // Transfer the remainder of BB and its successor edges to sinkMBB.
7550    sinkMBB->splice(sinkMBB->begin(), BB,
7551                    llvm::next(MachineBasicBlock::iterator(MI)),
7552                    BB->end());
7553    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
7554
7555    BB->addSuccessor(copy0MBB);
7556    BB->addSuccessor(sinkMBB);
7557
7558    BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
7559      .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
7560
7561    //  copy0MBB:
7562    //   %FalseValue = ...
7563    //   # fallthrough to sinkMBB
7564    BB = copy0MBB;
7565
7566    // Update machine-CFG edges
7567    BB->addSuccessor(sinkMBB);
7568
7569    //  sinkMBB:
7570    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
7571    //  ...
7572    BB = sinkMBB;
7573    BuildMI(*BB, BB->begin(), dl,
7574            TII->get(ARM::PHI), MI->getOperand(0).getReg())
7575      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
7576      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
7577
7578    MI->eraseFromParent();   // The pseudo instruction is gone now.
7579    return BB;
7580  }
7581
7582  case ARM::BCCi64:
7583  case ARM::BCCZi64: {
7584    // If there is an unconditional branch to the other successor, remove it.
7585    BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
7586
7587    // Compare both parts that make up the double comparison separately for
7588    // equality.
7589    bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
7590
7591    unsigned LHS1 = MI->getOperand(1).getReg();
7592    unsigned LHS2 = MI->getOperand(2).getReg();
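    // The second compare in each case below is predicated on EQ, so CPSR ends
    // up EQ only if both halves match; otherwise the first compare's result
    // survives.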
7593    if (RHSisZero) {
7594      AddDefaultPred(BuildMI(BB, dl,
7595                             TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
7596                     .addReg(LHS1).addImm(0));
7597      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
7598        .addReg(LHS2).addImm(0)
7599        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
7600    } else {
7601      unsigned RHS1 = MI->getOperand(3).getReg();
7602      unsigned RHS2 = MI->getOperand(4).getReg();
7603      AddDefaultPred(BuildMI(BB, dl,
7604                             TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
7605                     .addReg(LHS1).addReg(RHS1));
7606      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
7607        .addReg(LHS2).addReg(RHS2)
7608        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
7609    }
7610
7611    MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
7612    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
7613    if (MI->getOperand(0).getImm() == ARMCC::NE)
7614      std::swap(destMBB, exitMBB);
7615
7616    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
7617      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
7618    if (isThumb2)
7619      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
7620    else
7621      BuildMI(BB, dl, TII->get(ARM::B)).addMBB(exitMBB);
7622
7623    MI->eraseFromParent();   // The pseudo instruction is gone now.
7624    return BB;
7625  }
7626
7627  case ARM::Int_eh_sjlj_setjmp:
7628  case ARM::Int_eh_sjlj_setjmp_nofp:
7629  case ARM::tInt_eh_sjlj_setjmp:
7630  case ARM::t2Int_eh_sjlj_setjmp:
7631  case ARM::t2Int_eh_sjlj_setjmp_nofp:
7632    EmitSjLjDispatchBlock(MI, BB);
7633    return BB;
7634
7635  case ARM::ABS:
7636  case ARM::t2ABS: {
7637    // To insert an ABS instruction, we have to insert the
7638    // diamond control-flow pattern.  The incoming instruction knows the
7639    // source vreg to test against 0, the destination vreg to set,
7640    // the condition code register to branch on, the
7641    // true/false values to select between, and a branch opcode to use.
7642    // It transforms
7643    //     V1 = ABS V0
7644    // into
7645    //     V2 = MOVS V0
7646    //     BCC                      (branch to SinkBB if V0 >= 0)
7647    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
7648    //     SinkBB: V1 = PHI(V2, V3)
7649    const BasicBlock *LLVM_BB = BB->getBasicBlock();
7650    MachineFunction::iterator BBI = BB;
7651    ++BBI;
7652    MachineFunction *Fn = BB->getParent();
7653    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
7654    MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
7655    Fn->insert(BBI, RSBBB);
7656    Fn->insert(BBI, SinkBB);
7657
7658    unsigned int ABSSrcReg = MI->getOperand(1).getReg();
7659    unsigned int ABSDstReg = MI->getOperand(0).getReg();
7660    bool isThumb2 = Subtarget->isThumb2();
7661    MachineRegisterInfo &MRI = Fn->getRegInfo();
7662    // In Thumb mode the S bit must not be set if the source register is SP or
7663    // PC, or if the destination register is SP, so restrict the register class.
7664    unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ?
7665      (const TargetRegisterClass*)&ARM::rGPRRegClass :
7666      (const TargetRegisterClass*)&ARM::GPRRegClass);
7667
7668    // Transfer the remainder of BB and its successor edges to SinkBB.
7669    SinkBB->splice(SinkBB->begin(), BB,
7670      llvm::next(MachineBasicBlock::iterator(MI)),
7671      BB->end());
7672    SinkBB->transferSuccessorsAndUpdatePHIs(BB);
7673
7674    BB->addSuccessor(RSBBB);
7675    BB->addSuccessor(SinkBB);
7676
7677    // RSBBB falls through to SinkBB.
7678    RSBBB->addSuccessor(SinkBB);
7679
7680    // insert a cmp at the end of BB
7681    AddDefaultPred(BuildMI(BB, dl,
7682                           TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
7683                   .addReg(ABSSrcReg).addImm(0));
7684
7685    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
7686    BuildMI(BB, dl,
7687      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
7688      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
7689
7690    // insert rsbri in RSBBB
7691    // Note: the BCC and RSBri will be converted into a predicated RSBMI by
7692    // the if-conversion pass.
7693    BuildMI(*RSBBB, RSBBB->begin(), dl,
7694      TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
7695      .addReg(ABSSrcReg, RegState::Kill)
7696      .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
7697
7698    // insert PHI in SinkBB,
7699    // reuse ABSDstReg to not change uses of ABS instruction
7700    BuildMI(*SinkBB, SinkBB->begin(), dl,
7701      TII->get(ARM::PHI), ABSDstReg)
7702      .addReg(NewRsbDstReg).addMBB(RSBBB)
7703      .addReg(ABSSrcReg).addMBB(BB);
7704
7705    // remove ABS instruction
7706    MI->eraseFromParent();
7707
7708    // return last added BB
7709    return SinkBB;
7710  }
7711  case ARM::COPY_STRUCT_BYVAL_I32:
7712    ++NumLoopByVals;
7713    return EmitStructByval(MI, BB);
7714  }
7715}
7716
7717void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
7718                                                      SDNode *Node) const {
7719  if (!MI->hasPostISelHook()) {
7720    assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
7721           "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
7722    return;
7723  }
7724
7725  const MCInstrDesc *MCID = &MI->getDesc();
7726  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
7727  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
7728  // operand is still set to noreg. If needed, set the optional operand's
7729  // register to CPSR, and remove the redundant implicit def.
7730  //
7731  // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
7732
7733  // Rename pseudo opcodes.
7734  unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
7735  if (NewOpc) {
7736    const ARMBaseInstrInfo *TII =
7737      static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
7738    MCID = &TII->get(NewOpc);
7739
7740    assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
7741           "converted opcode should be the same except for cc_out");
7742
7743    MI->setDesc(*MCID);
7744
7745    // Add the optional cc_out operand
7746    MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
7747  }
7748  unsigned ccOutIdx = MCID->getNumOperands() - 1;
7749
7750  // Any ARM instruction that sets the 's' bit should specify an optional
7751  // "cc_out" operand in the last operand position.
7752  if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
7753    assert(!NewOpc && "Optional cc_out operand required");
7754    return;
7755  }
7756  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
7757  // since we already have an optional CPSR def.
7758  bool definesCPSR = false;
7759  bool deadCPSR = false;
7760  for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
7761       i != e; ++i) {
7762    const MachineOperand &MO = MI->getOperand(i);
7763    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
7764      definesCPSR = true;
7765      if (MO.isDead())
7766        deadCPSR = true;
7767      MI->RemoveOperand(i);
7768      break;
7769    }
7770  }
7771  if (!definesCPSR) {
7772    assert(!NewOpc && "Optional cc_out operand required");
7773    return;
7774  }
7775  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
7776  if (deadCPSR) {
7777    assert(!MI->getOperand(ccOutIdx).getReg() &&
7778           "expect uninitialized optional cc_out operand");
7779    return;
7780  }
7781
7782  // If this instruction was defined with an optional CPSR def and its dag node
7783  // had a live implicit CPSR def, then activate the optional CPSR def.
7784  MachineOperand &MO = MI->getOperand(ccOutIdx);
7785  MO.setReg(ARM::CPSR);
7786  MO.setIsDef(true);
7787}
7788
7789//===----------------------------------------------------------------------===//
7790//                           ARM Optimization Hooks
7791//===----------------------------------------------------------------------===//
7792
7793// Helper function that checks if N is a null or all ones constant.
7794static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
7795  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
7796  if (!C)
7797    return false;
7798  return AllOnes ? C->isAllOnesValue() : C->isNullValue();
7799}
7800
7801// Return true if N is conditionally 0 or all ones.
7802// Detects these expressions where cc is an i1 value:
7803//
7804//   (select cc 0, y)   [AllOnes=0]
7805//   (select cc y, 0)   [AllOnes=0]
7806//   (zext cc)          [AllOnes=0]
7807//   (sext cc)          [AllOnes=0/1]
7808//   (select cc -1, y)  [AllOnes=1]
7809//   (select cc y, -1)  [AllOnes=1]
7810//
7811// Invert is set when N is the null/all ones constant when CC is false.
7812// OtherOp is set to the alternative value of N.
7813static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
7814                                       SDValue &CC, bool &Invert,
7815                                       SDValue &OtherOp,
7816                                       SelectionDAG &DAG) {
7817  switch (N->getOpcode()) {
7818  default: return false;
7819  case ISD::SELECT: {
7820    CC = N->getOperand(0);
7821    SDValue N1 = N->getOperand(1);
7822    SDValue N2 = N->getOperand(2);
7823    if (isZeroOrAllOnes(N1, AllOnes)) {
7824      Invert = false;
7825      OtherOp = N2;
7826      return true;
7827    }
7828    if (isZeroOrAllOnes(N2, AllOnes)) {
7829      Invert = true;
7830      OtherOp = N1;
7831      return true;
7832    }
7833    return false;
7834  }
7835  case ISD::ZERO_EXTEND:
7836    // (zext cc) can never be the all ones value.
7837    if (AllOnes)
7838      return false;
7839    // Fall through.
7840  case ISD::SIGN_EXTEND: {
7841    EVT VT = N->getValueType(0);
7842    CC = N->getOperand(0);
7843    if (CC.getValueType() != MVT::i1)
7844      return false;
7845    Invert = !AllOnes;
7846    if (AllOnes)
7847      // When looking for an AllOnes constant, N is an sext, and the 'other'
7848      // value is 0.
7849      OtherOp = DAG.getConstant(0, VT);
7850    else if (N->getOpcode() == ISD::ZERO_EXTEND)
7851      // When looking for a 0 constant, N can be zext or sext.
7852      OtherOp = DAG.getConstant(1, VT);
7853    else
7854      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
7855    return true;
7856  }
7857  }
7858}
7859
7860// Combine a constant select operand into its use:
7861//
7862//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
7863//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
7864//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
7865//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
7866//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
7867//
7868// The transform is rejected if the select doesn't have a constant operand that
7869// is null, or all ones when AllOnes is set.
7870//
7871// Also recognize sext/zext from i1:
7872//
7873//   (add (zext cc), x) -> (select cc (add x, 1), x)
7874//   (add (sext cc), x) -> (select cc (add x, -1), x)
7875//
7876// These transformations eventually create predicated instructions.
7877//
7878// @param N       The node to transform.
7879// @param Slct    The N operand that is a select.
7880// @param OtherOp The other N operand (x above).
7881// @param DCI     Context.
7882// @param AllOnes Require the select constant to be all ones instead of null.
7883// @returns The new node, or SDValue() on failure.
7884static
7885SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
7886                            TargetLowering::DAGCombinerInfo &DCI,
7887                            bool AllOnes = false) {
7888  SelectionDAG &DAG = DCI.DAG;
7889  EVT VT = N->getValueType(0);
7890  SDValue NonConstantVal;
7891  SDValue CCOp;
7892  bool SwapSelectOps;
7893  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
7894                                  NonConstantVal, DAG))
7895    return SDValue();
7896
7897  // Slct is now known to be the desired identity constant when CC is true.
7898  SDValue TrueVal = OtherOp;
7899  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
7900                                 OtherOp, NonConstantVal);
7901  // Unless SwapSelectOps says CC should be false.
7902  if (SwapSelectOps)
7903    std::swap(TrueVal, FalseVal);
7904
7905  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
7906                     CCOp, TrueVal, FalseVal);
7907}
7908
7909// Attempt combineSelectAndUse on each operand of a commutative operator N.
7910static
7911SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
7912                                       TargetLowering::DAGCombinerInfo &DCI) {
7913  SDValue N0 = N->getOperand(0);
7914  SDValue N1 = N->getOperand(1);
7915  if (N0.getNode()->hasOneUse()) {
7916    SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
7917    if (Result.getNode())
7918      return Result;
7919  }
7920  if (N1.getNode()->hasOneUse()) {
7921    SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
7922    if (Result.getNode())
7923      return Result;
7924  }
7925  return SDValue();
7926}
7927
7928// AddCombineToVPADDL - For pair-wise add on NEON, use the vpaddl instruction
7929// (only after legalization).
7930static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
7931                                 TargetLowering::DAGCombinerInfo &DCI,
7932                                 const ARMSubtarget *Subtarget) {
7933
7934  // Only perform the optimization after legalization and only if NEON is
7935  // available. We also expect both operands to be BUILD_VECTORs.
7936  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
7937      || N0.getOpcode() != ISD::BUILD_VECTOR
7938      || N1.getOpcode() != ISD::BUILD_VECTOR)
7939    return SDValue();
7940
7941  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
7942  EVT VT = N->getValueType(0);
7943  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
7944    return SDValue();
7945
7946  // Check that the vector operands are of the right form.
7947  // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR_ELT
7948  // operands, where N is the size of the formed vector.
7949  // Each EXTRACT_VECTOR_ELT should have the same input vector and an odd or
7950  // even index such that we have a pairwise add pattern.
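  // For example, with input vector V:
  //   (add (build_vector (extract V, 0), (extract V, 2), ...),
  //        (build_vector (extract V, 1), (extract V, 3), ...))
  // sums adjacent lanes of V and can be matched to a single vpaddl of V,
  // truncated back to the original element type below.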
7951
7952  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
7953  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7954    return SDValue();
7955  SDValue Vec = N0->getOperand(0)->getOperand(0);
7956  SDNode *V = Vec.getNode();
7957  unsigned nextIndex = 0;
7958
7959  // For each operand of the ADD (both are BUILD_VECTORs),
7960  // check that each of their operands is an EXTRACT_VECTOR_ELT with
7961  // the same input vector and the appropriate index.
7962  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
7963    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
7964        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
7965
7966      SDValue ExtVec0 = N0->getOperand(i);
7967      SDValue ExtVec1 = N1->getOperand(i);
7968
7969      // The first operand is the vector; verify it is the same one.
7970      if (V != ExtVec0->getOperand(0).getNode() ||
7971          V != ExtVec1->getOperand(0).getNode())
7972        return SDValue();
7973
7974      // The second operand is the constant index; verify it is correct.
7975      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
7976      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
7977
7978      // For the constants, N0 should supply the even indices and N1 the odd.
7979      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
7980          || C1->getZExtValue() != nextIndex+1)
7981        return SDValue();
7982
7983      // Increment index.
7984      nextIndex+=2;
7985    } else
7986      return SDValue();
7987  }
7988
7989  // Create VPADDL node.
7990  SelectionDAG &DAG = DCI.DAG;
7991  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7992
7993  // Build operand list.
7994  SmallVector<SDValue, 8> Ops;
7995  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls,
7996                                TLI.getPointerTy()));
7997
7998  // Input is the vector.
7999  Ops.push_back(Vec);
8000
8001  // Get the widened type and the original number of elements.
8002  MVT widenType;
8003  unsigned numElem = VT.getVectorNumElements();
8004  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
8005    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
8006    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
8007    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
8008    default:
8009      llvm_unreachable("Invalid vector element type for padd optimization.");
8010  }
8011
8012  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
8013                            widenType, &Ops[0], Ops.size());
8014  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, tmp);
8015}
8016
8017static SDValue findMUL_LOHI(SDValue V) {
8018  if (V->getOpcode() == ISD::UMUL_LOHI ||
8019      V->getOpcode() == ISD::SMUL_LOHI)
8020    return V;
8021  return SDValue();
8022}
8023
8024static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
8025                                     TargetLowering::DAGCombinerInfo &DCI,
8026                                     const ARMSubtarget *Subtarget) {
8027
8028  if (Subtarget->isThumb1Only()) return SDValue();
8029
8030  // Only perform the checks after legalize when the pattern is available.
8031  if (DCI.isBeforeLegalize()) return SDValue();
8032
8033  // Look for multiply add opportunities.
8034  // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
8035  // each add node consumes a value from ISD::UMUL_LOHI and there is
8036  // a glue link from the first add to the second add.
8037  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE
8038  // with a single S/UMLAL node.
8039  //          loAdd   UMUL_LOHI
8040  //            \    / :lo    \ :hi
8041  //             \  /          \          [no multiline comment]
8042  //              ADDC         |  hiAdd
8043  //                 \ :glue  /  /
8044  //                  \      /  /
8045  //                    ADDE
8046  //
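  // For example, after the rewrite a single node
  //   (UMLAL a, b, loAdd, hiAdd)      (SMLAL for the signed form)
  // computes both halves: result 0 replaces the ADDC's low sum and result 1
  // replaces the ADDE's high sum.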
8047  assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
8048  SDValue AddcOp0 = AddcNode->getOperand(0);
8049  SDValue AddcOp1 = AddcNode->getOperand(1);
8050
8051  // Check if the two operands are from the same mul_lohi node.
8052  if (AddcOp0.getNode() == AddcOp1.getNode())
8053    return SDValue();
8054
8055  assert(AddcNode->getNumValues() == 2 &&
8056         AddcNode->getValueType(0) == MVT::i32 &&
8057         "Expect ADDC with two result values. First: i32");
8058
8059  // Check that we have a glued ADDC node.
8060  if (AddcNode->getValueType(1) != MVT::Glue)
8061    return SDValue();
8062
8063  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
8064  if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
8065      AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
8066      AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
8067      AddcOp1->getOpcode() != ISD::SMUL_LOHI)
8068    return SDValue();
8069
8070  // Look for the glued ADDE.
8071  SDNode* AddeNode = AddcNode->getGluedUser();
8072  if (AddeNode == NULL)
8073    return SDValue();
8074
8075  // Make sure it is really an ADDE.
8076  if (AddeNode->getOpcode() != ISD::ADDE)
8077    return SDValue();
8078
8079  assert(AddeNode->getNumOperands() == 3 &&
8080         AddeNode->getOperand(2).getValueType() == MVT::Glue &&
8081         "ADDE node has the wrong inputs");
8082
8083  // Check for the triangle shape.
8084  SDValue AddeOp0 = AddeNode->getOperand(0);
8085  SDValue AddeOp1 = AddeNode->getOperand(1);
8086
8087  // Make sure that the ADDE operands are not coming from the same node.
8088  if (AddeOp0.getNode() == AddeOp1.getNode())
8089    return SDValue();
8090
8091  // Find the MUL_LOHI node walking up ADDE's operands.
8092  bool IsLeftOperandMUL = false;
8093  SDValue MULOp = findMUL_LOHI(AddeOp0);
8094  if (MULOp == SDValue())
8095    MULOp = findMUL_LOHI(AddeOp1);
8096  else
8097    IsLeftOperandMUL = true;
8098  if (MULOp == SDValue())
8099    return SDValue();
8100
8101  // Figure out the right opcode.
8102  unsigned Opc = MULOp->getOpcode();
8103  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
8104
8105  // Figure out the high and low input values to the MLAL node.
8106  SDValue* HiMul = &MULOp;
8107  SDValue* HiAdd = NULL;
8108  SDValue* LoMul = NULL;
8109  SDValue* LowAdd = NULL;
8110
8111  if (IsLeftOperandMUL)
8112    HiAdd = &AddeOp1;
8113  else
8114    HiAdd = &AddeOp0;
8115
8116
8117  if (AddcOp0->getOpcode() == Opc) {
8118    LoMul = &AddcOp0;
8119    LowAdd = &AddcOp1;
8120  }
8121  if (AddcOp1->getOpcode() == Opc) {
8122    LoMul = &AddcOp1;
8123    LowAdd = &AddcOp0;
8124  }
8125
8126  if (LoMul == NULL)
8127    return SDValue();
8128
8129  if (LoMul->getNode() != HiMul->getNode())
8130    return SDValue();
8131
8132  // Create the merged node.
8133  SelectionDAG &DAG = DCI.DAG;
8134
8135  // Build operand list.
8136  SmallVector<SDValue, 8> Ops;
8137  Ops.push_back(LoMul->getOperand(0));
8138  Ops.push_back(LoMul->getOperand(1));
8139  Ops.push_back(*LowAdd);
8140  Ops.push_back(*HiAdd);
8141
8142  SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
8143                                 DAG.getVTList(MVT::i32, MVT::i32),
8144                                 &Ops[0], Ops.size());
8145
8146  // Replace the uses of the ADDC and ADDE nodes with the MLAL node's values.
8147  SDValue HiMLALResult(MLALNode.getNode(), 1);
8148  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
8149
8150  SDValue LoMLALResult(MLALNode.getNode(), 0);
8151  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
8152
8153  // Return original node to notify the driver to stop replacing.
8154  SDValue resNode(AddcNode, 0);
8155  return resNode;
8156}
8157
8158/// PerformADDCCombine - Target-specific dag combine transform from
8159/// ISD::ADDC, ISD::ADDE, and ISD::UMUL_LOHI/SMUL_LOHI to MLAL.
8160static SDValue PerformADDCCombine(SDNode *N,
8161                                 TargetLowering::DAGCombinerInfo &DCI,
8162                                 const ARMSubtarget *Subtarget) {
8163
8164  return AddCombineTo64bitMLAL(N, DCI, Subtarget);
8165
8166}
8167
8168/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
8169/// operands N0 and N1.  This is a helper for PerformADDCombine that is
8170/// called with the default operands, and if that fails, with commuted
8171/// operands.
8172static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
8173                                          TargetLowering::DAGCombinerInfo &DCI,
8174                                          const ARMSubtarget *Subtarget){
8175
8176  // Attempt to create vpaddl for this add.
8177  SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
8178  if (Result.getNode())
8179    return Result;
8180
8181  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
8182  if (N0.getNode()->hasOneUse()) {
8183    SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
8184    if (Result.getNode()) return Result;
8185  }
8186  return SDValue();
8187}
8188
8189/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
8190///
8191static SDValue PerformADDCombine(SDNode *N,
8192                                 TargetLowering::DAGCombinerInfo &DCI,
8193                                 const ARMSubtarget *Subtarget) {
8194  SDValue N0 = N->getOperand(0);
8195  SDValue N1 = N->getOperand(1);
8196
8197  // First try with the default operand order.
8198  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
8199  if (Result.getNode())
8200    return Result;
8201
8202  // If that didn't work, try again with the operands commuted.
8203  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
8204}
8205
8206/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
8207///
8208static SDValue PerformSUBCombine(SDNode *N,
8209                                 TargetLowering::DAGCombinerInfo &DCI) {
8210  SDValue N0 = N->getOperand(0);
8211  SDValue N1 = N->getOperand(1);
8212
8213  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
8214  if (N1.getNode()->hasOneUse()) {
8215    SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
8216    if (Result.getNode()) return Result;
8217  }
8218
8219  return SDValue();
8220}
8221
8222/// PerformVMULCombine
8223/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
8224/// special multiplier accumulator forwarding.
8225///   vmul d3, d0, d2
8226///   vmla d3, d1, d2
8227/// is faster than
8228///   vadd d3, d0, d1
8229///   vmul d3, d3, d2
8230static SDValue PerformVMULCombine(SDNode *N,
8231                                  TargetLowering::DAGCombinerInfo &DCI,
8232                                  const ARMSubtarget *Subtarget) {
8233  if (!Subtarget->hasVMLxForwarding())
8234    return SDValue();
8235
8236  SelectionDAG &DAG = DCI.DAG;
8237  SDValue N0 = N->getOperand(0);
8238  SDValue N1 = N->getOperand(1);
8239  unsigned Opcode = N0.getOpcode();
8240  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
8241      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
8242    Opcode = N1.getOpcode();
8243    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
8244        Opcode != ISD::FADD && Opcode != ISD::FSUB)
8245      return SDValue();
8246    std::swap(N0, N1);
8247  }
8248
8249  EVT VT = N->getValueType(0);
8250  SDLoc DL(N);
8251  SDValue N00 = N0->getOperand(0);
8252  SDValue N01 = N0->getOperand(1);
8253  return DAG.getNode(Opcode, DL, VT,
8254                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
8255                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
8256}
8257
8258static SDValue PerformMULCombine(SDNode *N,
8259                                 TargetLowering::DAGCombinerInfo &DCI,
8260                                 const ARMSubtarget *Subtarget) {
8261  SelectionDAG &DAG = DCI.DAG;
8262
8263  if (Subtarget->isThumb1Only())
8264    return SDValue();
8265
8266  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
8267    return SDValue();
8268
8269  EVT VT = N->getValueType(0);
8270  if (VT.is64BitVector() || VT.is128BitVector())
8271    return PerformVMULCombine(N, DCI, Subtarget);
8272  if (VT != MVT::i32)
8273    return SDValue();
8274
8275  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8276  if (!C)
8277    return SDValue();
8278
8279  int64_t MulAmt = C->getSExtValue();
8280  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
8281
8282  ShiftAmt = ShiftAmt & (32 - 1);
8283  SDValue V = N->getOperand(0);
8284  SDLoc DL(N);
8285
8286  SDValue Res;
8287  MulAmt >>= ShiftAmt;
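  // Illustrative example: MulAmt = 20 has two trailing zeros, so ShiftAmt = 2
  // and the remaining factor is 5 = 2^2 + 1, giving
  //   (mul x, 20) => (shl (add x, (shl x, 2)), 2)
  // via the power-of-two cases handled below.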
8288
8289  if (MulAmt >= 0) {
8290    if (isPowerOf2_32(MulAmt - 1)) {
8291      // (mul x, 2^N + 1) => (add (shl x, N), x)
8292      Res = DAG.getNode(ISD::ADD, DL, VT,
8293                        V,
8294                        DAG.getNode(ISD::SHL, DL, VT,
8295                                    V,
8296                                    DAG.getConstant(Log2_32(MulAmt - 1),
8297                                                    MVT::i32)));
8298    } else if (isPowerOf2_32(MulAmt + 1)) {
8299      // (mul x, 2^N - 1) => (sub (shl x, N), x)
8300      Res = DAG.getNode(ISD::SUB, DL, VT,
8301                        DAG.getNode(ISD::SHL, DL, VT,
8302                                    V,
8303                                    DAG.getConstant(Log2_32(MulAmt + 1),
8304                                                    MVT::i32)),
8305                        V);
8306    } else
8307      return SDValue();
8308  } else {
8309    uint64_t MulAmtAbs = -MulAmt;
8310    if (isPowerOf2_32(MulAmtAbs + 1)) {
8311      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
8312      Res = DAG.getNode(ISD::SUB, DL, VT,
8313                        V,
8314                        DAG.getNode(ISD::SHL, DL, VT,
8315                                    V,
8316                                    DAG.getConstant(Log2_32(MulAmtAbs + 1),
8317                                                    MVT::i32)));
8318    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
8319      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
8320      Res = DAG.getNode(ISD::ADD, DL, VT,
8321                        V,
8322                        DAG.getNode(ISD::SHL, DL, VT,
8323                                    V,
8324                                    DAG.getConstant(Log2_32(MulAmtAbs-1),
8325                                                    MVT::i32)));
8326      Res = DAG.getNode(ISD::SUB, DL, VT,
8327                        DAG.getConstant(0, MVT::i32),Res);
8328
8329    } else
8330      return SDValue();
8331  }
8332
8333  if (ShiftAmt != 0)
8334    Res = DAG.getNode(ISD::SHL, DL, VT,
8335                      Res, DAG.getConstant(ShiftAmt, MVT::i32));
8336
8337  // Do not add new nodes to DAG combiner worklist.
8338  DCI.CombineTo(N, Res, false);
8339  return SDValue();
8340}
8341
8342static SDValue PerformANDCombine(SDNode *N,
8343                                 TargetLowering::DAGCombinerInfo &DCI,
8344                                 const ARMSubtarget *Subtarget) {
8345
8346  // Attempt to use immediate-form VBIC
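  // (The complement of the splatted constant must be representable as a NEON
  // modified immediate, e.g. (and x, (splat 0xffffff00)) -> (vbic x, #0xff).)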
8347  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
8348  SDLoc dl(N);
8349  EVT VT = N->getValueType(0);
8350  SelectionDAG &DAG = DCI.DAG;
8351
8352  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8353    return SDValue();
8354
8355  APInt SplatBits, SplatUndef;
8356  unsigned SplatBitSize;
8357  bool HasAnyUndefs;
8358  if (BVN &&
8359      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
8360    if (SplatBitSize <= 64) {
8361      EVT VbicVT;
8362      SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
8363                                      SplatUndef.getZExtValue(), SplatBitSize,
8364                                      DAG, VbicVT, VT.is128BitVector(),
8365                                      OtherModImm);
8366      if (Val.getNode()) {
8367        SDValue Input =
8368          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
8369        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
8370        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
8371      }
8372    }
8373  }
8374
8375  if (!Subtarget->isThumb1Only()) {
8376    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
8377    SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
8378    if (Result.getNode())
8379      return Result;
8380  }
8381
8382  return SDValue();
8383}
8384
8385/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
8386static SDValue PerformORCombine(SDNode *N,
8387                                TargetLowering::DAGCombinerInfo &DCI,
8388                                const ARMSubtarget *Subtarget) {
8389  // Attempt to use immediate-form VORR
8390  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
8391  SDLoc dl(N);
8392  EVT VT = N->getValueType(0);
8393  SelectionDAG &DAG = DCI.DAG;
8394
8395  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8396    return SDValue();
8397
8398  APInt SplatBits, SplatUndef;
8399  unsigned SplatBitSize;
8400  bool HasAnyUndefs;
8401  if (BVN && Subtarget->hasNEON() &&
8402      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
8403    if (SplatBitSize <= 64) {
8404      EVT VorrVT;
8405      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
8406                                      SplatUndef.getZExtValue(), SplatBitSize,
8407                                      DAG, VorrVT, VT.is128BitVector(),
8408                                      OtherModImm);
8409      if (Val.getNode()) {
8410        SDValue Input =
8411          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
8412        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
8413        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
8414      }
8415    }
8416  }
8417
8418  if (!Subtarget->isThumb1Only()) {
8419    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
8420    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
8421    if (Result.getNode())
8422      return Result;
8423  }
8424
8425  // The code below optimizes (or (and X, Y), Z).
8426  // The AND operand needs to have a single user to make these optimizations
8427  // profitable.
8428  SDValue N0 = N->getOperand(0);
8429  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
8430    return SDValue();
8431  SDValue N1 = N->getOperand(1);
8432
8433  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
8434  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
8435      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
8436    APInt SplatUndef;
8437    unsigned SplatBitSize;
8438    bool HasAnyUndefs;
8439
8440    APInt SplatBits0, SplatBits1;
8441    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
8442    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
8443    // Ensure that the second operand of both ANDs is a constant splat.
8444    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
8445                                      HasAnyUndefs) && !HasAnyUndefs) {
8446        if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
8447                                          HasAnyUndefs) && !HasAnyUndefs) {
8448            // Ensure that the bit widths of the constants are the same and
8449            // that the splat arguments are logical inverses, as required by
8450            // the pattern we are trying to simplify.
8451            if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
8452                SplatBits0 == ~SplatBits1) {
8453                // Canonicalize the vector type to make instruction selection
8454                // simpler.
8455                EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
8456                SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
8457                                             N0->getOperand(1),
8458                                             N0->getOperand(0),
8459                                             N1->getOperand(0));
8460                return DAG.getNode(ISD::BITCAST, dl, VT, Result);
8461            }
8462        }
8463    }
8464  }
8465
8466  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
8467  // reasonable.
8468
8469  // BFI is only available on V6T2+
8470  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
8471    return SDValue();
8472
8473  SDLoc DL(N);
8474  // 1) or (and A, mask), val => ARMbfi A, val, mask
8475  //      iff (val & ~mask) == val
8476  //
8477  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
8478  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
8479  //          && mask == ~mask2
8480  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
8481  //          && ~mask == mask2
8482  //  (i.e., copy a bitfield value into another bitfield of the same width)
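  // For example (case 1), with mask = 0xffff00ff and val = 0x3400:
  //   (or (and A, 0xffff00ff), 0x3400) => ARMbfi A, 0x34, 0xffff00ff
  // i.e. the 8-bit field at bit 8 of A is replaced with 0x34.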
8483
8484  if (VT != MVT::i32)
8485    return SDValue();
8486
8487  SDValue N00 = N0.getOperand(0);
8488
8489  // The value and the mask need to be constants so we can verify this is
8490  // actually a bitfield set. If the mask is 0xffff, we can do better
8491  // via a movt instruction, so don't use BFI in that case.
8492  SDValue MaskOp = N0.getOperand(1);
8493  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
8494  if (!MaskC)
8495    return SDValue();
8496  unsigned Mask = MaskC->getZExtValue();
8497  if (Mask == 0xffff)
8498    return SDValue();
8499  SDValue Res;
8500  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
8501  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
8502  if (N1C) {
8503    unsigned Val = N1C->getZExtValue();
8504    if ((Val & ~Mask) != Val)
8505      return SDValue();
8506
8507    if (ARM::isBitFieldInvertedMask(Mask)) {
8508      Val >>= countTrailingZeros(~Mask);
8509
8510      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
8511                        DAG.getConstant(Val, MVT::i32),
8512                        DAG.getConstant(Mask, MVT::i32));
8513
8514      // Do not add new nodes to DAG combiner worklist.
8515      DCI.CombineTo(N, Res, false);
8516      return SDValue();
8517    }
8518  } else if (N1.getOpcode() == ISD::AND) {
8519    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
8520    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
8521    if (!N11C)
8522      return SDValue();
8523    unsigned Mask2 = N11C->getZExtValue();
8524
8525    // For the BFI pattern to match as-is, Mask and ~Mask2 (or the reverse)
8526    // must be equivalent.
8527    if (ARM::isBitFieldInvertedMask(Mask) &&
8528        (Mask == ~Mask2)) {
8529      // The pack halfword instruction works better for masks that fit it,
8530      // so use that when it's available.
8531      if (Subtarget->hasT2ExtractPack() &&
8532          (Mask == 0xffff || Mask == 0xffff0000))
8533        return SDValue();
8534      // 2a
8535      unsigned amt = countTrailingZeros(Mask2);
8536      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
8537                        DAG.getConstant(amt, MVT::i32));
8538      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
8539                        DAG.getConstant(Mask, MVT::i32));
8540      // Do not add new nodes to DAG combiner worklist.
8541      DCI.CombineTo(N, Res, false);
8542      return SDValue();
8543    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
8544               (~Mask == Mask2)) {
8545      // The pack halfword instruction works better for masks that fit it,
8546      // so use that when it's available.
8547      if (Subtarget->hasT2ExtractPack() &&
8548          (Mask2 == 0xffff || Mask2 == 0xffff0000))
8549        return SDValue();
8550      // 2b
8551      unsigned lsb = countTrailingZeros(Mask);
8552      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
8553                        DAG.getConstant(lsb, MVT::i32));
8554      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
8555                        DAG.getConstant(Mask2, MVT::i32));
8556      // Do not add new nodes to DAG combiner worklist.
8557      DCI.CombineTo(N, Res, false);
8558      return SDValue();
8559    }
8560  }
8561
8562  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
8563      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
8564      ARM::isBitFieldInvertedMask(~Mask)) {
8565    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
8566    // where lsb(mask) == #shamt and masked bits of B are known zero.
8567    SDValue ShAmt = N00.getOperand(1);
8568    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
8569    unsigned LSB = countTrailingZeros(Mask);
8570    if (ShAmtC != LSB)
8571      return SDValue();
8572
8573    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
8574                      DAG.getConstant(~Mask, MVT::i32));
8575
8576    // Do not add new nodes to DAG combiner worklist.
8577    DCI.CombineTo(N, Res, false);
8578  }
8579
8580  return SDValue();
8581}
8582
8583static SDValue PerformXORCombine(SDNode *N,
8584                                 TargetLowering::DAGCombinerInfo &DCI,
8585                                 const ARMSubtarget *Subtarget) {
8586  EVT VT = N->getValueType(0);
8587  SelectionDAG &DAG = DCI.DAG;
8588
8589  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8590    return SDValue();
8591
8592  if (!Subtarget->isThumb1Only()) {
8593    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
8594    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
8595    if (Result.getNode())
8596      return Result;
8597  }
8598
8599  return SDValue();
8600}
8601
8602/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
8603/// the bits being cleared by the AND are not demanded by the BFI.
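/// For example, with Mask2 = 0xffff00ff the BFI only reads the low 8 bits of
/// its second operand, so an AND whose mask covers those bits (e.g. 0xff) is
/// redundant and can be removed.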
8604static SDValue PerformBFICombine(SDNode *N,
8605                                 TargetLowering::DAGCombinerInfo &DCI) {
8606  SDValue N1 = N->getOperand(1);
8607  if (N1.getOpcode() == ISD::AND) {
8608    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
8609    if (!N11C)
8610      return SDValue();
8611    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
8612    unsigned LSB = countTrailingZeros(~InvMask);
8613    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
8614    unsigned Mask = (1 << Width)-1;
8615    unsigned Mask2 = N11C->getZExtValue();
8616    if ((Mask & (~Mask2)) == 0)
8617      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
8618                             N->getOperand(0), N1.getOperand(0),
8619                             N->getOperand(2));
8620  }
8621  return SDValue();
8622}
8623
8624/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
8625/// ARMISD::VMOVRRD.
8626static SDValue PerformVMOVRRDCombine(SDNode *N,
8627                                     TargetLowering::DAGCombinerInfo &DCI) {
8628  // vmovrrd(vmovdrr x, y) -> x,y
8629  SDValue InDouble = N->getOperand(0);
8630  if (InDouble.getOpcode() == ARMISD::VMOVDRR)
8631    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
8632
8633  // vmovrrd(load f64) -> (load i32), (load i32)
8634  SDNode *InNode = InDouble.getNode();
8635  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
8636      InNode->getValueType(0) == MVT::f64 &&
8637      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
8638      !cast<LoadSDNode>(InNode)->isVolatile()) {
8639    // TODO: Should this be done for non-FrameIndex operands?
8640    LoadSDNode *LD = cast<LoadSDNode>(InNode);
8641
8642    SelectionDAG &DAG = DCI.DAG;
8643    SDLoc DL(LD);
8644    SDValue BasePtr = LD->getBasePtr();
8645    SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
8646                                 LD->getPointerInfo(), LD->isVolatile(),
8647                                 LD->isNonTemporal(), LD->isInvariant(),
8648                                 LD->getAlignment());
8649
8650    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
8651                                    DAG.getConstant(4, MVT::i32));
8652    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
8653                                 LD->getPointerInfo(), LD->isVolatile(),
8654                                 LD->isNonTemporal(), LD->isInvariant(),
8655                                 std::min(4U, LD->getAlignment() / 2));
8656
8657    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
8658    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
8659    DCI.RemoveFromWorklist(LD);
8660    DAG.DeleteNode(LD);
8661    return Result;
8662  }
8663
8664  return SDValue();
8665}
8666
8667/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
8668/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
8669static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
8670  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
8671  SDValue Op0 = N->getOperand(0);
8672  SDValue Op1 = N->getOperand(1);
8673  if (Op0.getOpcode() == ISD::BITCAST)
8674    Op0 = Op0.getOperand(0);
8675  if (Op1.getOpcode() == ISD::BITCAST)
8676    Op1 = Op1.getOperand(0);
8677  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
8678      Op0.getNode() == Op1.getNode() &&
8679      Op0.getResNo() == 0 && Op1.getResNo() == 1)
8680    return DAG.getNode(ISD::BITCAST, SDLoc(N),
8681                       N->getValueType(0), Op0.getOperand(0));
8682  return SDValue();
8683}
8684
8685/// PerformSTORECombine - Target-specific dag combine xforms for
8686/// ISD::STORE.
8687static SDValue PerformSTORECombine(SDNode *N,
8688                                   TargetLowering::DAGCombinerInfo &DCI) {
8689  StoreSDNode *St = cast<StoreSDNode>(N);
8690  if (St->isVolatile())
8691    return SDValue();
8692
8693  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
8694  // pack all of the elements in one place.  Next, store to memory in fewer
8695  // chunks.
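  // For example, a truncating store of <4 x i32> as <4 x i8> becomes a shuffle
  // that packs the four truncated bytes into the low lanes of a <16 x i8>
  // vector, followed by a single i32 store of the packed lane.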
8696  SDValue StVal = St->getValue();
8697  EVT VT = StVal.getValueType();
8698  if (St->isTruncatingStore() && VT.isVector()) {
8699    SelectionDAG &DAG = DCI.DAG;
8700    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8701    EVT StVT = St->getMemoryVT();
8702    unsigned NumElems = VT.getVectorNumElements();
8703    assert(StVT != VT && "Cannot truncate to the same type");
8704    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
8705    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
8706
8707    // From, To sizes and ElemCount must be pow of two
8708    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
8709
8710    // We are going to use the original vector elt for storing.
8711    // Accumulated smaller vector elements must be a multiple of the store size.
8712    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
8713
8714    unsigned SizeRatio  = FromEltSz / ToEltSz;
8715    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
8716
8717    // Create a type on which we perform the shuffle.
8718    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
8719                                     NumElems*SizeRatio);
8720    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
8721
8722    SDLoc DL(St);
8723    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
8724    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
8725    for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
8726
8727    // Can't shuffle using an illegal type.
8728    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
8729
8730    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
8731                                DAG.getUNDEF(WideVec.getValueType()),
8732                                ShuffleVec.data());
8733    // At this point all of the data is stored at the bottom of the
8734    // register. We now need to save it to mem.
8735
8736    // Find the largest store unit
8737    MVT StoreType = MVT::i8;
8738    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
8739         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
8740      MVT Tp = (MVT::SimpleValueType)tp;
8741      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
8742        StoreType = Tp;
8743    }
8744    // Didn't find a legal store type.
8745    if (!TLI.isTypeLegal(StoreType))
8746      return SDValue();
8747
8748    // Bitcast the original vector into a vector of store-size units
8749    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
8750            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
8751    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
8752    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
8753    SmallVector<SDValue, 8> Chains;
8754    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
8755                                        TLI.getPointerTy());
8756    SDValue BasePtr = St->getBasePtr();
8757
8758    // Perform one or more big stores into memory.
8759    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
8760    for (unsigned I = 0; I < E; I++) {
8761      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
8762                                   StoreType, ShuffWide,
8763                                   DAG.getIntPtrConstant(I));
8764      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
8765                                St->getPointerInfo(), St->isVolatile(),
8766                                St->isNonTemporal(), St->getAlignment());
8767      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
8768                            Increment);
8769      Chains.push_back(Ch);
8770    }
8771    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
8772                       Chains.size());
8773  }
8774
8775  if (!ISD::isNormalStore(St))
8776    return SDValue();
8777
8778  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
8779  // ARM stores of arguments in the same cache line.
8780  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
8781      StVal.getNode()->hasOneUse()) {
8782    SelectionDAG  &DAG = DCI.DAG;
8783    SDLoc DL(St);
8784    SDValue BasePtr = St->getBasePtr();
8785    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
8786                                  StVal.getNode()->getOperand(0), BasePtr,
8787                                  St->getPointerInfo(), St->isVolatile(),
8788                                  St->isNonTemporal(), St->getAlignment());
8789
8790    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
8791                                    DAG.getConstant(4, MVT::i32));
8792    return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
8793                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
8794                        St->isNonTemporal(),
8795                        std::min(4U, St->getAlignment() / 2));
8796  }
8797
8798  if (StVal.getValueType() != MVT::i64 ||
8799      StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8800    return SDValue();
8801
8802  // Bitcast an i64 store extracted from a vector to f64.
8803  // Otherwise, the i64 value will be legalized to a pair of i32 values.
8804  SelectionDAG &DAG = DCI.DAG;
8805  SDLoc dl(StVal);
8806  SDValue IntVec = StVal.getOperand(0);
8807  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
8808                                 IntVec.getValueType().getVectorNumElements());
8809  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
8810  SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
8811                               Vec, StVal.getOperand(1));
8812  dl = SDLoc(N);
8813  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
8814  // Make the DAGCombiner fold the bitcasts.
8815  DCI.AddToWorklist(Vec.getNode());
8816  DCI.AddToWorklist(ExtElt.getNode());
8817  DCI.AddToWorklist(V.getNode());
8818  return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
8819                      St->getPointerInfo(), St->isVolatile(),
8820                      St->isNonTemporal(), St->getAlignment(),
8821                      St->getTBAAInfo());
8822}
8823
8824/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
8825/// are normal, non-volatile loads.  If so, it is profitable to bitcast an
8826/// i64 vector to have f64 elements, since the value can then be loaded
8827/// directly into a VFP register.
8828static bool hasNormalLoadOperand(SDNode *N) {
8829  unsigned NumElts = N->getValueType(0).getVectorNumElements();
8830  for (unsigned i = 0; i < NumElts; ++i) {
8831    SDNode *Elt = N->getOperand(i).getNode();
8832    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
8833      return true;
8834  }
8835  return false;
8836}
8837
8838/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
8839/// ISD::BUILD_VECTOR.
8840static SDValue PerformBUILD_VECTORCombine(SDNode *N,
8841                                          TargetLowering::DAGCombinerInfo &DCI){
8842  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
8843  // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
8844  // into a pair of GPRs, which is fine when the value is used as a scalar,
8845  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
8846  SelectionDAG &DAG = DCI.DAG;
8847  if (N->getNumOperands() == 2) {
8848    SDValue RV = PerformVMOVDRRCombine(N, DAG);
8849    if (RV.getNode())
8850      return RV;
8851  }
8852
8853  // Load i64 elements as f64 values so that type legalization does not split
8854  // them up into i32 values.
8855  EVT VT = N->getValueType(0);
8856  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
8857    return SDValue();
8858  SDLoc dl(N);
8859  SmallVector<SDValue, 8> Ops;
8860  unsigned NumElts = VT.getVectorNumElements();
8861  for (unsigned i = 0; i < NumElts; ++i) {
8862    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
8863    Ops.push_back(V);
8864    // Make the DAGCombiner fold the bitcast.
8865    DCI.AddToWorklist(V.getNode());
8866  }
8867  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
8868  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts);
8869  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
8870}
8871
8872/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
8873static SDValue
8874PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
8875  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
8876  // At that time, we may have inserted bitcasts from integer to float.
8877  // If these bitcasts have survived DAGCombine, change the lowering of this
8878  // BUILD_VECTOR into something more vector friendly, i.e., something that
8879  // does not force the use of floating point types.
8880
8881  // Make sure we can change the type of the vector.
8882  // This is possible iff:
8883  // 1. The vector is only used in a bitcast to an integer type. I.e.,
8884  //    1.1. Vector is used only once.
8885  //    1.2. Use is a bit convert to an integer type.
8886  // 2. The size of its operands is 32 bits (64-bit elements are not legal).
8887  EVT VT = N->getValueType(0);
8888  EVT EltVT = VT.getVectorElementType();
8889
8890  // Check 1.1. and 2.
8891  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
8892    return SDValue();
8893
8894  // By construction, the input type must be float.
8895  assert(EltVT == MVT::f32 && "Unexpected type!");
8896
8897  // Check 1.2.
8898  SDNode *Use = *N->use_begin();
8899  if (Use->getOpcode() != ISD::BITCAST ||
8900      Use->getValueType(0).isFloatingPoint())
8901    return SDValue();
8902
8903  // Check profitability.
8904  // The model is: if more than half of the relevant operands are bitcast from
8905  // i32, turn the build_vector into a sequence of insert_vector_elt.
8906  // Relevant operands are everything that is not statically
8907  // (i.e., at compile time) bitcast.
8908  unsigned NumOfBitCastedElts = 0;
8909  unsigned NumElts = VT.getVectorNumElements();
8910  unsigned NumOfRelevantElts = NumElts;
8911  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
8912    SDValue Elt = N->getOperand(Idx);
8913    if (Elt->getOpcode() == ISD::BITCAST) {
8914      // Assume only bit cast to i32 will go away.
8915      if (Elt->getOperand(0).getValueType() == MVT::i32)
8916        ++NumOfBitCastedElts;
8917    } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
8918      // Constants are statically casted, thus do not count them as
8919      // relevant operands.
8920      --NumOfRelevantElts;
8921  }
8922
8923  // Check if more than half of the elements require a non-free bitcast.
8924  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
8925    return SDValue();
8926
8927  SelectionDAG &DAG = DCI.DAG;
8928  // Create the new vector type.
8929  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
8930  // Check if the type is legal.
8931  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8932  if (!TLI.isTypeLegal(VecVT))
8933    return SDValue();
8934
8935  // Combine:
8936  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
8937  // => BITCAST INSERT_VECTOR_ELT
8938  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
8939  //                      (BITCAST EN), N.
8940  SDValue Vec = DAG.getUNDEF(VecVT);
8941  SDLoc dl(N);
8942  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
8943    SDValue V = N->getOperand(Idx);
8944    if (V.getOpcode() == ISD::UNDEF)
8945      continue;
8946    if (V.getOpcode() == ISD::BITCAST &&
8947        V->getOperand(0).getValueType() == MVT::i32)
8948      // Fold obvious case.
8949      V = V.getOperand(0);
8950    else {
8951      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
8952      // Make the DAGCombiner fold the bitcasts.
8953      DCI.AddToWorklist(V.getNode());
8954    }
8955    SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
8956    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
8957  }
8958  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
8959  // Make the DAGCombiner fold the bitcasts.
8960  DCI.AddToWorklist(Vec.getNode());
8961  return Vec;
8962}
8963
8964/// PerformInsertEltCombine - Target-specific dag combine xforms for
8965/// ISD::INSERT_VECTOR_ELT.
8966static SDValue PerformInsertEltCombine(SDNode *N,
8967                                       TargetLowering::DAGCombinerInfo &DCI) {
8968  // Bitcast an i64 load inserted into a vector to f64.
8969  // Otherwise, the i64 value will be legalized to a pair of i32 values.
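  // e.g. (insert_vector_elt (v2i64 vec), (i64 load), idx) is rewritten as
  //   (bitcast (insert_vector_elt (bitcast vec to v2f64),
  //                               (bitcast loaded value to f64), idx))
  // so the loaded value can be moved directly into a D register.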
8970  EVT VT = N->getValueType(0);
8971  SDNode *Elt = N->getOperand(1).getNode();
8972  if (VT.getVectorElementType() != MVT::i64 ||
8973      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
8974    return SDValue();
8975
8976  SelectionDAG &DAG = DCI.DAG;
8977  SDLoc dl(N);
8978  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
8979                                 VT.getVectorNumElements());
8980  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
8981  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
8982  // Make the DAGCombiner fold the bitcasts.
8983  DCI.AddToWorklist(Vec.getNode());
8984  DCI.AddToWorklist(V.getNode());
8985  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
8986                               Vec, V, N->getOperand(2));
8987  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
8988}
8989
8990/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
8991/// ISD::VECTOR_SHUFFLE.
8992static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
8993  // The LLVM shufflevector instruction does not require the shuffle mask
8994  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
8995  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
8996  // operands do not match the mask length, they are extended by concatenating
8997  // them with undef vectors.  That is probably the right thing for other
8998  // targets, but for NEON it is better to concatenate two double-register
8999  // size vector operands into a single quad-register size vector.  Do that
9000  // transformation here:
9001  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
9002  //   shuffle(concat(v1, v2), undef)
9003  SDValue Op0 = N->getOperand(0);
9004  SDValue Op1 = N->getOperand(1);
9005  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
9006      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
9007      Op0.getNumOperands() != 2 ||
9008      Op1.getNumOperands() != 2)
9009    return SDValue();
9010  SDValue Concat0Op1 = Op0.getOperand(1);
9011  SDValue Concat1Op1 = Op1.getOperand(1);
9012  if (Concat0Op1.getOpcode() != ISD::UNDEF ||
9013      Concat1Op1.getOpcode() != ISD::UNDEF)
9014    return SDValue();
9015  // Skip the transformation if any of the types are illegal.
9016  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9017  EVT VT = N->getValueType(0);
9018  if (!TLI.isTypeLegal(VT) ||
9019      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
9020      !TLI.isTypeLegal(Concat1Op1.getValueType()))
9021    return SDValue();
9022
9023  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
9024                                  Op0.getOperand(0), Op1.getOperand(0));
9025  // Translate the shuffle mask.
9026  SmallVector<int, 16> NewMask;
9027  unsigned NumElts = VT.getVectorNumElements();
9028  unsigned HalfElts = NumElts/2;
9029  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
9030  for (unsigned n = 0; n < NumElts; ++n) {
9031    int MaskElt = SVN->getMaskElt(n);
9032    int NewElt = -1;
9033    if (MaskElt < (int)HalfElts)
9034      NewElt = MaskElt;
9035    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
9036      NewElt = HalfElts + MaskElt - NumElts;
9037    NewMask.push_back(NewElt);
9038  }
9039  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
9040                              DAG.getUNDEF(VT), NewMask.data());
9041}
9042
9043/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
9044/// NEON load/store intrinsics to merge base address updates.
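/// For example, if the address of a (vld1 addr) is also consumed by an
/// (add addr, <size of the access>), the pair can be folded into a single
/// VLD1_UPD node that performs the load and produces the incremented address.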
9045static SDValue CombineBaseUpdate(SDNode *N,
9046                                 TargetLowering::DAGCombinerInfo &DCI) {
9047  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9048    return SDValue();
9049
9050  SelectionDAG &DAG = DCI.DAG;
9051  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
9052                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
9053  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
9054  SDValue Addr = N->getOperand(AddrOpIdx);
9055
9056  // Search for a use of the address operand that is an increment.
9057  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
9058         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
9059    SDNode *User = *UI;
9060    if (User->getOpcode() != ISD::ADD ||
9061        UI.getUse().getResNo() != Addr.getResNo())
9062      continue;
9063
9064    // Check that the add is independent of the load/store.  Otherwise, folding
9065    // it would create a cycle.
9066    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
9067      continue;
9068
9069    // Find the new opcode for the updating load/store.
9070    bool isLoad = true;
9071    bool isLaneOp = false;
9072    unsigned NewOpc = 0;
9073    unsigned NumVecs = 0;
9074    if (isIntrinsic) {
9075      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
9076      switch (IntNo) {
9077      default: llvm_unreachable("unexpected intrinsic for Neon base update");
9078      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
9079        NumVecs = 1; break;
9080      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
9081        NumVecs = 2; break;
9082      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
9083        NumVecs = 3; break;
9084      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
9085        NumVecs = 4; break;
9086      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
9087        NumVecs = 2; isLaneOp = true; break;
9088      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
9089        NumVecs = 3; isLaneOp = true; break;
9090      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
9091        NumVecs = 4; isLaneOp = true; break;
9092      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
9093        NumVecs = 1; isLoad = false; break;
9094      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
9095        NumVecs = 2; isLoad = false; break;
9096      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
9097        NumVecs = 3; isLoad = false; break;
9098      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
9099        NumVecs = 4; isLoad = false; break;
9100      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
9101        NumVecs = 2; isLoad = false; isLaneOp = true; break;
9102      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
9103        NumVecs = 3; isLoad = false; isLaneOp = true; break;
9104      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
9105        NumVecs = 4; isLoad = false; isLaneOp = true; break;
9106      }
9107    } else {
9108      isLaneOp = true;
9109      switch (N->getOpcode()) {
9110      default: llvm_unreachable("unexpected opcode for Neon base update");
9111      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
9112      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
9113      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
9114      }
9115    }
9116
9117    // Find the size of memory referenced by the load/store.
9118    EVT VecTy;
9119    if (isLoad)
9120      VecTy = N->getValueType(0);
9121    else
9122      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
9123    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
9124    if (isLaneOp)
9125      NumBytes /= VecTy.getVectorNumElements();
9126
9127    // If the increment is a constant, it must match the memory ref size.
9128    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
9129    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
9130      uint64_t IncVal = CInc->getZExtValue();
9131      if (IncVal != NumBytes)
9132        continue;
9133    } else if (NumBytes >= 3 * 16) {
9134      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
9135      // separate instructions that make it harder to use a non-constant update.
9136      continue;
9137    }
9138
9139    // Create the new updating load/store node.
9140    EVT Tys[6];
9141    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
9142    unsigned n;
9143    for (n = 0; n < NumResultVecs; ++n)
9144      Tys[n] = VecTy;
9145    Tys[n++] = MVT::i32;
9146    Tys[n] = MVT::Other;
9147    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
9148    SmallVector<SDValue, 8> Ops;
9149    Ops.push_back(N->getOperand(0)); // incoming chain
9150    Ops.push_back(N->getOperand(AddrOpIdx));
9151    Ops.push_back(Inc);
9152    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
9153      Ops.push_back(N->getOperand(i));
9154    }
9155    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
9156    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
9157                                           Ops.data(), Ops.size(),
9158                                           MemInt->getMemoryVT(),
9159                                           MemInt->getMemOperand());
9160
9161    // Update the uses.
9162    std::vector<SDValue> NewResults;
9163    for (unsigned i = 0; i < NumResultVecs; ++i) {
9164      NewResults.push_back(SDValue(UpdN.getNode(), i));
9165    }
9166    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
9167    DCI.CombineTo(N, NewResults);
9168    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
9169
9170    break;
9171  }
9172  return SDValue();
9173}
9174
9175/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
9176/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
9177/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
9178/// return true.
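/// For example (a sketch), when every vector result of a vld2lane is used
/// only by VDUPLANEs of the loaded lane, the group is replaced by a VLD2DUP,
/// which is selected to an all-lanes load such as
/// "vld2.16 {d16[], d17[]}, [r0]".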
9179static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
9180  SelectionDAG &DAG = DCI.DAG;
9181  EVT VT = N->getValueType(0);
9182  // vldN-dup instructions only support 64-bit vectors for N > 1.
9183  if (!VT.is64BitVector())
9184    return false;
9185
9186  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
9187  SDNode *VLD = N->getOperand(0).getNode();
9188  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
9189    return false;
9190  unsigned NumVecs = 0;
9191  unsigned NewOpc = 0;
9192  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
9193  if (IntNo == Intrinsic::arm_neon_vld2lane) {
9194    NumVecs = 2;
9195    NewOpc = ARMISD::VLD2DUP;
9196  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
9197    NumVecs = 3;
9198    NewOpc = ARMISD::VLD3DUP;
9199  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
9200    NumVecs = 4;
9201    NewOpc = ARMISD::VLD4DUP;
9202  } else {
9203    return false;
9204  }
9205
9206  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
9207  // numbers match the load.
9208  unsigned VLDLaneNo =
9209    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
9210  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
9211       UI != UE; ++UI) {
9212    // Ignore uses of the chain result.
9213    if (UI.getUse().getResNo() == NumVecs)
9214      continue;
9215    SDNode *User = *UI;
9216    if (User->getOpcode() != ARMISD::VDUPLANE ||
9217        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
9218      return false;
9219  }
9220
9221  // Create the vldN-dup node.
9222  EVT Tys[5];
9223  unsigned n;
9224  for (n = 0; n < NumVecs; ++n)
9225    Tys[n] = VT;
9226  Tys[n] = MVT::Other;
9227  SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
9228  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
9229  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
9230  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
9231                                           Ops, 2, VLDMemInt->getMemoryVT(),
9232                                           VLDMemInt->getMemOperand());
9233
9234  // Update the uses.
9235  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
9236       UI != UE; ++UI) {
9237    unsigned ResNo = UI.getUse().getResNo();
9238    // Ignore uses of the chain result.
9239    if (ResNo == NumVecs)
9240      continue;
9241    SDNode *User = *UI;
9242    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
9243  }
9244
9245  // Now the vldN-lane intrinsic is dead except for its chain result.
9246  // Update uses of the chain.
9247  std::vector<SDValue> VLDDupResults;
9248  for (unsigned n = 0; n < NumVecs; ++n)
9249    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
9250  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
9251  DCI.CombineTo(VLD, VLDDupResults);
9252
9253  return true;
9254}
9255
9256/// PerformVDUPLANECombine - Target-specific dag combine xforms for
9257/// ARMISD::VDUPLANE.
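/// Two cases are handled: a VDUPLANE fed by a vldN-lane intrinsic becomes a
/// vldN-dup (see CombineVLDDUP above), and a VDUPLANE of a VMOVIMM/VMVNIMM
/// splat is folded away, since duplicating a lane of a splat is just the
/// splat itself (modulo a bitcast).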
9258static SDValue PerformVDUPLANECombine(SDNode *N,
9259                                      TargetLowering::DAGCombinerInfo &DCI) {
9260  SDValue Op = N->getOperand(0);
9261
9262  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
9263  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
9264  if (CombineVLDDUP(N, DCI))
9265    return SDValue(N, 0);
9266
9267  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
9268  // redundant.  Ignore bit_converts for now; element sizes are checked below.
9269  while (Op.getOpcode() == ISD::BITCAST)
9270    Op = Op.getOperand(0);
9271  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
9272    return SDValue();
9273
9274  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
9275  unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
9276  // The canonical VMOV for a zero vector uses a 32-bit element size.
9277  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9278  unsigned EltBits;
9279  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
9280    EltSize = 8;
9281  EVT VT = N->getValueType(0);
9282  if (EltSize > VT.getVectorElementType().getSizeInBits())
9283    return SDValue();
9284
9285  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
9286}
9287
9288// isConstVecPow2 - Return true if each vector element is a power of 2, all
9289// elements are the same constant, C, and Log2(C) ranges from 1 to 32.
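// For example, <float 8.0, float 8.0> satisfies this with C == 8
// (Log2(C) == 3), which is the case exploited by the VCVT and VDIV combines
// below.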
9290static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
9291{
9292  integerPart cN;
9293  integerPart c0 = 0;
9294  for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements();
9295       I != E; I++) {
9296    ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I));
9297    if (!C)
9298      return false;
9299
9300    bool isExact;
9301    APFloat APF = C->getValueAPF();
9302    if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact)
9303        != APFloat::opOK || !isExact)
9304      return false;
9305
9306    c0 = (I == 0) ? cN : c0;
9307    if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32)
9308      return false;
9309  }
9310  C = c0;
9311  return true;
9312}
9313
9314/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
9315/// can replace combinations of VMUL and VCVT (floating-point to integer)
9316/// when the VMUL has a constant operand that is a power of 2.
9317///
9318/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
9319///  vmul.f32        d16, d17, d16
9320///  vcvt.s32.f32    d16, d16
9321/// becomes:
9322///  vcvt.s32.f32    d16, d16, #3
9323static SDValue PerformVCVTCombine(SDNode *N,
9324                                  TargetLowering::DAGCombinerInfo &DCI,
9325                                  const ARMSubtarget *Subtarget) {
9326  SelectionDAG &DAG = DCI.DAG;
9327  SDValue Op = N->getOperand(0);
9328
9329  if (!Subtarget->hasNEON() || !Op.getValueType().isVector() ||
9330      Op.getOpcode() != ISD::FMUL)
9331    return SDValue();
9332
9333  uint64_t C;
9334  SDValue N0 = Op->getOperand(0);
9335  SDValue ConstVec = Op->getOperand(1);
9336  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
9337
9338  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
9339      !isConstVecPow2(ConstVec, isSigned, C))
9340    return SDValue();
9341
9342  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
9343  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
9344  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
9345    // These instructions only exist converting from f32 to i32. We can handle
9346    // smaller integers by generating an extra truncate, but larger ones would
9347    // be lossy.
9348    return SDValue();
9349  }
9350
9351  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
9352    Intrinsic::arm_neon_vcvtfp2fxu;
9353  unsigned NumLanes = Op.getValueType().getVectorNumElements();
9354  SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
9355                                 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
9356                                 DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
9357                                 DAG.getConstant(Log2_64(C), MVT::i32));
9358
9359  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
9360    FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv);
9361
9362  return FixConv;
9363}
9364
9365/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
9366/// can replace combinations of VCVT (integer to floating-point) and VDIV
9367/// when the VDIV has a constant operand that is a power of 2.
9368///
9369/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
9370///  vcvt.f32.s32    d16, d16
9371///  vdiv.f32        d16, d17, d16
9372/// becomes:
9373///  vcvt.f32.s32    d16, d16, #3
9374static SDValue PerformVDIVCombine(SDNode *N,
9375                                  TargetLowering::DAGCombinerInfo &DCI,
9376                                  const ARMSubtarget *Subtarget) {
9377  SelectionDAG &DAG = DCI.DAG;
9378  SDValue Op = N->getOperand(0);
9379  unsigned OpOpcode = Op.getNode()->getOpcode();
9380
9381  if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() ||
9382      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
9383    return SDValue();
9384
9385  uint64_t C;
9386  SDValue ConstVec = N->getOperand(1);
9387  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
9388
9389  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
9390      !isConstVecPow2(ConstVec, isSigned, C))
9391    return SDValue();
9392
9393  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
9394  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
9395  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
9396    // These instructions only exist converting from i32 to f32. We can handle
9397    // smaller integers by generating an extra extend, but larger ones would
9398    // be lossy.
9399    return SDValue();
9400  }
9401
9402  SDValue ConvInput = Op.getOperand(0);
9403  unsigned NumLanes = Op.getValueType().getVectorNumElements();
9404  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
9405    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
9406                            SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
9407                            ConvInput);
9408
9409  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
9410    Intrinsic::arm_neon_vcvtfxu2fp;
9411  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
9412                     Op.getValueType(),
9413                     DAG.getConstant(IntrinsicOpcode, MVT::i32),
9414                     ConvInput, DAG.getConstant(Log2_64(C), MVT::i32));
9415}
9416
9417/// getVShiftImm - Check if this is a valid build_vector for the immediate
9418/// operand of a vector shift operation, where all the elements of the
9419/// build_vector must have the same constant integer value.
9420static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
9421  // Ignore bit_converts.
9422  while (Op.getOpcode() == ISD::BITCAST)
9423    Op = Op.getOperand(0);
9424  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
9425  APInt SplatBits, SplatUndef;
9426  unsigned SplatBitSize;
9427  bool HasAnyUndefs;
9428  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
9429                                      HasAnyUndefs, ElementBits) ||
9430      SplatBitSize > ElementBits)
9431    return false;
9432  Cnt = SplatBits.getSExtValue();
9433  return true;
9434}
9435
9436/// isVShiftLImm - Check if this is a valid build_vector for the immediate
9437/// operand of a vector shift left operation.  That value must be in the range:
9438///   0 <= Value < ElementBits for a left shift; or
9439///   0 <= Value <= ElementBits for a long left shift.
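/// For example, with v8i16 operands (ElementBits == 16) the legal immediates
/// are 0..15 for an ordinary left shift and 0..16 for a long left shift.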
9440static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
9441  assert(VT.isVector() && "vector shift count is not a vector type");
9442  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
9443  if (! getVShiftImm(Op, ElementBits, Cnt))
9444    return false;
9445  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
9446}
9447
9448/// isVShiftRImm - Check if this is a valid build_vector for the immediate
9449/// operand of a vector shift right operation.  For a shift opcode, the count
9450/// is positive, but for an intrinsic the count must be negative.  The
9451/// absolute value must be in the range:
9452///   1 <= |Value| <= ElementBits for a right shift; or
9453///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
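/// For example, with v8i16 operands (ElementBits == 16) the legal counts are
/// 1..16 for a right shift and 1..8 for a narrowing right shift.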
9454static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
9455                         int64_t &Cnt) {
9456  assert(VT.isVector() && "vector shift count is not a vector type");
9457  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
9458  if (! getVShiftImm(Op, ElementBits, Cnt))
9459    return false;
9460  if (isIntrinsic)
9461    Cnt = -Cnt;
9462  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
9463}
9464
9465/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
9466static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
9467  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9468  switch (IntNo) {
9469  default:
9470    // Don't do anything for most intrinsics.
9471    break;
9472
9473  // Vector shifts: check for immediate versions and lower them.
9474  // Note: This is done during DAG combining instead of DAG legalizing because
9475  // the build_vectors for 64-bit vector element shift counts are generally
9476  // not legal, and it is hard to see their values after they get legalized to
9477  // loads from a constant pool.
9478  case Intrinsic::arm_neon_vshifts:
9479  case Intrinsic::arm_neon_vshiftu:
9480  case Intrinsic::arm_neon_vshiftls:
9481  case Intrinsic::arm_neon_vshiftlu:
9482  case Intrinsic::arm_neon_vshiftn:
9483  case Intrinsic::arm_neon_vrshifts:
9484  case Intrinsic::arm_neon_vrshiftu:
9485  case Intrinsic::arm_neon_vrshiftn:
9486  case Intrinsic::arm_neon_vqshifts:
9487  case Intrinsic::arm_neon_vqshiftu:
9488  case Intrinsic::arm_neon_vqshiftsu:
9489  case Intrinsic::arm_neon_vqshiftns:
9490  case Intrinsic::arm_neon_vqshiftnu:
9491  case Intrinsic::arm_neon_vqshiftnsu:
9492  case Intrinsic::arm_neon_vqrshiftns:
9493  case Intrinsic::arm_neon_vqrshiftnu:
9494  case Intrinsic::arm_neon_vqrshiftnsu: {
9495    EVT VT = N->getOperand(1).getValueType();
9496    int64_t Cnt;
9497    unsigned VShiftOpc = 0;
9498
9499    switch (IntNo) {
9500    case Intrinsic::arm_neon_vshifts:
9501    case Intrinsic::arm_neon_vshiftu:
9502      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
9503        VShiftOpc = ARMISD::VSHL;
9504        break;
9505      }
9506      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
9507        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
9508                     ARMISD::VSHRs : ARMISD::VSHRu);
9509        break;
9510      }
9511      return SDValue();
9512
9513    case Intrinsic::arm_neon_vshiftls:
9514    case Intrinsic::arm_neon_vshiftlu:
9515      if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
9516        break;
9517      llvm_unreachable("invalid shift count for vshll intrinsic");
9518
9519    case Intrinsic::arm_neon_vrshifts:
9520    case Intrinsic::arm_neon_vrshiftu:
9521      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
9522        break;
9523      return SDValue();
9524
9525    case Intrinsic::arm_neon_vqshifts:
9526    case Intrinsic::arm_neon_vqshiftu:
9527      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
9528        break;
9529      return SDValue();
9530
9531    case Intrinsic::arm_neon_vqshiftsu:
9532      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
9533        break;
9534      llvm_unreachable("invalid shift count for vqshlu intrinsic");
9535
9536    case Intrinsic::arm_neon_vshiftn:
9537    case Intrinsic::arm_neon_vrshiftn:
9538    case Intrinsic::arm_neon_vqshiftns:
9539    case Intrinsic::arm_neon_vqshiftnu:
9540    case Intrinsic::arm_neon_vqshiftnsu:
9541    case Intrinsic::arm_neon_vqrshiftns:
9542    case Intrinsic::arm_neon_vqrshiftnu:
9543    case Intrinsic::arm_neon_vqrshiftnsu:
9544      // Narrowing shifts require an immediate right shift.
9545      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
9546        break;
9547      llvm_unreachable("invalid shift count for narrowing vector shift "
9548                       "intrinsic");
9549
9550    default:
9551      llvm_unreachable("unhandled vector shift");
9552    }
9553
9554    switch (IntNo) {
9555    case Intrinsic::arm_neon_vshifts:
9556    case Intrinsic::arm_neon_vshiftu:
9557      // Opcode already set above.
9558      break;
9559    case Intrinsic::arm_neon_vshiftls:
9560    case Intrinsic::arm_neon_vshiftlu:
9561      if (Cnt == VT.getVectorElementType().getSizeInBits())
9562        VShiftOpc = ARMISD::VSHLLi;
9563      else
9564        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
9565                     ARMISD::VSHLLs : ARMISD::VSHLLu);
9566      break;
9567    case Intrinsic::arm_neon_vshiftn:
9568      VShiftOpc = ARMISD::VSHRN; break;
9569    case Intrinsic::arm_neon_vrshifts:
9570      VShiftOpc = ARMISD::VRSHRs; break;
9571    case Intrinsic::arm_neon_vrshiftu:
9572      VShiftOpc = ARMISD::VRSHRu; break;
9573    case Intrinsic::arm_neon_vrshiftn:
9574      VShiftOpc = ARMISD::VRSHRN; break;
9575    case Intrinsic::arm_neon_vqshifts:
9576      VShiftOpc = ARMISD::VQSHLs; break;
9577    case Intrinsic::arm_neon_vqshiftu:
9578      VShiftOpc = ARMISD::VQSHLu; break;
9579    case Intrinsic::arm_neon_vqshiftsu:
9580      VShiftOpc = ARMISD::VQSHLsu; break;
9581    case Intrinsic::arm_neon_vqshiftns:
9582      VShiftOpc = ARMISD::VQSHRNs; break;
9583    case Intrinsic::arm_neon_vqshiftnu:
9584      VShiftOpc = ARMISD::VQSHRNu; break;
9585    case Intrinsic::arm_neon_vqshiftnsu:
9586      VShiftOpc = ARMISD::VQSHRNsu; break;
9587    case Intrinsic::arm_neon_vqrshiftns:
9588      VShiftOpc = ARMISD::VQRSHRNs; break;
9589    case Intrinsic::arm_neon_vqrshiftnu:
9590      VShiftOpc = ARMISD::VQRSHRNu; break;
9591    case Intrinsic::arm_neon_vqrshiftnsu:
9592      VShiftOpc = ARMISD::VQRSHRNsu; break;
9593    }
9594
9595    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
9596                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
9597  }
9598
9599  case Intrinsic::arm_neon_vshiftins: {
9600    EVT VT = N->getOperand(1).getValueType();
9601    int64_t Cnt;
9602    unsigned VShiftOpc = 0;
9603
9604    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
9605      VShiftOpc = ARMISD::VSLI;
9606    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
9607      VShiftOpc = ARMISD::VSRI;
9608    else {
9609      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
9610    }
9611
9612    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
9613                       N->getOperand(1), N->getOperand(2),
9614                       DAG.getConstant(Cnt, MVT::i32));
9615  }
9616
9617  case Intrinsic::arm_neon_vqrshifts:
9618  case Intrinsic::arm_neon_vqrshiftu:
9619    // No immediate versions of these to check for.
9620    break;
9621  }
9622
9623  return SDValue();
9624}
9625
9626/// PerformShiftCombine - Checks for immediate versions of vector shifts and
9627/// lowers them.  As with the vector shift intrinsics, this is done during DAG
9628/// combining instead of DAG legalizing because the build_vectors for 64-bit
9629/// vector element shift counts are generally not legal, and it is hard to see
9630/// their values after they get legalized to loads from a constant pool.
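/// For example (a sketch), a v8i8 SHL whose shift amount is a build_vector
/// splat of 3 becomes an ARMISD::VSHL with an i32 immediate of 3; SRA and SRL
/// with splat counts become VSHRs and VSHRu respectively.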
9631static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
9632                                   const ARMSubtarget *ST) {
9633  EVT VT = N->getValueType(0);
9634  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
9635    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
9636    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
9637    SDValue N1 = N->getOperand(1);
9638    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
9639      SDValue N0 = N->getOperand(0);
9640      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
9641          DAG.MaskedValueIsZero(N0.getOperand(0),
9642                                APInt::getHighBitsSet(32, 16)))
9643        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
9644    }
9645  }
9646
9647  // Nothing to be done for scalar shifts.
9648  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9649  if (!VT.isVector() || !TLI.isTypeLegal(VT))
9650    return SDValue();
9651
9652  assert(ST->hasNEON() && "unexpected vector shift");
9653  int64_t Cnt;
9654
9655  switch (N->getOpcode()) {
9656  default: llvm_unreachable("unexpected shift opcode");
9657
9658  case ISD::SHL:
9659    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
9660      return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0),
9661                         DAG.getConstant(Cnt, MVT::i32));
9662    break;
9663
9664  case ISD::SRA:
9665  case ISD::SRL:
9666    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
9667      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
9668                            ARMISD::VSHRs : ARMISD::VSHRu);
9669      return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0),
9670                         DAG.getConstant(Cnt, MVT::i32));
9671    }
9672  }
9673  return SDValue();
9674}
9675
9676/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
9677/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
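/// For example (a sketch), (sext (extract_vector_elt v4i16:%v, 3)) with an
/// i32 result becomes an ARMISD::VGETLANEs node, which is selected to a
/// signed lane move such as "vmov.s16 r0, d0[3]".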
9678static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
9679                                    const ARMSubtarget *ST) {
9680  SDValue N0 = N->getOperand(0);
9681
9682  // Check for sign- and zero-extensions of vector extract operations of 8-
9683  // and 16-bit vector elements.  NEON supports these directly.  They are
9684  // handled during DAG combining because type legalization will promote them
9685  // to 32-bit types and it is messy to recognize the operations after that.
9686  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
9687    SDValue Vec = N0.getOperand(0);
9688    SDValue Lane = N0.getOperand(1);
9689    EVT VT = N->getValueType(0);
9690    EVT EltVT = N0.getValueType();
9691    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9692
9693    if (VT == MVT::i32 &&
9694        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
9695        TLI.isTypeLegal(Vec.getValueType()) &&
9696        isa<ConstantSDNode>(Lane)) {
9697
9698      unsigned Opc = 0;
9699      switch (N->getOpcode()) {
9700      default: llvm_unreachable("unexpected opcode");
9701      case ISD::SIGN_EXTEND:
9702        Opc = ARMISD::VGETLANEs;
9703        break;
9704      case ISD::ZERO_EXTEND:
9705      case ISD::ANY_EXTEND:
9706        Opc = ARMISD::VGETLANEu;
9707        break;
9708      }
9709      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
9710    }
9711  }
9712
9713  return SDValue();
9714}
9715
9716/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
9717/// to match f32 max/min patterns to use NEON vmax/vmin instructions.
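/// For example (a sketch), (select_cc f32:%x, f32:%y, %x, %y, setlt) can
/// become ARMISD::FMIN, i.e. "vmin.f32", provided the NaN and signed-zero
/// checks below allow it.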
9718static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
9719                                       const ARMSubtarget *ST) {
9720  // If the target supports NEON, try to use vmax/vmin instructions for f32
9721  // selects like "x < y ? x : y".  Unless the NoNaNsFPMath option is set,
9722  // be careful about NaNs:  NEON's vmax/vmin return NaN if either operand is
9723  // a NaN; only do the transformation when it matches that behavior.
9724
9725  // For now only do this when using NEON for FP operations; if using VFP, it
9726  // is not obvious that the benefit outweighs the cost of switching to the
9727  // NEON pipeline.
9728  if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
9729      N->getValueType(0) != MVT::f32)
9730    return SDValue();
9731
9732  SDValue CondLHS = N->getOperand(0);
9733  SDValue CondRHS = N->getOperand(1);
9734  SDValue LHS = N->getOperand(2);
9735  SDValue RHS = N->getOperand(3);
9736  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
9737
9738  unsigned Opcode = 0;
9739  bool IsReversed;
9740  if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) {
9741    IsReversed = false; // x CC y ? x : y
9742  } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) {
9743    IsReversed = true;  // x CC y ? y : x
9744  } else {
9745    return SDValue();
9746  }
9747
9748  bool IsUnordered;
9749  switch (CC) {
9750  default: break;
9751  case ISD::SETOLT:
9752  case ISD::SETOLE:
9753  case ISD::SETLT:
9754  case ISD::SETLE:
9755  case ISD::SETULT:
9756  case ISD::SETULE:
9757    // If LHS is NaN, an ordered comparison will be false and the result will
9758    // be the RHS, but vmin(NaN, RHS) = NaN.  Avoid this by checking that LHS
9759    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
9760    IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE);
9761    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
9762      break;
9763    // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin
9764    // will return -0, so vmin can only be used for unsafe math or if one of
9765    // the operands is known to be nonzero.
9766    if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) &&
9767        !DAG.getTarget().Options.UnsafeFPMath &&
9768        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9769      break;
9770    Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
9771    break;
9772
9773  case ISD::SETOGT:
9774  case ISD::SETOGE:
9775  case ISD::SETGT:
9776  case ISD::SETGE:
9777  case ISD::SETUGT:
9778  case ISD::SETUGE:
9779    // If LHS is NaN, an ordered comparison will be false and the result will
9780    // be the RHS, but vmax(NaN, RHS) = NaN.  Avoid this by checking that LHS
9781    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
9782    IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE);
9783    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
9784      break;
9785    // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax
9786    // will return +0, so vmax can only be used for unsafe math or if one of
9787    // the operands is known to be nonzero.
9788    if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) &&
9789        !DAG.getTarget().Options.UnsafeFPMath &&
9790        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9791      break;
9792    Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
9793    break;
9794  }
9795
9796  if (!Opcode)
9797    return SDValue();
9798  return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
9799}
9800
9801/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
9802SDValue
9803ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
9804  SDValue Cmp = N->getOperand(4);
9805  if (Cmp.getOpcode() != ARMISD::CMPZ)
9806    // Only looking at EQ and NE cases.
9807    return SDValue();
9808
9809  EVT VT = N->getValueType(0);
9810  SDLoc dl(N);
9811  SDValue LHS = Cmp.getOperand(0);
9812  SDValue RHS = Cmp.getOperand(1);
9813  SDValue FalseVal = N->getOperand(0);
9814  SDValue TrueVal = N->getOperand(1);
9815  SDValue ARMcc = N->getOperand(2);
9816  ARMCC::CondCodes CC =
9817    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
9818
9819  // Simplify
9820  //   mov     r1, r0
9821  //   cmp     r1, x
9822  //   mov     r0, y
9823  //   moveq   r0, x
9824  // to
9825  //   cmp     r0, x
9826  //   movne   r0, y
9827  //
9828  //   mov     r1, r0
9829  //   cmp     r1, x
9830  //   mov     r0, x
9831  //   movne   r0, y
9832  // to
9833  //   cmp     r0, x
9834  //   movne   r0, y
9835  // FIXME: Turn this into a target-neutral optimization?
9836  SDValue Res;
9837  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
9838    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
9839                      N->getOperand(3), Cmp);
9840  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
9841    SDValue ARMcc;
9842    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
9843    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
9844                      N->getOperand(3), NewCmp);
9845  }
9846
9847  if (Res.getNode()) {
9848    APInt KnownZero, KnownOne;
9849    DAG.ComputeMaskedBits(SDValue(N,0), KnownZero, KnownOne);
9850    // Capture demanded bits information that would be otherwise lost.
9851    if (KnownZero == 0xfffffffe)
9852      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
9853                        DAG.getValueType(MVT::i1));
9854    else if (KnownZero == 0xffffff00)
9855      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
9856                        DAG.getValueType(MVT::i8));
9857    else if (KnownZero == 0xffff0000)
9858      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
9859                        DAG.getValueType(MVT::i16));
9860  }
9861
9862  return Res;
9863}
9864
9865SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
9866                                             DAGCombinerInfo &DCI) const {
9867  switch (N->getOpcode()) {
9868  default: break;
9869  case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
9870  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
9871  case ISD::SUB:        return PerformSUBCombine(N, DCI);
9872  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
9873  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
9874  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
9875  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
9876  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
9877  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
9878  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
9879  case ISD::STORE:      return PerformSTORECombine(N, DCI);
9880  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI);
9881  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
9882  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
9883  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
9884  case ISD::FP_TO_SINT:
9885  case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget);
9886  case ISD::FDIV:       return PerformVDIVCombine(N, DCI, Subtarget);
9887  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
9888  case ISD::SHL:
9889  case ISD::SRA:
9890  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
9891  case ISD::SIGN_EXTEND:
9892  case ISD::ZERO_EXTEND:
9893  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
9894  case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
9895  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
9896  case ARMISD::VLD2DUP:
9897  case ARMISD::VLD3DUP:
9898  case ARMISD::VLD4DUP:
9899    return CombineBaseUpdate(N, DCI);
9900  case ARMISD::BUILD_VECTOR:
9901    return PerformARMBUILD_VECTORCombine(N, DCI);
9902  case ISD::INTRINSIC_VOID:
9903  case ISD::INTRINSIC_W_CHAIN:
9904    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
9905    case Intrinsic::arm_neon_vld1:
9906    case Intrinsic::arm_neon_vld2:
9907    case Intrinsic::arm_neon_vld3:
9908    case Intrinsic::arm_neon_vld4:
9909    case Intrinsic::arm_neon_vld2lane:
9910    case Intrinsic::arm_neon_vld3lane:
9911    case Intrinsic::arm_neon_vld4lane:
9912    case Intrinsic::arm_neon_vst1:
9913    case Intrinsic::arm_neon_vst2:
9914    case Intrinsic::arm_neon_vst3:
9915    case Intrinsic::arm_neon_vst4:
9916    case Intrinsic::arm_neon_vst2lane:
9917    case Intrinsic::arm_neon_vst3lane:
9918    case Intrinsic::arm_neon_vst4lane:
9919      return CombineBaseUpdate(N, DCI);
9920    default: break;
9921    }
9922    break;
9923  }
9924  return SDValue();
9925}
9926
9927bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
9928                                                          EVT VT) const {
9929  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
9930}
9931
9932bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
9933  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
9934  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
9935
9936  switch (VT.getSimpleVT().SimpleTy) {
9937  default:
9938    return false;
9939  case MVT::i8:
9940  case MVT::i16:
9941  case MVT::i32: {
9942    // Unaligned accesses can use (for example) LDRB, LDRH, LDR.
9943    if (AllowsUnaligned) {
9944      if (Fast)
9945        *Fast = Subtarget->hasV7Ops();
9946      return true;
9947    }
9948    return false;
9949  }
9950  case MVT::f64:
9951  case MVT::v2f64: {
9952    // For any little-endian target with NEON, we can support unaligned ld/st
9953    // of D and Q (e.g. {D0,D1}) registers by using vld1.8/vst1.8.
9954    // A big-endian target may also explicitly support unaligned accesses.
9955    if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
9956      if (Fast)
9957        *Fast = true;
9958      return true;
9959    }
9960    return false;
9961  }
9962  }
9963}
9964
9965static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
9966                       unsigned AlignCheck) {
9967  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
9968          (DstAlign == 0 || DstAlign % AlignCheck == 0));
9969}
9970
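/// getOptimalMemOpType - Pick the widest profitable type for memcpy/memset
/// expansion.  For example, a 16-byte-aligned copy of at least 16 bytes on a
/// NEON-capable target uses v2f64 (typically selected as 128-bit vld1/vst1
/// pairs); smaller or less aligned operations fall back to f64, i32 or i16.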
9971EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
9972                                           unsigned DstAlign, unsigned SrcAlign,
9973                                           bool IsMemset, bool ZeroMemset,
9974                                           bool MemcpyStrSrc,
9975                                           MachineFunction &MF) const {
9976  const Function *F = MF.getFunction();
9977
9978  // See if we can use NEON instructions for this...
9979  if ((!IsMemset || ZeroMemset) &&
9980      Subtarget->hasNEON() &&
9981      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
9982                                       Attribute::NoImplicitFloat)) {
9983    bool Fast;
9984    if (Size >= 16 &&
9985        (memOpAlign(SrcAlign, DstAlign, 16) ||
9986         (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) {
9987      return MVT::v2f64;
9988    } else if (Size >= 8 &&
9989               (memOpAlign(SrcAlign, DstAlign, 8) ||
9990                (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) {
9991      return MVT::f64;
9992    }
9993  }
9994
9995  // Lowering to i32/i16 if the size permits.
9996  if (Size >= 4)
9997    return MVT::i32;
9998  else if (Size >= 2)
9999    return MVT::i16;
10000
10001  // Let the target-independent logic figure it out.
10002  return MVT::Other;
10003}
10004
10005bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
10006  if (Val.getOpcode() != ISD::LOAD)
10007    return false;
10008
10009  EVT VT1 = Val.getValueType();
10010  if (!VT1.isSimple() || !VT1.isInteger() ||
10011      !VT2.isSimple() || !VT2.isInteger())
10012    return false;
10013
10014  switch (VT1.getSimpleVT().SimpleTy) {
10015  default: break;
10016  case MVT::i1:
10017  case MVT::i8:
10018  case MVT::i16:
10019    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
10020    return true;
10021  }
10022
10023  return false;
10024}
10025
10026bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
10027  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
10028    return false;
10029
10030  if (!isTypeLegal(EVT::getEVT(Ty1)))
10031    return false;
10032
10033  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
10034
10035  // Assuming the caller doesn't have a zeroext or signext return parameter,
10036  // truncation all the way down to i1 is valid.
10037  return true;
10038}
10039
10040
10041static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
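/// isLegalT1AddressImmediate - Thumb1 load/store offsets are unsigned 5-bit
/// immediates scaled by the access size, e.g. 0..31 for byte, 0..62 (even)
/// for halfword and 0..124 (a multiple of 4) for word accesses.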
10042  if (V < 0)
10043    return false;
10044
10045  unsigned Scale = 1;
10046  switch (VT.getSimpleVT().SimpleTy) {
10047  default: return false;
10048  case MVT::i1:
10049  case MVT::i8:
10050    // Scale == 1;
10051    break;
10052  case MVT::i16:
10053    // Scale == 2;
10054    Scale = 2;
10055    break;
10056  case MVT::i32:
10057    // Scale == 4;
10058    Scale = 4;
10059    break;
10060  }
10061
10062  if ((V & (Scale - 1)) != 0)
10063    return false;
10064  V /= Scale;
10065  return V == (V & ((1LL << 5) - 1));
10066}
10067
10068static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
10069                                      const ARMSubtarget *Subtarget) {
10070  bool isNeg = false;
10071  if (V < 0) {
10072    isNeg = true;
10073    V = - V;
10074  }
10075
10076  switch (VT.getSimpleVT().SimpleTy) {
10077  default: return false;
10078  case MVT::i1:
10079  case MVT::i8:
10080  case MVT::i16:
10081  case MVT::i32:
10082    // + imm12 or - imm8
10083    if (isNeg)
10084      return V == (V & ((1LL << 8) - 1));
10085    return V == (V & ((1LL << 12) - 1));
10086  case MVT::f32:
10087  case MVT::f64:
10088    // Same as ARM mode. FIXME: NEON?
10089    if (!Subtarget->hasVFP2())
10090      return false;
10091    if ((V & 3) != 0)
10092      return false;
10093    V >>= 2;
10094    return V == (V & ((1LL << 8) - 1));
10095  }
10096}
10097
10098/// isLegalAddressImmediate - Return true if the integer value can be used
10099/// as the offset of the target addressing mode for load / store of the
10100/// given type.
10101static bool isLegalAddressImmediate(int64_t V, EVT VT,
10102                                    const ARMSubtarget *Subtarget) {
10103  if (V == 0)
10104    return true;
10105
10106  if (!VT.isSimple())
10107    return false;
10108
10109  if (Subtarget->isThumb1Only())
10110    return isLegalT1AddressImmediate(V, VT);
10111  else if (Subtarget->isThumb2())
10112    return isLegalT2AddressImmediate(V, VT, Subtarget);
10113
10114  // ARM mode.
10115  if (V < 0)
10116    V = - V;
10117  switch (VT.getSimpleVT().SimpleTy) {
10118  default: return false;
10119  case MVT::i1:
10120  case MVT::i8:
10121  case MVT::i32:
10122    // +- imm12
10123    return V == (V & ((1LL << 12) - 1));
10124  case MVT::i16:
10125    // +- imm8
10126    return V == (V & ((1LL << 8) - 1));
10127  case MVT::f32:
10128  case MVT::f64:
10129    if (!Subtarget->hasVFP2()) // FIXME: NEON?
10130      return false;
10131    if ((V & 3) != 0)
10132      return false;
10133    V >>= 2;
10134    return V == (V & ((1LL << 8) - 1));
10135  }
10136}
10137
10138bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
10139                                                      EVT VT) const {
10140  int Scale = AM.Scale;
10141  if (Scale < 0)
10142    return false;
10143
10144  switch (VT.getSimpleVT().SimpleTy) {
10145  default: return false;
10146  case MVT::i1:
10147  case MVT::i8:
10148  case MVT::i16:
10149  case MVT::i32:
10150    if (Scale == 1)
10151      return true;
10152    // r + r << imm
10153    Scale = Scale & ~1;
10154    return Scale == 2 || Scale == 4 || Scale == 8;
10155  case MVT::i64:
10156    // r + r
10157    if (((unsigned)AM.HasBaseReg + Scale) <= 2)
10158      return true;
10159    return false;
10160  case MVT::isVoid:
10161    // Note, we allow "void" uses (basically, uses that aren't loads or
10162    // stores), because ARM allows folding a scale into many arithmetic
10163    // operations.  This should be made more precise and revisited later.
10164
10165    // Allow r << imm, but the imm has to be a multiple of two.
10166    if (Scale & 1) return false;
10167    return isPowerOf2_32(Scale);
10168  }
10169}
10170
10171/// isLegalAddressingMode - Return true if the addressing mode represented
10172/// by AM is legal for this target, for a load/store of the specified type.
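/// For example, an i32 load from "r1 + (r2 << 2)" is legal in ARM mode
/// (Scale == 4, no base offset), but "r1 + (r2 << 2) + 8" is not, since ARM
/// has no reg + scaled-reg + imm addressing mode.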
10173bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
10174                                              Type *Ty) const {
10175  EVT VT = getValueType(Ty, true);
10176  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
10177    return false;
10178
10179  // Can never fold addr of global into load/store.
10180  if (AM.BaseGV)
10181    return false;
10182
10183  switch (AM.Scale) {
10184  case 0:  // no scale reg, must be "r+i" or "r", or "i".
10185    break;
10186  case 1:
10187    if (Subtarget->isThumb1Only())
10188      return false;
10189    // FALL THROUGH.
10190  default:
10191    // ARM doesn't support any R+R*scale+imm addr modes.
10192    if (AM.BaseOffs)
10193      return false;
10194
10195    if (!VT.isSimple())
10196      return false;
10197
10198    if (Subtarget->isThumb2())
10199      return isLegalT2ScaledAddressingMode(AM, VT);
10200
10201    int Scale = AM.Scale;
10202    switch (VT.getSimpleVT().SimpleTy) {
10203    default: return false;
10204    case MVT::i1:
10205    case MVT::i8:
10206    case MVT::i32:
10207      if (Scale < 0) Scale = -Scale;
10208      if (Scale == 1)
10209        return true;
10210      // r + r << imm
10211      return isPowerOf2_32(Scale & ~1);
10212    case MVT::i16:
10213    case MVT::i64:
10214      // r + r
10215      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
10216        return true;
10217      return false;
10218
10219    case MVT::isVoid:
10220      // Note, we allow "void" uses (basically, uses that aren't loads or
10221      // stores), because ARM allows folding a scale into many arithmetic
10222      // operations.  This should be made more precise and revisited later.
10223
10224      // Allow r << imm, but the imm has to be a multiple of two.
10225      if (Scale & 1) return false;
10226      return isPowerOf2_32(Scale);
10227    }
10228  }
10229  return true;
10230}
10231
10232/// isLegalICmpImmediate - Return true if the specified immediate is a legal
10233/// icmp immediate, that is, the target has icmp instructions which can compare
10234/// a register against the immediate without having to materialize the
10235/// immediate into a register.
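/// For example, Imm == -255 is legal in ARM and Thumb2 mode (cmn can encode
/// 255), but not in Thumb1, which only has cmp with an 8-bit unsigned
/// immediate.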
10236bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
10237  // Thumb2 and ARM modes can use cmn for negative immediates.
10238  if (!Subtarget->isThumb())
10239    return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1;
10240  if (Subtarget->isThumb2())
10241    return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1;
10242  // Thumb1 doesn't have cmn, and only 8-bit immediates.
10243  return Imm >= 0 && Imm <= 255;
10244}
10245
10246/// isLegalAddImmediate - Return true if the specified immediate is a legal add
10247/// *or sub* immediate, that is, the target has add or sub instructions which can
10248/// add the immediate to a register without having to materialize the
10249/// immediate into a register.
10250bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
10251  // Same encoding for add/sub, just flip the sign.
10252  int64_t AbsImm = llvm::abs64(Imm);
10253  if (!Subtarget->isThumb())
10254    return ARM_AM::getSOImmVal(AbsImm) != -1;
10255  if (Subtarget->isThumb2())
10256    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
10257  // Thumb1 only has 8-bit unsigned immediate.
10258  return AbsImm >= 0 && AbsImm <= 255;
10259}
10260
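/// getARMIndexedAddressParts - Split an ADD/SUB address into the base and
/// offset used by ARM-mode pre/post-indexed loads and stores: AddrMode3
/// (register or +/- 8-bit immediate) for i16 and sign-extending i8/i1
/// accesses, AddrMode2 (optionally shifted register or +/- 12-bit immediate)
/// for i32 and the remaining i8/i1 accesses.  FP types are not handled here.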
10261static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
10262                                      bool isSEXTLoad, SDValue &Base,
10263                                      SDValue &Offset, bool &isInc,
10264                                      SelectionDAG &DAG) {
10265  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
10266    return false;
10267
10268  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
10269    // AddressingMode 3
10270    Base = Ptr->getOperand(0);
10271    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
10272      int RHSC = (int)RHS->getZExtValue();
10273      if (RHSC < 0 && RHSC > -256) {
10274        assert(Ptr->getOpcode() == ISD::ADD);
10275        isInc = false;
10276        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
10277        return true;
10278      }
10279    }
10280    isInc = (Ptr->getOpcode() == ISD::ADD);
10281    Offset = Ptr->getOperand(1);
10282    return true;
10283  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
10284    // AddressingMode 2
10285    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
10286      int RHSC = (int)RHS->getZExtValue();
10287      if (RHSC < 0 && RHSC > -0x1000) {
10288        assert(Ptr->getOpcode() == ISD::ADD);
10289        isInc = false;
10290        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
10291        Base = Ptr->getOperand(0);
10292        return true;
10293      }
10294    }
10295
10296    if (Ptr->getOpcode() == ISD::ADD) {
10297      isInc = true;
10298      ARM_AM::ShiftOpc ShOpcVal =
10299        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
10300      if (ShOpcVal != ARM_AM::no_shift) {
10301        Base = Ptr->getOperand(1);
10302        Offset = Ptr->getOperand(0);
10303      } else {
10304        Base = Ptr->getOperand(0);
10305        Offset = Ptr->getOperand(1);
10306      }
10307      return true;
10308    }
10309
10310    isInc = (Ptr->getOpcode() == ISD::ADD);
10311    Base = Ptr->getOperand(0);
10312    Offset = Ptr->getOperand(1);
10313    return true;
10314  }
10315
10316  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
10317  return false;
10318}
10319
10320static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
10321                                     bool isSEXTLoad, SDValue &Base,
10322                                     SDValue &Offset, bool &isInc,
10323                                     SelectionDAG &DAG) {
10324  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
10325    return false;
10326
10327  Base = Ptr->getOperand(0);
10328  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
10329    int RHSC = (int)RHS->getZExtValue();
10330    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
10331      assert(Ptr->getOpcode() == ISD::ADD);
10332      isInc = false;
10333      Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
10334      return true;
10335    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
10336      isInc = Ptr->getOpcode() == ISD::ADD;
10337      Offset = DAG.getConstant(RHSC, RHS->getValueType(0));
10338      return true;
10339    }
10340  }
10341
10342  return false;
10343}
10344
10345/// getPreIndexedAddressParts - returns true by value, base pointer and
10346/// offset pointer and addressing mode by reference if the node's address
10347/// can be legally represented as pre-indexed load / store address.
10348bool
10349ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
10350                                             SDValue &Offset,
10351                                             ISD::MemIndexedMode &AM,
10352                                             SelectionDAG &DAG) const {
10353  if (Subtarget->isThumb1Only())
10354    return false;
10355
10356  EVT VT;
10357  SDValue Ptr;
10358  bool isSEXTLoad = false;
10359  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
10360    Ptr = LD->getBasePtr();
10361    VT  = LD->getMemoryVT();
10362    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
10363  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
10364    Ptr = ST->getBasePtr();
10365    VT  = ST->getMemoryVT();
10366  } else
10367    return false;
10368
10369  bool isInc;
10370  bool isLegal = false;
10371  if (Subtarget->isThumb2())
10372    isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
10373                                       Offset, isInc, DAG);
10374  else
10375    isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
10376                                        Offset, isInc, DAG);
10377  if (!isLegal)
10378    return false;
10379
10380  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
10381  return true;
10382}
10383
10384/// getPostIndexedAddressParts - returns true by value, base pointer and
10385/// offset pointer and addressing mode by reference if this node can be
10386/// combined with a load / store to form a post-indexed load / store.
10387bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
10388                                                   SDValue &Base,
10389                                                   SDValue &Offset,
10390                                                   ISD::MemIndexedMode &AM,
10391                                                   SelectionDAG &DAG) const {
10392  if (Subtarget->isThumb1Only())
10393    return false;
10394
10395  EVT VT;
10396  SDValue Ptr;
10397  bool isSEXTLoad = false;
10398  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
10399    VT  = LD->getMemoryVT();
10400    Ptr = LD->getBasePtr();
10401    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
10402  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
10403    VT  = ST->getMemoryVT();
10404    Ptr = ST->getBasePtr();
10405  } else
10406    return false;
10407
10408  bool isInc;
10409  bool isLegal = false;
10410  if (Subtarget->isThumb2())
10411    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
10412                                       isInc, DAG);
10413  else
10414    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
10415                                        isInc, DAG);
10416  if (!isLegal)
10417    return false;
10418
10419  if (Ptr != Base) {
10420    // Swap base ptr and offset to catch more post-index load / store when
10421    // it's legal. In Thumb2 mode, offset must be an immediate.
10422    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
10423        !Subtarget->isThumb2())
10424      std::swap(Base, Offset);
10425
10426    // Post-indexed load / store update the base pointer.
10427    if (Ptr != Base)
10428      return false;
10429  }
10430
10431  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
10432  return true;
10433}
10434
10435void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
10436                                                       APInt &KnownZero,
10437                                                       APInt &KnownOne,
10438                                                       const SelectionDAG &DAG,
10439                                                       unsigned Depth) const {
10440  unsigned BitWidth = KnownOne.getBitWidth();
10441  KnownZero = KnownOne = APInt(BitWidth, 0);
10442  switch (Op.getOpcode()) {
10443  default: break;
10444  case ARMISD::ADDC:
10445  case ARMISD::ADDE:
10446  case ARMISD::SUBC:
10447  case ARMISD::SUBE:
10448    // These nodes' second result is a boolean
10449    if (Op.getResNo() == 0)
10450      break;
10451    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
10452    break;
10453  case ARMISD::CMOV: {
10454    // Bits are known zero/one if known on the LHS and RHS.
10455    DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
10456    if (KnownZero == 0 && KnownOne == 0) return;
10457
10458    APInt KnownZeroRHS, KnownOneRHS;
10459    DAG.ComputeMaskedBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
10460    KnownZero &= KnownZeroRHS;
10461    KnownOne  &= KnownOneRHS;
10462    return;
10463  }
10464  }
10465}
10466
10467//===----------------------------------------------------------------------===//
10468//                           ARM Inline Assembly Support
10469//===----------------------------------------------------------------------===//
10470
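/// ExpandInlineAsm - Recognize single-instruction inline asm that merely
/// byte-swaps a 32-bit value, roughly
///   call i32 asm "rev $0, $1", "=l,l"(i32 %x)
/// and replace it with the generic llvm.bswap intrinsic so the optimizer can
/// see through it.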
10471bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
10472  // Looking for "rev" which is V6+.
10473  if (!Subtarget->hasV6Ops())
10474    return false;
10475
10476  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
10477  std::string AsmStr = IA->getAsmString();
10478  SmallVector<StringRef, 4> AsmPieces;
10479  SplitString(AsmStr, AsmPieces, ";\n");
10480
10481  switch (AsmPieces.size()) {
10482  default: return false;
10483  case 1:
10484    AsmStr = AsmPieces[0];
10485    AsmPieces.clear();
10486    SplitString(AsmStr, AsmPieces, " \t,");
10487
10488    // rev $0, $1
10489    if (AsmPieces.size() == 3 &&
10490        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
10491        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
10492      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
10493      if (Ty && Ty->getBitWidth() == 32)
10494        return IntrinsicLowering::LowerToByteSwap(CI);
10495    }
10496    break;
10497  }
10498
10499  return false;
10500}
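
// Illustrative IR example: a 32-bit integer inline-asm call of the form
//
//   %r = call i32 asm "rev $0, $1", "=l,l"(i32 %x)
//
// matches the pattern above and is rewritten into the target-independent
// llvm.bswap.i32 intrinsic, which the backend is then free to select as it
// sees fit.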
10501
10502/// getConstraintType - Given a constraint letter, return the type of
10503/// constraint it is for this target.
10504ARMTargetLowering::ConstraintType
10505ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
10506  if (Constraint.size() == 1) {
10507    switch (Constraint[0]) {
10508    default:  break;
10509    case 'l': return C_RegisterClass;
10510    case 'w': return C_RegisterClass;
10511    case 'h': return C_RegisterClass;
10512    case 'x': return C_RegisterClass;
10513    case 't': return C_RegisterClass;
10514    case 'j': return C_Other; // Constant for movw.
10515      // An address with a single base register. Due to the way we currently
10516      // handle addresses, it is the same as an 'r' memory constraint.
10517    case 'Q': return C_Memory;
10518    }
10519  } else if (Constraint.size() == 2) {
10520    switch (Constraint[0]) {
10521    default: break;
10522    // All constraints that start with 'U' are memory addresses.
10523    case 'U': return C_Memory;
10524    }
10525  }
10526  return TargetLowering::getConstraintType(Constraint);
10527}
10528
10529/// Examine constraint type and operand type and determine a weight value.
10530/// This object must already have been set up with the operand type
10531/// and the current alternative constraint selected.
10532TargetLowering::ConstraintWeight
10533ARMTargetLowering::getSingleConstraintMatchWeight(
10534    AsmOperandInfo &info, const char *constraint) const {
10535  ConstraintWeight weight = CW_Invalid;
10536  Value *CallOperandVal = info.CallOperandVal;
10537  // If we don't have a value, we can't do a match,
10538  // but allow it at the lowest weight.
10539  if (CallOperandVal == NULL)
10540    return CW_Default;
10541  Type *type = CallOperandVal->getType();
10542  // Look at the constraint type.
10543  switch (*constraint) {
10544  default:
10545    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10546    break;
10547  case 'l':
10548    if (type->isIntegerTy()) {
10549      if (Subtarget->isThumb())
10550        weight = CW_SpecificReg;
10551      else
10552        weight = CW_Register;
10553    }
10554    break;
10555  case 'w':
10556    if (type->isFloatingPointTy())
10557      weight = CW_Register;
10558    break;
10559  }
10560  return weight;
10561}
10562
10563typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
10564RCPair
10565ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
10566                                                MVT VT) const {
10567  if (Constraint.size() == 1) {
10568    // GCC ARM Constraint Letters
10569    switch (Constraint[0]) {
10570    case 'l': // Low regs or general regs.
10571      if (Subtarget->isThumb())
10572        return RCPair(0U, &ARM::tGPRRegClass);
10573      return RCPair(0U, &ARM::GPRRegClass);
10574    case 'h': // High regs or no regs.
10575      if (Subtarget->isThumb())
10576        return RCPair(0U, &ARM::hGPRRegClass);
10577      break;
10578    case 'r':
10579      return RCPair(0U, &ARM::GPRRegClass);
10580    case 'w':
10581      if (VT == MVT::f32)
10582        return RCPair(0U, &ARM::SPRRegClass);
10583      if (VT.getSizeInBits() == 64)
10584        return RCPair(0U, &ARM::DPRRegClass);
10585      if (VT.getSizeInBits() == 128)
10586        return RCPair(0U, &ARM::QPRRegClass);
10587      break;
10588    case 'x':
10589      if (VT == MVT::f32)
10590        return RCPair(0U, &ARM::SPR_8RegClass);
10591      if (VT.getSizeInBits() == 64)
10592        return RCPair(0U, &ARM::DPR_8RegClass);
10593      if (VT.getSizeInBits() == 128)
10594        return RCPair(0U, &ARM::QPR_8RegClass);
10595      break;
10596    case 't':
10597      if (VT == MVT::f32)
10598        return RCPair(0U, &ARM::SPRRegClass);
10599      break;
10600    }
10601  }
10602  if (StringRef("{cc}").equals_lower(Constraint))
10603    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
10604
10605  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
10606}
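
// Example mapping (with placeholder operands): in GCC-style inline assembly
// such as
//
//   asm("vadd.f32 %0, %1, %2" : "=w"(d) : "w"(a), "w"(b));
//
// each 'w' operand of type float is assigned the SPR register class above,
// while 64-bit and 128-bit vector operands map to the DPR and QPR classes.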
10607
10608/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
10609/// vector.  If it is invalid, don't add anything to Ops.
10610void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
10611                                                     std::string &Constraint,
10612                                                     std::vector<SDValue>&Ops,
10613                                                     SelectionDAG &DAG) const {
10614  SDValue Result(0, 0);
10615
10616  // Currently only support length 1 constraints.
10617  if (Constraint.length() != 1) return;
10618
10619  char ConstraintLetter = Constraint[0];
10620  switch (ConstraintLetter) {
10621  default: break;
10622  case 'j':
10623  case 'I': case 'J': case 'K': case 'L':
10624  case 'M': case 'N': case 'O':
10625    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
10626    if (!C)
10627      return;
10628
10629    int64_t CVal64 = C->getSExtValue();
10630    int CVal = (int) CVal64;
10631    // None of these constraints allow values larger than 32 bits.  Check
10632    // that the value fits in an int.
10633    if (CVal != CVal64)
10634      return;
10635
10636    switch (ConstraintLetter) {
10637      case 'j':
10638        // Constant suitable for movw; it must be between 0 and
10639        // 65535.
10640        if (Subtarget->hasV6T2Ops())
10641          if (CVal >= 0 && CVal <= 65535)
10642            break;
10643        return;
10644      case 'I':
10645        if (Subtarget->isThumb1Only()) {
10646          // This must be a constant between 0 and 255, for ADD
10647          // immediates.
10648          if (CVal >= 0 && CVal <= 255)
10649            break;
10650        } else if (Subtarget->isThumb2()) {
10651          // A constant that can be used as an immediate value in a
10652          // data-processing instruction.
10653          if (ARM_AM::getT2SOImmVal(CVal) != -1)
10654            break;
10655        } else {
10656          // A constant that can be used as an immediate value in a
10657          // data-processing instruction.
10658          if (ARM_AM::getSOImmVal(CVal) != -1)
10659            break;
10660        }
10661        return;
10662
10663      case 'J':
10664        if (Subtarget->isThumb()) {  // FIXME thumb2
10665          // This must be a constant between -255 and -1, for negated ADD
10666          // immediates. This can be used in GCC with an "n" modifier that
10667          // prints the negated value, for use with SUB instructions. It is
10668          // not useful otherwise but is implemented for compatibility.
10669          if (CVal >= -255 && CVal <= -1)
10670            break;
10671        } else {
10672          // This must be a constant between -4095 and 4095. It is not clear
10673          // what this constraint is intended for. Implemented for
10674          // compatibility with GCC.
10675          if (CVal >= -4095 && CVal <= 4095)
10676            break;
10677        }
10678        return;
10679
10680      case 'K':
10681        if (Subtarget->isThumb1Only()) {
10682          // A 32-bit value where only one byte has a nonzero value. Exclude
10683          // zero to match GCC. This constraint is used by GCC internally for
10684          // constants that can be loaded with a move/shift combination.
10685          // It is not useful otherwise but is implemented for compatibility.
10686          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
10687            break;
10688        } else if (Subtarget->isThumb2()) {
10689          // A constant whose bitwise inverse can be used as an immediate
10690          // value in a data-processing instruction. This can be used in GCC
10691          // with a "B" modifier that prints the inverted value, for use with
10692          // BIC and MVN instructions. It is not useful otherwise but is
10693          // implemented for compatibility.
10694          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
10695            break;
10696        } else {
10697          // A constant whose bitwise inverse can be used as an immediate
10698          // value in a data-processing instruction. This can be used in GCC
10699          // with a "B" modifier that prints the inverted value, for use with
10700          // BIC and MVN instructions. It is not useful otherwise but is
10701          // implemented for compatibility.
10702          if (ARM_AM::getSOImmVal(~CVal) != -1)
10703            break;
10704        }
10705        return;
10706
10707      case 'L':
10708        if (Subtarget->isThumb1Only()) {
10709          // This must be a constant between -7 and 7,
10710          // for 3-operand ADD/SUB immediate instructions.
10711          if (CVal >= -7 && CVal <= 7)
10712            break;
10713        } else if (Subtarget->isThumb2()) {
10714          // A constant whose negation can be used as an immediate value in a
10715          // data-processing instruction. This can be used in GCC with an "n"
10716          // modifier that prints the negated value, for use with SUB
10717          // instructions. It is not useful otherwise but is implemented for
10718          // compatibility.
10719          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
10720            break;
10721        } else {
10722          // A constant whose negation can be used as an immediate value in a
10723          // data-processing instruction. This can be used in GCC with an "n"
10724          // modifier that prints the negated value, for use with SUB
10725          // instructions. It is not useful otherwise but is implemented for
10726          // compatibility.
10727          if (ARM_AM::getSOImmVal(-CVal) != -1)
10728            break;
10729        }
10730        return;
10731
10732      case 'M':
10733        if (Subtarget->isThumb()) { // FIXME thumb2
10734          // This must be a multiple of 4 between 0 and 1020, for
10735          // ADD sp + immediate.
10736          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
10737            break;
10738        } else {
10739          // A power of two or a constant between 0 and 32.  This is used in
10740          // GCC for the shift amount on shifted register operands, but it is
10741          // useful in general for any shift amounts.
10742          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
10743            break;
10744        }
10745        return;
10746
10747      case 'N':
10748        if (Subtarget->isThumb()) {  // FIXME thumb2
10749          // This must be a constant between 0 and 31, for shift amounts.
10750          if (CVal >= 0 && CVal <= 31)
10751            break;
10752        }
10753        return;
10754
10755      case 'O':
10756        if (Subtarget->isThumb()) {  // FIXME thumb2
10757          // This must be a multiple of 4 between -508 and 508, for
10758          // ADD/SUB sp = sp + immediate.
10759          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
10760            break;
10761        }
10762        return;
10763    }
10764    Result = DAG.getTargetConstant(CVal, Op.getValueType());
10765    break;
10766  }
10767
10768  if (Result.getNode()) {
10769    Ops.push_back(Result);
10770    return;
10771  }
10772  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
10773}
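
// Illustrative example (placeholder operands) for the 'I' constraint in ARM
// mode:
//
//   asm("add %0, %1, %2" : "=r"(res) : "r"(a), "I"(255));
//
// succeeds because 255 is encodable as a data-processing immediate
// (ARM_AM::getSOImmVal(255) != -1), whereas a value such as 257 cannot be
// encoded, so no target constant is produced for it here.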
10774
10775SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
10776  assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only");
10777  unsigned Opcode = Op->getOpcode();
10778  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
10779      "Invalid opcode for Div/Rem lowering");
10780  bool isSigned = (Opcode == ISD::SDIVREM);
10781  EVT VT = Op->getValueType(0);
10782  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
10783
10784  RTLIB::Libcall LC;
10785  switch (VT.getSimpleVT().SimpleTy) {
10786  default: llvm_unreachable("Unexpected request for libcall!");
10787  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
10788  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
10789  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
10790  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
10791  }
10792
10793  SDValue InChain = DAG.getEntryNode();
10794
10795  TargetLowering::ArgListTy Args;
10796  TargetLowering::ArgListEntry Entry;
10797  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
10798    EVT ArgVT = Op->getOperand(i).getValueType();
10799    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
10800    Entry.Node = Op->getOperand(i);
10801    Entry.Ty = ArgTy;
10802    Entry.isSExt = isSigned;
10803    Entry.isZExt = !isSigned;
10804    Args.push_back(Entry);
10805  }
10806
10807  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
10808                                         getPointerTy());
10809
10810  Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL);
10811
10812  SDLoc dl(Op);
10813  TargetLowering::
10814  CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, true,
10815                    0, getLibcallCallingConv(LC), /*isTailCall=*/false,
10816                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
10817                    Callee, Args, DAG, dl);
10818  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
10819
10820  return CallInfo.first;
10821}
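
// Note (informal): on AEABI targets the i32 cases above resolve to the
// runtime helpers __aeabi_idivmod / __aeabi_uidivmod, which return the
// quotient and remainder together (quotient in r0, remainder in r1), so one
// call serves both results of ISD::SDIVREM / ISD::UDIVREM.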
10822
10823bool
10824ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
10825  // The ARM target isn't yet aware of offsets.
10826  return false;
10827}
10828
10829bool ARM::isBitFieldInvertedMask(unsigned v) {
10830  if (v == 0xffffffff)
10831    return false;
10832
10833  // There can be 1's on either or both "outsides"; all the "inside"
10834  // bits must be 0's.
10835  unsigned TO = CountTrailingOnes_32(v);
10836  unsigned LO = CountLeadingOnes_32(v);
10837  v = (v >> TO) << TO;
10838  v = (v << LO) >> LO;
10839  return v == 0;
10840}
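
// For example, 0xff0000ff and 0xf000000f pass this test (ones only on the
// outside with a contiguous run of zeros in the middle), while 0x00ffff00 and
// 0xffffffff do not.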
10841
10842/// isFPImmLegal - Returns true if the target can instruction select the
10843/// specified FP immediate natively. If false, the legalizer will
10844/// materialize the FP immediate as a load from a constant pool.
10845bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
10846  if (!Subtarget->hasVFP3())
10847    return false;
10848  if (VT == MVT::f32)
10849    return ARM_AM::getFP32Imm(Imm) != -1;
10850  if (VT == MVT::f64)
10851    return ARM_AM::getFP64Imm(Imm) != -1;
10852  return false;
10853}
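
// Informally, the VFPv3 VMOV (immediate) encoding covers values of the form
// +/- n * 2^-r with 16 <= n <= 31 and 0 <= r <= 7, so constants such as 1.0,
// 0.5 and 2.0 are legal here, while 0.0 and 0.1 are not and must be loaded
// from a constant pool instead.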
10854
10855/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
10856/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
10857/// specified in the intrinsic calls.
10858bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
10859                                           const CallInst &I,
10860                                           unsigned Intrinsic) const {
10861  switch (Intrinsic) {
10862  case Intrinsic::arm_neon_vld1:
10863  case Intrinsic::arm_neon_vld2:
10864  case Intrinsic::arm_neon_vld3:
10865  case Intrinsic::arm_neon_vld4:
10866  case Intrinsic::arm_neon_vld2lane:
10867  case Intrinsic::arm_neon_vld3lane:
10868  case Intrinsic::arm_neon_vld4lane: {
10869    Info.opc = ISD::INTRINSIC_W_CHAIN;
10870    // Conservatively set memVT to the entire set of vectors loaded.
10871    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
10872    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
10873    Info.ptrVal = I.getArgOperand(0);
10874    Info.offset = 0;
10875    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
10876    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
10877    Info.vol = false; // volatile loads with NEON intrinsics not supported
10878    Info.readMem = true;
10879    Info.writeMem = false;
10880    return true;
10881  }
10882  case Intrinsic::arm_neon_vst1:
10883  case Intrinsic::arm_neon_vst2:
10884  case Intrinsic::arm_neon_vst3:
10885  case Intrinsic::arm_neon_vst4:
10886  case Intrinsic::arm_neon_vst2lane:
10887  case Intrinsic::arm_neon_vst3lane:
10888  case Intrinsic::arm_neon_vst4lane: {
10889    Info.opc = ISD::INTRINSIC_VOID;
10890    // Conservatively set memVT to the entire set of vectors stored.
10891    unsigned NumElts = 0;
10892    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
10893      Type *ArgTy = I.getArgOperand(ArgI)->getType();
10894      if (!ArgTy->isVectorTy())
10895        break;
10896      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
10897    }
10898    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
10899    Info.ptrVal = I.getArgOperand(0);
10900    Info.offset = 0;
10901    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
10902    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
10903    Info.vol = false; // volatile stores with NEON intrinsics not supported
10904    Info.readMem = false;
10905    Info.writeMem = true;
10906    return true;
10907  }
10908  case Intrinsic::arm_ldrex: {
10909    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
10910    Info.opc = ISD::INTRINSIC_W_CHAIN;
10911    Info.memVT = MVT::getVT(PtrTy->getElementType());
10912    Info.ptrVal = I.getArgOperand(0);
10913    Info.offset = 0;
10914    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
10915    Info.vol = true;
10916    Info.readMem = true;
10917    Info.writeMem = false;
10918    return true;
10919  }
10920  case Intrinsic::arm_strex: {
10921    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
10922    Info.opc = ISD::INTRINSIC_W_CHAIN;
10923    Info.memVT = MVT::getVT(PtrTy->getElementType());
10924    Info.ptrVal = I.getArgOperand(1);
10925    Info.offset = 0;
10926    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
10927    Info.vol = true;
10928    Info.readMem = false;
10929    Info.writeMem = true;
10930    return true;
10931  }
10932  case Intrinsic::arm_strexd: {
10933    Info.opc = ISD::INTRINSIC_W_CHAIN;
10934    Info.memVT = MVT::i64;
10935    Info.ptrVal = I.getArgOperand(2);
10936    Info.offset = 0;
10937    Info.align = 8;
10938    Info.vol = true;
10939    Info.readMem = false;
10940    Info.writeMem = true;
10941    return true;
10942  }
10943  case Intrinsic::arm_ldrexd: {
10944    Info.opc = ISD::INTRINSIC_W_CHAIN;
10945    Info.memVT = MVT::i64;
10946    Info.ptrVal = I.getArgOperand(0);
10947    Info.offset = 0;
10948    Info.align = 8;
10949    Info.vol = true;
10950    Info.readMem = true;
10951    Info.writeMem = false;
10952    return true;
10953  }
10954  default:
10955    break;
10956  }
10957
10958  return false;
10959}
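
// Illustrative example: for a call to llvm.arm.neon.vld2.v4i32, the returned
// aggregate {<4 x i32>, <4 x i32>} occupies 32 bytes, so the code above
// conservatively reports memVT = v4i64 (covering all 32 bytes), with the
// pointer operand as ptrVal and the alignment taken from the intrinsic's
// trailing alignment argument.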
10960