ARMISelLowering.cpp revision 05e80f27148b1dc19925755d56b6466df840da44
1//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that ARM uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "arm-isel"
16#include "ARMISelLowering.h"
17#include "ARM.h"
18#include "ARMCallingConv.h"
19#include "ARMConstantPoolValue.h"
20#include "ARMMachineFunctionInfo.h"
21#include "ARMPerfectShuffle.h"
22#include "ARMSubtarget.h"
23#include "ARMTargetMachine.h"
24#include "ARMTargetObjectFile.h"
25#include "MCTargetDesc/ARMAddressingModes.h"
26#include "llvm/CallingConv.h"
27#include "llvm/Constants.h"
28#include "llvm/Function.h"
29#include "llvm/GlobalValue.h"
30#include "llvm/Instruction.h"
31#include "llvm/Instructions.h"
32#include "llvm/Intrinsics.h"
33#include "llvm/Type.h"
34#include "llvm/CodeGen/CallingConvLower.h"
35#include "llvm/CodeGen/IntrinsicLowering.h"
36#include "llvm/CodeGen/MachineBasicBlock.h"
37#include "llvm/CodeGen/MachineFrameInfo.h"
38#include "llvm/CodeGen/MachineFunction.h"
39#include "llvm/CodeGen/MachineInstrBuilder.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/SelectionDAG.h"
43#include "llvm/MC/MCSectionMachO.h"
44#include "llvm/Target/TargetOptions.h"
45#include "llvm/ADT/StringExtras.h"
46#include "llvm/ADT/Statistic.h"
47#include "llvm/Support/CommandLine.h"
48#include "llvm/Support/ErrorHandling.h"
49#include "llvm/Support/MathExtras.h"
50#include "llvm/Support/raw_ostream.h"
51using namespace llvm;
52
53STATISTIC(NumTailCalls, "Number of tail calls");
54STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
55STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
56
57// This option should go away when tail calls fully work.
58static cl::opt<bool>
59EnableARMTailCalls("arm-tail-calls", cl::Hidden,
60  cl::desc("Generate tail calls (TEMPORARY OPTION)."),
61  cl::init(false));
62
63cl::opt<bool>
64EnableARMLongCalls("arm-long-calls", cl::Hidden,
65  cl::desc("Generate calls via indirect call instructions"),
66  cl::init(false));
67
68static cl::opt<bool>
69ARMInterworking("arm-interworking", cl::Hidden,
70  cl::desc("Enable / disable ARM interworking (for debugging only)"),
71  cl::init(true));
72
73namespace {
74  class ARMCCState : public CCState {
75  public:
76    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
77               const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs,
78               LLVMContext &C, ParmContext PC)
79        : CCState(CC, isVarArg, MF, TM, locs, C) {
80      assert(((PC == Call) || (PC == Prologue)) &&
81             "ARMCCState users must specify whether their context is call"
82             "or prologue generation.");
83      CallOrPrologue = PC;
84    }
85  };
86}
87
88// The APCS parameter registers.
89static const uint16_t GPRArgRegs[] = {
90  ARM::R0, ARM::R1, ARM::R2, ARM::R3
91};
92
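/// addTypeForNEON - Register the operation actions that are common to every
/// NEON vector type: promote loads, stores and bitwise operations to the given
/// promoted types, custom-lower the lane, shuffle, shift and conversion
/// operations, and expand everything NEON has no instructions for (divide,
/// remainder, select, ...).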
93void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
94                                       MVT PromotedBitwiseVT) {
95  if (VT != PromotedLdStVT) {
96    setOperationAction(ISD::LOAD, VT, Promote);
97    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
98
99    setOperationAction(ISD::STORE, VT, Promote);
100    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
101  }
102
103  MVT ElemTy = VT.getVectorElementType();
104  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
105    setOperationAction(ISD::SETCC, VT, Custom);
106  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
107  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
108  if (ElemTy == MVT::i32) {
109    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
110    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
111    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
112    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
113  } else {
114    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
115    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
116    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
117    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
118  }
119  setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
120  setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
121  setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
122  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
123  setOperationAction(ISD::SELECT,            VT, Expand);
124  setOperationAction(ISD::SELECT_CC,         VT, Expand);
125  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
126  if (VT.isInteger()) {
127    setOperationAction(ISD::SHL, VT, Custom);
128    setOperationAction(ISD::SRA, VT, Custom);
129    setOperationAction(ISD::SRL, VT, Custom);
130  }
131
132  // Promote all bit-wise operations.
133  if (VT.isInteger() && VT != PromotedBitwiseVT) {
134    setOperationAction(ISD::AND, VT, Promote);
135    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
136    setOperationAction(ISD::OR,  VT, Promote);
137    AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
138    setOperationAction(ISD::XOR, VT, Promote);
139    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
140  }
141
142  // Neon does not support vector divide/remainder operations.
143  setOperationAction(ISD::SDIV, VT, Expand);
144  setOperationAction(ISD::UDIV, VT, Expand);
145  setOperationAction(ISD::FDIV, VT, Expand);
146  setOperationAction(ISD::SREM, VT, Expand);
147  setOperationAction(ISD::UREM, VT, Expand);
148  setOperationAction(ISD::FREM, VT, Expand);
149}
150
151void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
152  addRegisterClass(VT, &ARM::DPRRegClass);
153  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
154}
155
156void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
157  addRegisterClass(VT, &ARM::QPRRegClass);
158  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
159}
160
161static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
162  if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
163    return new TargetLoweringObjectFileMachO();
164
165  return new ARMElfTargetObjectFile();
166}
167
168ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
169    : TargetLowering(TM, createTLOF(TM)) {
170  Subtarget = &TM.getSubtarget<ARMSubtarget>();
171  RegInfo = TM.getRegisterInfo();
172  Itins = TM.getInstrItineraryData();
173
174  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
175
176  if (Subtarget->isTargetDarwin()) {
177    // Uses VFP for Thumb libfuncs if available.
178    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
179      // Single-precision floating-point arithmetic.
180      setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
181      setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
182      setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
183      setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
184
185      // Double-precision floating-point arithmetic.
186      setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
187      setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
188      setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
189      setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
190
191      // Single-precision comparisons.
192      setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
193      setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
194      setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
195      setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
196      setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
197      setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
198      setLibcallName(RTLIB::UO_F32,  "__unordsf2vfp");
199      setLibcallName(RTLIB::O_F32,   "__unordsf2vfp");
200
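      // These helpers return a nonzero value when the relation holds, so the
      // condition registered with setCmpLibcallCC below is the one used to
      // test the libcall result against zero; e.g. OEQ_F32 effectively becomes
      // "call __eqsf2vfp, then check result != 0".  This is also why the
      // ordered (O_*) and unordered (UO_*) predicates can share one helper
      // with opposite conditions.  The double-precision and AEABI comparison
      // helpers use the same scheme.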
201      setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
202      setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
203      setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
204      setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
205      setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
206      setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
207      setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
208      setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
209
210      // Double-precision comparisons.
211      setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
212      setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
213      setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
214      setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
215      setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
216      setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
217      setLibcallName(RTLIB::UO_F64,  "__unorddf2vfp");
218      setLibcallName(RTLIB::O_F64,   "__unorddf2vfp");
219
220      setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
221      setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
222      setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
223      setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
224      setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
225      setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
226      setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
227      setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
228
229      // Floating-point to integer conversions.
230      // i64 conversions are done via library routines even when generating VFP
231      // instructions, so use the same ones.
232      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
233      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
234      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
235      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
236
237      // Conversions between floating types.
238      setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
239      setLibcallName(RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp");
240
241      // Integer to floating-point conversions.
242      // i64 conversions are done via library routines even when generating VFP
243      // instructions, so use the same ones.
244      // FIXME: There appears to be some naming inconsistency in ARM libgcc:
245      // e.g., __floatunsidf vs. __floatunssidfvfp.
246      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
247      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
248      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
249      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
250    }
251  }
252
253  // These libcalls are not available on 32-bit targets.
254  setLibcallName(RTLIB::SHL_I128, 0);
255  setLibcallName(RTLIB::SRL_I128, 0);
256  setLibcallName(RTLIB::SRA_I128, 0);
257
258  if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetDarwin()) {
259    // Double-precision floating-point arithmetic helper functions
260    // RTABI chapter 4.1.2, Table 2
261    setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
262    setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv");
263    setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul");
264    setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub");
265    setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS);
266    setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS);
267    setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS);
268    setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS);
269
270    // Double-precision floating-point comparison helper functions
271    // RTABI chapter 4.1.2, Table 3
272    setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq");
273    setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
274    setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq");
275    setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ);
276    setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt");
277    setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
278    setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple");
279    setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
280    setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge");
281    setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
282    setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt");
283    setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
284    setLibcallName(RTLIB::UO_F64,  "__aeabi_dcmpun");
285    setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
286    setLibcallName(RTLIB::O_F64,   "__aeabi_dcmpun");
287    setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
288    setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS);
289    setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS);
290    setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS);
291    setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS);
292    setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS);
293    setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS);
294    setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS);
295    setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS);
296
297    // Single-precision floating-point arithmetic helper functions
298    // RTABI chapter 4.1.2, Table 4
299    setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd");
300    setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv");
301    setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul");
302    setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub");
303    setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS);
304    setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS);
305    setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS);
306    setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS);
307
308    // Single-precision floating-point comparison helper functions
309    // RTABI chapter 4.1.2, Table 5
310    setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq");
311    setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
312    setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq");
313    setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ);
314    setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt");
315    setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
316    setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple");
317    setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
318    setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge");
319    setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
320    setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt");
321    setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
322    setLibcallName(RTLIB::UO_F32,  "__aeabi_fcmpun");
323    setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
324    setLibcallName(RTLIB::O_F32,   "__aeabi_fcmpun");
325    setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
326    setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS);
327    setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS);
328    setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS);
329    setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS);
330    setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS);
331    setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS);
332    setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS);
333    setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS);
334
335    // Floating-point to integer conversions.
336    // RTABI chapter 4.1.2, Table 6
337    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz");
338    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz");
339    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz");
340    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz");
341    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz");
342    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz");
343    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz");
344    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz");
345    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS);
346    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS);
347    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS);
348    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS);
349    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS);
350    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS);
351    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS);
352    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS);
353
354    // Conversions between floating types.
355    // RTABI chapter 4.1.2, Table 7
356    setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f");
357    setLibcallName(RTLIB::FPEXT_F32_F64,   "__aeabi_f2d");
358    setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS);
359    setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS);
360
361    // Integer to floating-point conversions.
362    // RTABI chapter 4.1.2, Table 8
363    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d");
364    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d");
365    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d");
366    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d");
367    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f");
368    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f");
369    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f");
370    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f");
371    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
372    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
373    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
374    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
375    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
376    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
377    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
378    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
379
380    // Long long helper functions
381    // RTABI chapter 4.2, Table 9
382    setLibcallName(RTLIB::MUL_I64,  "__aeabi_lmul");
383    setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl");
384    setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr");
385    setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr");
386    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS);
387    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
388    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
389    setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS);
390    setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS);
391    setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS);
392
393    // Integer division functions
394    // RTABI chapter 4.3.1
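    // Per the run-time ABI, __aeabi_idivmod returns the quotient in r0 and the
    // remainder in r1, while __aeabi_{u}ldivmod returns the quotient in
    // {r0, r1} and the remainder in {r2, r3}; when they stand in for the plain
    // division libcalls here, only the quotient part of the result is used.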
395    setLibcallName(RTLIB::SDIV_I8,  "__aeabi_idiv");
396    setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv");
397    setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv");
398    setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod");
399    setLibcallName(RTLIB::UDIV_I8,  "__aeabi_uidiv");
400    setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv");
401    setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv");
402    setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod");
403    setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS);
404    setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS);
405    setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS);
406    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
407    setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
408    setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
409    setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
410    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
411
412    // Memory operations
413    // RTABI chapter 4.3.4
414    setLibcallName(RTLIB::MEMCPY,  "__aeabi_memcpy");
415    setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove");
416    setLibcallName(RTLIB::MEMSET,  "__aeabi_memset");
417    setLibcallCallingConv(RTLIB::MEMCPY, CallingConv::ARM_AAPCS);
418    setLibcallCallingConv(RTLIB::MEMMOVE, CallingConv::ARM_AAPCS);
419    setLibcallCallingConv(RTLIB::MEMSET, CallingConv::ARM_AAPCS);
420  }
421
422  // Use divmod compiler-rt calls for iOS 5.0 and later.
423  if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
424      !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
425    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
426    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
427  }
428
429  if (Subtarget->isThumb1Only())
430    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
431  else
432    addRegisterClass(MVT::i32, &ARM::GPRRegClass);
433  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
434      !Subtarget->isThumb1Only()) {
435    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
436    if (!Subtarget->isFPOnlySP())
437      addRegisterClass(MVT::f64, &ARM::DPRRegClass);
438
439    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
440  }
441
442  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
443       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
444    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
445         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
446      setTruncStoreAction((MVT::SimpleValueType)VT,
447                          (MVT::SimpleValueType)InnerVT, Expand);
448    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
449    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
450    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
451  }
452
453  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
454
455  if (Subtarget->hasNEON()) {
456    addDRTypeForNEON(MVT::v2f32);
457    addDRTypeForNEON(MVT::v8i8);
458    addDRTypeForNEON(MVT::v4i16);
459    addDRTypeForNEON(MVT::v2i32);
460    addDRTypeForNEON(MVT::v1i64);
461
462    addQRTypeForNEON(MVT::v4f32);
463    addQRTypeForNEON(MVT::v2f64);
464    addQRTypeForNEON(MVT::v16i8);
465    addQRTypeForNEON(MVT::v8i16);
466    addQRTypeForNEON(MVT::v4i32);
467    addQRTypeForNEON(MVT::v2i64);
468
469    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
470    // neither Neon nor VFP support any arithmetic operations on it.
471    // The same applies to v4f32, except that vadd, vsub and vmul are natively
472    // supported for v4f32.
473    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
474    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
475    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
476    // FIXME: Code duplication: FDIV and FREM are always expanded, see
477    // ARMTargetLowering::addTypeForNEON method for details.
478    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
479    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
480    // FIXME: Create unittest.
481    // In other words, find a case where "copysign" appears in the DAG with
482    // vector operands.
483    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
484    // FIXME: Code duplication: SETCC has custom operation action, see
485    // ARMTargetLowering::addTypeForNEON method for details.
486    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
487    // FIXME: Create unittest for FNEG and for FABS.
488    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
489    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
490    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
491    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
492    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
493    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
494    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
495    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
496    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
497    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
498    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
499    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
500    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
501    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
502    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
503    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
504    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
505    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
506
507    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
508    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
509    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
510    setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
511    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
512    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
513    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
514    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
515    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
516    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
517
518    // Neon does not support some operations on v1i64 and v2i64 types.
519    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
520    // Custom handling for some quad-vector types to detect VMULL.
521    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
522    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
523    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
524    // Custom handling for some vector types to avoid expensive expansions
525    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
526    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
527    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
528    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
529    setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
530    setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
531    // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
532    // a destination type that is wider than the source, nor does
533    // it have a FP_TO_[SU]INT instruction with a narrower destination than
534    // source.
535    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
536    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
537    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
538    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
539
540    setTargetDAGCombine(ISD::INTRINSIC_VOID);
541    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
542    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
543    setTargetDAGCombine(ISD::SHL);
544    setTargetDAGCombine(ISD::SRL);
545    setTargetDAGCombine(ISD::SRA);
546    setTargetDAGCombine(ISD::SIGN_EXTEND);
547    setTargetDAGCombine(ISD::ZERO_EXTEND);
548    setTargetDAGCombine(ISD::ANY_EXTEND);
549    setTargetDAGCombine(ISD::SELECT_CC);
550    setTargetDAGCombine(ISD::BUILD_VECTOR);
551    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
552    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
553    setTargetDAGCombine(ISD::STORE);
554    setTargetDAGCombine(ISD::FP_TO_SINT);
555    setTargetDAGCombine(ISD::FP_TO_UINT);
556    setTargetDAGCombine(ISD::FDIV);
557
558    // It is legal to extload from v4i8 to v4i16 or v4i32.
559    MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8,
560                  MVT::v4i16, MVT::v2i16,
561                  MVT::v2i32};
562    for (unsigned i = 0; i < 6; ++i) {
563      setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal);
564      setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal);
565      setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal);
566    }
567  }
568
569  computeRegisterProperties();
570
571  // ARM does not have f32 extending load.
572  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
573
574  // ARM does not have i1 sign extending load.
575  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
576
577  // ARM supports all 4 flavors of integer indexed load / store.
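  // "All 4 flavors" means ISD::PRE_INC, PRE_DEC, POST_INC and POST_DEC, i.e.
  // pre-indexed forms such as "ldr r0, [r1, #4]!" (offset applied before the
  // access and written back) and post-indexed forms such as "ldr r0, [r1], #4"
  // (access uses the old base, offset written back afterwards).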
578  if (!Subtarget->isThumb1Only()) {
579    for (unsigned im = (unsigned)ISD::PRE_INC;
580         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
581      setIndexedLoadAction(im,  MVT::i1,  Legal);
582      setIndexedLoadAction(im,  MVT::i8,  Legal);
583      setIndexedLoadAction(im,  MVT::i16, Legal);
584      setIndexedLoadAction(im,  MVT::i32, Legal);
585      setIndexedStoreAction(im, MVT::i1,  Legal);
586      setIndexedStoreAction(im, MVT::i8,  Legal);
587      setIndexedStoreAction(im, MVT::i16, Legal);
588      setIndexedStoreAction(im, MVT::i32, Legal);
589    }
590  }
591
592  // i64 operation support.
593  setOperationAction(ISD::MUL,     MVT::i64, Expand);
594  setOperationAction(ISD::MULHU,   MVT::i32, Expand);
595  if (Subtarget->isThumb1Only()) {
596    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
597    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
598  }
599  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
600      || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP()))
601    setOperationAction(ISD::MULHS, MVT::i32, Expand);
602
603  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
604  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
605  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
606  setOperationAction(ISD::SRL,       MVT::i64, Custom);
607  setOperationAction(ISD::SRA,       MVT::i64, Custom);
608
609  if (!Subtarget->isThumb1Only()) {
610    // FIXME: We should do this for Thumb1 as well.
611    setOperationAction(ISD::ADDC,    MVT::i32, Custom);
612    setOperationAction(ISD::ADDE,    MVT::i32, Custom);
613    setOperationAction(ISD::SUBC,    MVT::i32, Custom);
614    setOperationAction(ISD::SUBE,    MVT::i32, Custom);
615  }
616
617  // ARM does not have ROTL.
618  setOperationAction(ISD::ROTL,  MVT::i32, Expand);
619  setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
620  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
621  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
622    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
623
624  // These just redirect to CTTZ and CTLZ on ARM.
625  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i32  , Expand);
626  setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i32  , Expand);
627
628  // Only ARMv6 and above have a native byte swap (REV).
629  if (!Subtarget->hasV6Ops())
630    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
631
632  // These are expanded into libcalls.
633  if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) {
634    // v7M has a hardware divider
635    setOperationAction(ISD::SDIV,  MVT::i32, Expand);
636    setOperationAction(ISD::UDIV,  MVT::i32, Expand);
637  }
638  setOperationAction(ISD::SREM,  MVT::i32, Expand);
639  setOperationAction(ISD::UREM,  MVT::i32, Expand);
640  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
641  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
642
643  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
644  setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
645  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
646  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
647  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
648
649  setOperationAction(ISD::TRAP, MVT::Other, Legal);
650
651  // Use the default implementation.
652  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
653  setOperationAction(ISD::VAARG,              MVT::Other, Expand);
654  setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
655  setOperationAction(ISD::VAEND,              MVT::Other, Expand);
656  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
657  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
658
659  if (!Subtarget->isTargetDarwin()) {
660    // Non-Darwin platforms may return values in these registers via the
661    // personality function.
662    setOperationAction(ISD::EHSELECTION,      MVT::i32,   Expand);
663    setOperationAction(ISD::EXCEPTIONADDR,    MVT::i32,   Expand);
664    setExceptionPointerRegister(ARM::R0);
665    setExceptionSelectorRegister(ARM::R1);
666  }
667
668  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
669  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
670  // the default expansion.
671  // FIXME: This should be checking for v6k, not just v6.
672  if (Subtarget->hasDataBarrier() ||
673      (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
674    // membarrier needs custom lowering; the rest are legal and handled
675    // normally.
676    setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
677    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
678    // Custom lowering for 64-bit ops
679    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
680    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
681    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
682    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
683    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
684    setOperationAction(ISD::ATOMIC_SWAP,  MVT::i64, Custom);
685    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
686    // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
687    setInsertFencesForAtomic(true);
688  } else {
689    // Set them all for expansion, which will force libcalls.
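    // The expanded operations become calls to the __sync_* helpers (e.g.
    // __sync_fetch_and_add_4), which are assumed to be provided by libgcc or
    // compiler-rt.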
690    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
691    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
692    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
693    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
694    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
695    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
696    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
697    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
698    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
699    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
700    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
701    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
702    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
703    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
704    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
705    // Unordered/Monotonic case.
706    setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
707    setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
708    // Since the libcalls include locking, fold in the fences
709    setShouldFoldAtomicFences(true);
710  }
711
712  setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
713
714  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
715  if (!Subtarget->hasV6Ops()) {
716    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
717    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
718  }
719  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
720
721  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
722      !Subtarget->isThumb1Only()) {
723    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
724    // iff target supports vfp2.
725    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
726    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
727  }
728
729  // We want to custom lower some of our intrinsics.
730  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
731  if (Subtarget->isTargetDarwin()) {
732    setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
733    setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
734    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
735  }
736
737  setOperationAction(ISD::SETCC,     MVT::i32, Expand);
738  setOperationAction(ISD::SETCC,     MVT::f32, Expand);
739  setOperationAction(ISD::SETCC,     MVT::f64, Expand);
740  setOperationAction(ISD::SELECT,    MVT::i32, Custom);
741  setOperationAction(ISD::SELECT,    MVT::f32, Custom);
742  setOperationAction(ISD::SELECT,    MVT::f64, Custom);
743  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
744  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
745  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
746
747  setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
748  setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
749  setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
750  setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
751  setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
752
753  // We don't support sin/cos/fmod/copysign/pow
754  setOperationAction(ISD::FSIN,      MVT::f64, Expand);
755  setOperationAction(ISD::FSIN,      MVT::f32, Expand);
756  setOperationAction(ISD::FCOS,      MVT::f32, Expand);
757  setOperationAction(ISD::FCOS,      MVT::f64, Expand);
758  setOperationAction(ISD::FREM,      MVT::f64, Expand);
759  setOperationAction(ISD::FREM,      MVT::f32, Expand);
760  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
761      !Subtarget->isThumb1Only()) {
762    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
763    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
764  }
765  setOperationAction(ISD::FPOW,      MVT::f64, Expand);
766  setOperationAction(ISD::FPOW,      MVT::f32, Expand);
767
768  if (!Subtarget->hasVFP4()) {
769    setOperationAction(ISD::FMA, MVT::f64, Expand);
770    setOperationAction(ISD::FMA, MVT::f32, Expand);
771  }
772
773  // Various VFP goodness
774  if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) {
775    // int <-> fp are custom expanded into bit_convert + ARMISD ops.
776    if (Subtarget->hasVFP2()) {
777      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
778      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
779      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
780      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
781    }
782    // Special handling for half-precision FP.
783    if (!Subtarget->hasFP16()) {
784      setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
785      setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
786    }
787  }
788
789  // We have target-specific dag combine patterns for the following nodes:
790  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
791  setTargetDAGCombine(ISD::ADD);
792  setTargetDAGCombine(ISD::SUB);
793  setTargetDAGCombine(ISD::MUL);
794
795  if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) {
796    setTargetDAGCombine(ISD::AND);
797    setTargetDAGCombine(ISD::OR);
798    setTargetDAGCombine(ISD::XOR);
799  }
800
801  if (Subtarget->hasV6Ops())
802    setTargetDAGCombine(ISD::SRL);
803
804  setStackPointerRegisterToSaveRestore(ARM::SP);
805
806  if (TM.Options.UseSoftFloat || Subtarget->isThumb1Only() ||
807      !Subtarget->hasVFP2())
808    setSchedulingPreference(Sched::RegPressure);
809  else
810    setSchedulingPreference(Sched::Hybrid);
811
812  //// temporary - rewrite interface to use type
813  maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
814  maxStoresPerMemset = 16;
815  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
816
817  // On ARM arguments smaller than 4 bytes are extended, so all arguments
818  // are at least 4 bytes aligned.
819  setMinStackArgumentAlignment(4);
820
821  benefitFromCodePlacementOpt = true;
822
823  // Prefer likely predicted branches to selects on out-of-order cores.
824  predictableSelectIsExpensive = Subtarget->isCortexA9();
825
826  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
827}
828
829// FIXME: It might make sense to define the representative register class as the
830// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
831// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
832// SPR's representative would be DPR_VFP2. This should work well if register
833// pressure tracking were modified such that a register use would increment the
834// pressure of the register class's representative and all of its super
835// classes' representatives transitively. We have not implemented this because
836// of the difficulty prior to coalescing of modeling operand register classes
837// due to the common occurrence of cross class copies and subregister insertions
838// and extractions.
839std::pair<const TargetRegisterClass*, uint8_t>
840ARMTargetLowering::findRepresentativeClass(EVT VT) const{
841  const TargetRegisterClass *RRC = 0;
842  uint8_t Cost = 1;
843  switch (VT.getSimpleVT().SimpleTy) {
844  default:
845    return TargetLowering::findRepresentativeClass(VT);
846  // Use DPR as representative register class for all floating point
847// and vector types. Since there are 32 SPR registers and 32 DPR registers,
848// the cost is 1 for both f32 and f64.
849  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
850  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
851    RRC = &ARM::DPRRegClass;
852    // When NEON is used for SP, only half of the register file is available
853    // because operations that define both SP and DP results will be constrained
854    // to the VFP2 class (D0-D15). We currently model this constraint prior to
855    // coalescing by double-counting the SP regs. See the FIXME above.
856    if (Subtarget->useNEONForSinglePrecisionFP())
857      Cost = 2;
858    break;
859  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
860  case MVT::v4f32: case MVT::v2f64:
861    RRC = &ARM::DPRRegClass;
862    Cost = 2;
863    break;
864  case MVT::v4i64:
865    RRC = &ARM::DPRRegClass;
866    Cost = 4;
867    break;
868  case MVT::v8i64:
869    RRC = &ARM::DPRRegClass;
870    Cost = 8;
871    break;
872  }
873  return std::make_pair(RRC, Cost);
874}
875
876const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
877  switch (Opcode) {
878  default: return 0;
879  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
880  case ARMISD::WrapperDYN:    return "ARMISD::WrapperDYN";
881  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
882  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
883  case ARMISD::CALL:          return "ARMISD::CALL";
884  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
885  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
886  case ARMISD::tCALL:         return "ARMISD::tCALL";
887  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
888  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
889  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
890  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
891  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
892  case ARMISD::CMP:           return "ARMISD::CMP";
893  case ARMISD::CMN:           return "ARMISD::CMN";
894  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
895  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
896  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
897  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
898  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
899
900  case ARMISD::CMOV:          return "ARMISD::CMOV";
901
902  case ARMISD::RBIT:          return "ARMISD::RBIT";
903
904  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
905  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
906  case ARMISD::SITOF:         return "ARMISD::SITOF";
907  case ARMISD::UITOF:         return "ARMISD::UITOF";
908
909  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
910  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
911  case ARMISD::RRX:           return "ARMISD::RRX";
912
913  case ARMISD::ADDC:          return "ARMISD::ADDC";
914  case ARMISD::ADDE:          return "ARMISD::ADDE";
915  case ARMISD::SUBC:          return "ARMISD::SUBC";
916  case ARMISD::SUBE:          return "ARMISD::SUBE";
917
918  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
919  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
920
921  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
922  case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
923
924  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
925
926  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
927
928  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
929
930  case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
931  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
932
933  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
934
935  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
936  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
937  case ARMISD::VCGE:          return "ARMISD::VCGE";
938  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
939  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
940  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
941  case ARMISD::VCGT:          return "ARMISD::VCGT";
942  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
943  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
944  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
945  case ARMISD::VTST:          return "ARMISD::VTST";
946
947  case ARMISD::VSHL:          return "ARMISD::VSHL";
948  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
949  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
950  case ARMISD::VSHLLs:        return "ARMISD::VSHLLs";
951  case ARMISD::VSHLLu:        return "ARMISD::VSHLLu";
952  case ARMISD::VSHLLi:        return "ARMISD::VSHLLi";
953  case ARMISD::VSHRN:         return "ARMISD::VSHRN";
954  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
955  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
956  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
957  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
958  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
959  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
960  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
961  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
962  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
963  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
964  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
965  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
966  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
967  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
968  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
969  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
970  case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
971  case ARMISD::VDUP:          return "ARMISD::VDUP";
972  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
973  case ARMISD::VEXT:          return "ARMISD::VEXT";
974  case ARMISD::VREV64:        return "ARMISD::VREV64";
975  case ARMISD::VREV32:        return "ARMISD::VREV32";
976  case ARMISD::VREV16:        return "ARMISD::VREV16";
977  case ARMISD::VZIP:          return "ARMISD::VZIP";
978  case ARMISD::VUZP:          return "ARMISD::VUZP";
979  case ARMISD::VTRN:          return "ARMISD::VTRN";
980  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
981  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
982  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
983  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
984  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
985  case ARMISD::FMAX:          return "ARMISD::FMAX";
986  case ARMISD::FMIN:          return "ARMISD::FMIN";
987  case ARMISD::BFI:           return "ARMISD::BFI";
988  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
989  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
990  case ARMISD::VBSL:          return "ARMISD::VBSL";
991  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
992  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
993  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
994  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
995  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
996  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
997  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
998  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
999  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1000  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1001  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1002  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1003  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1004  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1005  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1006  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1007  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1008  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1009  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1010  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1011  }
1012}
1013
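/// getSetCCResultType - Return the value type to use for ISD::SETCC: i32 for
/// scalars, and for vectors the same vector type with integer elements of
/// equal width (e.g. a v4f32 compare produces a v4i32 mask).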
1014EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {
1015  if (!VT.isVector()) return getPointerTy();
1016  return VT.changeVectorElementTypeToInteger();
1017}
1018
1019/// getRegClassFor - Return the register class that should be used for the
1020/// specified value type.
1021const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
1022  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1023  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1024  // load / store 4 to 8 consecutive D registers.
1025  if (Subtarget->hasNEON()) {
1026    if (VT == MVT::v4i64)
1027      return &ARM::QQPRRegClass;
1028    if (VT == MVT::v8i64)
1029      return &ARM::QQQQPRRegClass;
1030  }
1031  return TargetLowering::getRegClassFor(VT);
1032}
1033
1034// Create a fast isel object.
1035FastISel *
1036ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1037                                  const TargetLibraryInfo *libInfo) const {
1038  return ARM::createFastISel(funcInfo, libInfo);
1039}
1040
1041/// getMaximalGlobalOffset - Returns the maximal possible offset which can
1042/// be used for loads / stores from the global.
1043unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
1044  return (Subtarget->isThumb1Only() ? 127 : 4095);
1045}
1046
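/// getSchedulingPreference - Prefer ILP scheduling for nodes that produce
/// floating-point or vector values, or whose first result has a long latency
/// according to the itinerary; otherwise fall back to register-pressure
/// scheduling.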
1047Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1048  unsigned NumVals = N->getNumValues();
1049  if (!NumVals)
1050    return Sched::RegPressure;
1051
1052  for (unsigned i = 0; i != NumVals; ++i) {
1053    EVT VT = N->getValueType(i);
1054    if (VT == MVT::Glue || VT == MVT::Other)
1055      continue;
1056    if (VT.isFloatingPoint() || VT.isVector())
1057      return Sched::ILP;
1058  }
1059
1060  if (!N->isMachineOpcode())
1061    return Sched::RegPressure;
1062
1063  // Loads are scheduled for latency even if the instruction itinerary
1064  // is not available.
1065  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
1066  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1067
1068  if (MCID.getNumDefs() == 0)
1069    return Sched::RegPressure;
1070  if (!Itins->isEmpty() &&
1071      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1072    return Sched::ILP;
1073
1074  return Sched::RegPressure;
1075}
1076
1077//===----------------------------------------------------------------------===//
1078// Lowering Code
1079//===----------------------------------------------------------------------===//
1080
1081/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1082static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1083  switch (CC) {
1084  default: llvm_unreachable("Unknown condition code!");
1085  case ISD::SETNE:  return ARMCC::NE;
1086  case ISD::SETEQ:  return ARMCC::EQ;
1087  case ISD::SETGT:  return ARMCC::GT;
1088  case ISD::SETGE:  return ARMCC::GE;
1089  case ISD::SETLT:  return ARMCC::LT;
1090  case ISD::SETLE:  return ARMCC::LE;
1091  case ISD::SETUGT: return ARMCC::HI;
1092  case ISD::SETUGE: return ARMCC::HS;
1093  case ISD::SETULT: return ARMCC::LO;
1094  case ISD::SETULE: return ARMCC::LS;
1095  }
1096}
1097
1098/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
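/// Some predicates (e.g. SETONE, SETUEQ) cannot be expressed with a single ARM
/// condition after an FP compare, so a second condition is returned in
/// CondCode2; when CondCode2 != ARMCC::AL the caller tests both conditions and
/// the predicate holds if either one does.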
1099static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1100                        ARMCC::CondCodes &CondCode2) {
1101  CondCode2 = ARMCC::AL;
1102  switch (CC) {
1103  default: llvm_unreachable("Unknown FP condition!");
1104  case ISD::SETEQ:
1105  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1106  case ISD::SETGT:
1107  case ISD::SETOGT: CondCode = ARMCC::GT; break;
1108  case ISD::SETGE:
1109  case ISD::SETOGE: CondCode = ARMCC::GE; break;
1110  case ISD::SETOLT: CondCode = ARMCC::MI; break;
1111  case ISD::SETOLE: CondCode = ARMCC::LS; break;
1112  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1113  case ISD::SETO:   CondCode = ARMCC::VC; break;
1114  case ISD::SETUO:  CondCode = ARMCC::VS; break;
1115  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1116  case ISD::SETUGT: CondCode = ARMCC::HI; break;
1117  case ISD::SETUGE: CondCode = ARMCC::PL; break;
1118  case ISD::SETLT:
1119  case ISD::SETULT: CondCode = ARMCC::LT; break;
1120  case ISD::SETLE:
1121  case ISD::SETULE: CondCode = ARMCC::LE; break;
1122  case ISD::SETNE:
1123  case ISD::SETUNE: CondCode = ARMCC::NE; break;
1124  }
1125}
1126
1127//===----------------------------------------------------------------------===//
1128//                      Calling Convention Implementation
1129//===----------------------------------------------------------------------===//
1130
1131#include "ARMGenCallingConv.inc"
1132
1133/// CCAssignFnForNode - Selects the correct CCAssignFn for the
1134/// given CallingConvention value.
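/// For example, on an AAPCS target compiled for the hard-float ABI with VFP2,
/// a non-variadic CallingConv::C call dispatches to CC_ARM_AAPCS_VFP /
/// RetCC_ARM_AAPCS_VFP, while variadic calls always fall back to the base
/// AAPCS (or APCS) convention.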
1135CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1136                                                 bool Return,
1137                                                 bool isVarArg) const {
1138  switch (CC) {
1139  default:
1140    llvm_unreachable("Unsupported calling convention");
1141  case CallingConv::Fast:
1142    if (Subtarget->hasVFP2() && !isVarArg) {
1143      if (!Subtarget->isAAPCS_ABI())
1144        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1145      // For AAPCS ABI targets, just use VFP variant of the calling convention.
1146      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1147    }
1148    // Fallthrough
1149  case CallingConv::C: {
1150    // Use target triple & subtarget features to do actual dispatch.
1151    if (!Subtarget->isAAPCS_ABI())
1152      return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1153    else if (Subtarget->hasVFP2() &&
1154             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1155             !isVarArg)
1156      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1157    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1158  }
1159  case CallingConv::ARM_AAPCS_VFP:
1160    if (!isVarArg)
1161      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1162    // Fallthrough
1163  case CallingConv::ARM_AAPCS:
1164    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1165  case CallingConv::ARM_APCS:
1166    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1167  case CallingConv::GHC:
1168    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1169  }
1170}
1171
1172/// LowerCallResult - Lower the result values of a call into the
1173/// appropriate copies out of appropriate physical registers.
1174SDValue
1175ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1176                                   CallingConv::ID CallConv, bool isVarArg,
1177                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1178                                   DebugLoc dl, SelectionDAG &DAG,
1179                                   SmallVectorImpl<SDValue> &InVals) const {
1180
1181  // Assign locations to each value returned by this call.
1182  SmallVector<CCValAssign, 16> RVLocs;
1183  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1184                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
1185  CCInfo.AnalyzeCallResult(Ins,
1186                           CCAssignFnForNode(CallConv, /* Return*/ true,
1187                                             isVarArg));
1188
1189  // Copy all of the result registers out of their specified physreg.
1190  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1191    CCValAssign VA = RVLocs[i];
1192
1193    SDValue Val;
1194    if (VA.needsCustom()) {
1195      // Handle f64 or half of a v2f64.
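      // Each such value arrives in two consecutive i32 locations; copy both
      // halves out of their registers and rebuild the f64 with VMOVDRR (for
      // v2f64, the rebuilt f64s are then inserted into lanes 0 and 1).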
1196      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1197                                      InFlag);
1198      Chain = Lo.getValue(1);
1199      InFlag = Lo.getValue(2);
1200      VA = RVLocs[++i]; // skip ahead to next loc
1201      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1202                                      InFlag);
1203      Chain = Hi.getValue(1);
1204      InFlag = Hi.getValue(2);
1205      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1206
1207      if (VA.getLocVT() == MVT::v2f64) {
1208        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1209        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1210                          DAG.getConstant(0, MVT::i32));
1211
1212        VA = RVLocs[++i]; // skip ahead to next loc
1213        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1214        Chain = Lo.getValue(1);
1215        InFlag = Lo.getValue(2);
1216        VA = RVLocs[++i]; // skip ahead to next loc
1217        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1218        Chain = Hi.getValue(1);
1219        InFlag = Hi.getValue(2);
1220        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1221        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1222                          DAG.getConstant(1, MVT::i32));
1223      }
1224    } else {
1225      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1226                               InFlag);
1227      Chain = Val.getValue(1);
1228      InFlag = Val.getValue(2);
1229    }
1230
1231    switch (VA.getLocInfo()) {
1232    default: llvm_unreachable("Unknown loc info!");
1233    case CCValAssign::Full: break;
1234    case CCValAssign::BCvt:
1235      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1236      break;
1237    }
1238
1239    InVals.push_back(Val);
1240  }
1241
1242  return Chain;
1243}
1244
1245/// LowerMemOpCallTo - Store the argument to the stack.
1246SDValue
1247ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
1248                                    SDValue StackPtr, SDValue Arg,
1249                                    DebugLoc dl, SelectionDAG &DAG,
1250                                    const CCValAssign &VA,
1251                                    ISD::ArgFlagsTy Flags) const {
1252  unsigned LocMemOffset = VA.getLocMemOffset();
1253  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1254  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1255  return DAG.getStore(Chain, dl, Arg, PtrOff,
1256                      MachinePointerInfo::getStack(LocMemOffset),
1257                      false, false, 0);
1258}
1259
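/// PassF64ArgInRegs - Split an f64 argument into two i32 halves with
/// ARMISD::VMOVRRD and pass them in a GPR pair, or in a GPR plus a stack
/// slot when only one register remains for this argument.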
1260void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
1261                                         SDValue Chain, SDValue &Arg,
1262                                         RegsToPassVector &RegsToPass,
1263                                         CCValAssign &VA, CCValAssign &NextVA,
1264                                         SDValue &StackPtr,
1265                                         SmallVector<SDValue, 8> &MemOpChains,
1266                                         ISD::ArgFlagsTy Flags) const {
1267
1268  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1269                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
1270  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
1271
1272  if (NextVA.isRegLoc())
1273    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
1274  else {
1275    assert(NextVA.isMemLoc());
1276    if (StackPtr.getNode() == 0)
1277      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
1278
1279    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1),
1280                                           dl, DAG, NextVA,
1281                                           Flags));
1282  }
1283}
1284
1285/// LowerCall - Lower a call into a callseq_start <-
1286/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
1287/// nodes.
1288SDValue
1289ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1290                             SmallVectorImpl<SDValue> &InVals) const {
1291  SelectionDAG &DAG                     = CLI.DAG;
1292  DebugLoc &dl                          = CLI.DL;
1293  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
1294  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
1295  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
1296  SDValue Chain                         = CLI.Chain;
1297  SDValue Callee                        = CLI.Callee;
1298  bool &isTailCall                      = CLI.IsTailCall;
1299  CallingConv::ID CallConv              = CLI.CallConv;
1300  bool doesNotRet                       = CLI.DoesNotReturn;
1301  bool isVarArg                         = CLI.IsVarArg;
1302
1303  MachineFunction &MF = DAG.getMachineFunction();
1304  bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
1305  bool IsSibCall = false;
1306  // Disable tail calls if they're not supported.
1307  if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
1308    isTailCall = false;
1309  if (isTailCall) {
1310    // Check if it's really possible to do a tail call.
1311    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1312                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1313                                                   Outs, OutVals, Ins, DAG);
1314    // We don't support GuaranteedTailCallOpt for ARM, only automatically
1315    // detected sibcalls.
1316    if (isTailCall) {
1317      ++NumTailCalls;
1318      IsSibCall = true;
1319    }
1320  }
1321
1322  // Analyze operands of the call, assigning locations to each operand.
1323  SmallVector<CCValAssign, 16> ArgLocs;
1324  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1325                 getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
1326  CCInfo.AnalyzeCallOperands(Outs,
1327                             CCAssignFnForNode(CallConv, /* Return*/ false,
1328                                               isVarArg));
1329
1330  // Get a count of how many bytes are to be pushed on the stack.
1331  unsigned NumBytes = CCInfo.getNextStackOffset();
1332
1333  // For tail calls, memory operands are available in our caller's stack.
1334  if (IsSibCall)
1335    NumBytes = 0;
1336
1337  // Adjust the stack pointer for the new arguments...
1338  // These operations are automatically eliminated by the prolog/epilog pass
1339  if (!IsSibCall)
1340    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1341
1342  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
1343
1344  RegsToPassVector RegsToPass;
1345  SmallVector<SDValue, 8> MemOpChains;
1346
1347  // Walk the register/memloc assignments, inserting copies/loads.  In the case
1348  // of tail call optimization, arguments are handled later.
1349  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
1350       i != e;
1351       ++i, ++realArgIdx) {
1352    CCValAssign &VA = ArgLocs[i];
1353    SDValue Arg = OutVals[realArgIdx];
1354    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
1355    bool isByVal = Flags.isByVal();
1356
1357    // Promote the value if needed.
1358    switch (VA.getLocInfo()) {
1359    default: llvm_unreachable("Unknown loc info!");
1360    case CCValAssign::Full: break;
1361    case CCValAssign::SExt:
1362      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1363      break;
1364    case CCValAssign::ZExt:
1365      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1366      break;
1367    case CCValAssign::AExt:
1368      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1369      break;
1370    case CCValAssign::BCvt:
1371      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1372      break;
1373    }
1374
1375    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
1376    if (VA.needsCustom()) {
1377      if (VA.getLocVT() == MVT::v2f64) {
1378        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1379                                  DAG.getConstant(0, MVT::i32));
1380        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1381                                  DAG.getConstant(1, MVT::i32));
1382
1383        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
1384                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1385
1386        VA = ArgLocs[++i]; // skip ahead to next loc
1387        if (VA.isRegLoc()) {
1388          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
1389                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
1390        } else {
1391          assert(VA.isMemLoc());
1392
1393          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
1394                                                 dl, DAG, VA, Flags));
1395        }
1396      } else {
1397        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
1398                         StackPtr, MemOpChains, Flags);
1399      }
1400    } else if (VA.isRegLoc()) {
1401      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1402    } else if (isByVal) {
1403      assert(VA.isMemLoc());
1404      unsigned offset = 0;
1405
1406      // True if this byval aggregate will be split between registers
1407      // and memory.
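      // e.g., a 24-byte byval whose first register is r2 has 8 bytes loaded
      // into r2-r3 below and the remaining 16 bytes copied to the outgoing
      // stack area with ARMISD::COPY_STRUCT_BYVAL.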
1408      if (CCInfo.isFirstByValRegValid()) {
1409        EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1410        unsigned int i, j;
1411        for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) {
1412          SDValue Const = DAG.getConstant(4*i, MVT::i32);
1413          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
1414          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
1415                                     MachinePointerInfo(),
1416                                     false, false, false, 0);
1417          MemOpChains.push_back(Load.getValue(1));
1418          RegsToPass.push_back(std::make_pair(j, Load));
1419        }
1420        offset = ARM::R4 - CCInfo.getFirstByValReg();
1421        CCInfo.clearFirstByValReg();
1422      }
1423
1424      if (Flags.getByValSize() - 4*offset > 0) {
1425        unsigned LocMemOffset = VA.getLocMemOffset();
1426        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
1427        SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
1428                                  StkPtrOff);
1429        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
1430        SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
1431        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
1432                                           MVT::i32);
1433        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
1434
1435        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
1436        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
1437        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
1438                                          Ops, array_lengthof(Ops)));
1439      }
1440    } else if (!IsSibCall) {
1441      assert(VA.isMemLoc());
1442
1443      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1444                                             dl, DAG, VA, Flags));
1445    }
1446  }
1447
1448  if (!MemOpChains.empty())
1449    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1450                        &MemOpChains[0], MemOpChains.size());
1451
1452  // Build a sequence of copy-to-reg nodes chained together with token chain
1453  // and flag operands which copy the outgoing args into the appropriate regs.
1454  SDValue InFlag;
1455  // Tail call byval lowering might overwrite argument registers, so in the
1456  // case of tail call optimization the copies to registers are lowered later.
1457  if (!isTailCall)
1458    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1459      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1460                               RegsToPass[i].second, InFlag);
1461      InFlag = Chain.getValue(1);
1462    }
1463
1464  // For tail calls lower the arguments to the 'real' stack slot.
1465  if (isTailCall) {
1466    // Force all the incoming stack arguments to be loaded from the stack
1467    // before any new outgoing arguments are stored to the stack, because the
1468    // outgoing stack slots may alias the incoming argument stack slots, and
1469    // the alias isn't otherwise explicit. This is slightly more conservative
1470    // than necessary, because it means that each store effectively depends
1471    // on every argument instead of just those arguments it would clobber.
1472
1473    // Do not flag preceding copytoreg stuff together with the following stuff.
1474    InFlag = SDValue();
1475    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1476      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1477                               RegsToPass[i].second, InFlag);
1478      InFlag = Chain.getValue(1);
1479    }
1480    InFlag = SDValue();
1481  }
1482
1483  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1484  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1485  // node so that legalize doesn't hack it.
1486  bool isDirect = false;
1487  bool isARMFunc = false;
1488  bool isLocalARMFunc = false;
1489  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
1490
1491  if (EnableARMLongCalls) {
1492    assert (getTargetMachine().getRelocationModel() == Reloc::Static
1493            && "long-calls with non-static relocation model!");
1494    // Handle a global address or an external symbol. If it's not one of
1495    // those, the target's already in a register, so we don't need to do
1496    // anything extra.
1497    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1498      const GlobalValue *GV = G->getGlobal();
1499      // Create a constant pool entry for the callee address
1500      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1501      ARMConstantPoolValue *CPV =
1502        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
1503
1504      // Get the address of the callee into a register
1505      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1506      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1507      Callee = DAG.getLoad(getPointerTy(), dl,
1508                           DAG.getEntryNode(), CPAddr,
1509                           MachinePointerInfo::getConstantPool(),
1510                           false, false, false, 0);
1511    } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
1512      const char *Sym = S->getSymbol();
1513
1514      // Create a constant pool entry for the callee address
1515      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1516      ARMConstantPoolValue *CPV =
1517        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
1518                                      ARMPCLabelIndex, 0);
1519      // Get the address of the callee into a register
1520      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1521      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1522      Callee = DAG.getLoad(getPointerTy(), dl,
1523                           DAG.getEntryNode(), CPAddr,
1524                           MachinePointerInfo::getConstantPool(),
1525                           false, false, false, 0);
1526    }
1527  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1528    const GlobalValue *GV = G->getGlobal();
1529    isDirect = true;
1530    bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
1531    bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
1532                   getTargetMachine().getRelocationModel() != Reloc::Static;
1533    isARMFunc = !Subtarget->isThumb() || isStub;
1534    // ARM call to a local ARM function is predicable.
1535    isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
1536    // tBX takes a register source operand.
1537    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
1538      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1539      ARMConstantPoolValue *CPV =
1540        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4);
1541      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1542      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1543      Callee = DAG.getLoad(getPointerTy(), dl,
1544                           DAG.getEntryNode(), CPAddr,
1545                           MachinePointerInfo::getConstantPool(),
1546                           false, false, false, 0);
1547      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
1548      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
1549                           getPointerTy(), Callee, PICLabel);
1550    } else {
1551      // On ELF targets for PIC code, direct calls should go through the PLT
1552      unsigned OpFlags = 0;
1553      if (Subtarget->isTargetELF() &&
1554                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
1555        OpFlags = ARMII::MO_PLT;
1556      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
1557    }
1558  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1559    isDirect = true;
1560    bool isStub = Subtarget->isTargetDarwin() &&
1561                  getTargetMachine().getRelocationModel() != Reloc::Static;
1562    isARMFunc = !Subtarget->isThumb() || isStub;
1563    // tBX takes a register source operand.
1564    const char *Sym = S->getSymbol();
1565    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
1566      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
1567      ARMConstantPoolValue *CPV =
1568        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
1569                                      ARMPCLabelIndex, 4);
1570      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
1571      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
1572      Callee = DAG.getLoad(getPointerTy(), dl,
1573                           DAG.getEntryNode(), CPAddr,
1574                           MachinePointerInfo::getConstantPool(),
1575                           false, false, false, 0);
1576      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
1577      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
1578                           getPointerTy(), Callee, PICLabel);
1579    } else {
1580      unsigned OpFlags = 0;
1581      // On ELF targets for PIC code, direct calls should go through the PLT
1582      if (Subtarget->isTargetELF() &&
1583                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
1584        OpFlags = ARMII::MO_PLT;
1585      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags);
1586    }
1587  }
1588
1589  // FIXME: handle tail calls differently.
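  // Pick the call opcode. Roughly: a plain BL/BLX-style call (CALL / tCALL),
  // a predicable CALL_PRED for calls to local ARM functions, or CALL_NOLINK
  // ("mov lr, pc; b") when BLX is unavailable or the callee does not return.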
1590  unsigned CallOpc;
1591  if (Subtarget->isThumb()) {
1592    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
1593      CallOpc = ARMISD::CALL_NOLINK;
1594    else if (doesNotRet && isDirect && !isARMFunc &&
1595             Subtarget->hasRAS() && !Subtarget->isThumb1Only())
1596      // "mov lr, pc; b _foo" to avoid confusing the RSP
1597      CallOpc = ARMISD::CALL_NOLINK;
1598    else
1599      CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
1600  } else {
1601    if (!isDirect && !Subtarget->hasV5TOps()) {
1602      CallOpc = ARMISD::CALL_NOLINK;
1603    } else if (doesNotRet && isDirect && Subtarget->hasRAS())
1604      // "mov lr, pc; b _foo" to avoid confusing the RSP
1605      CallOpc = ARMISD::CALL_NOLINK;
1606    else
1607      CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
1608  }
1609
1610  std::vector<SDValue> Ops;
1611  Ops.push_back(Chain);
1612  Ops.push_back(Callee);
1613
1614  // Add argument registers to the end of the list so that they are known live
1615  // into the call.
1616  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1617    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1618                                  RegsToPass[i].second.getValueType()));
1619
1620  // Add a register mask operand representing the call-preserved registers.
1621  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
1622  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
1623  assert(Mask && "Missing call preserved mask for calling convention");
1624  Ops.push_back(DAG.getRegisterMask(Mask));
1625
1626  if (InFlag.getNode())
1627    Ops.push_back(InFlag);
1628
1629  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1630  if (isTailCall)
1631    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
1632
1633  // Returns a chain and a flag for retval copy to use.
1634  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
1635  InFlag = Chain.getValue(1);
1636
1637  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1638                             DAG.getIntPtrConstant(0, true), InFlag);
1639  if (!Ins.empty())
1640    InFlag = Chain.getValue(1);
1641
1642  // Handle result values, copying them out of physregs into vregs that we
1643  // return.
1644  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins,
1645                         dl, DAG, InVals);
1646}
1647
1648/// HandleByVal - Every parameter *after* a byval parameter is passed
1649/// on the stack.  Remember the next parameter register to allocate,
1650/// and then confiscate the rest of the parameter registers to ensure
1651/// this.
1652void
1653ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const {
1654  unsigned reg = State->AllocateReg(GPRArgRegs, 4);
1655  assert((State->getCallOrPrologue() == Prologue ||
1656          State->getCallOrPrologue() == Call) &&
1657         "unhandled ParmContext");
1658  if ((!State->isFirstByValRegValid()) &&
1659      (ARM::R0 <= reg) && (reg <= ARM::R3)) {
1660    State->setFirstByValReg(reg);
1661    // At a call site, a byval parameter that is split between
1662    // registers and memory needs its size truncated here.  In a
1663    // function prologue, such byval parameters are reassembled in
1664    // memory, and are not truncated.
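    // e.g., a 16-byte byval whose first register is r3 only needs 12 bytes
    // of stack at the call site; the other 4 bytes travel in r3.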
1665    if (State->getCallOrPrologue() == Call) {
1666      unsigned excess = 4 * (ARM::R4 - reg);
1667      assert(size >= excess && "expected larger existing stack allocation");
1668      size -= excess;
1669    }
1670  }
1671  // Confiscate any remaining parameter registers to preclude their
1672  // assignment to subsequent parameters.
1673  while (State->AllocateReg(GPRArgRegs, 4))
1674    ;
1675}
1676
1677/// MatchingStackOffset - Return true if the given stack call argument is
1678/// already available in the same relative position in the caller's
1679/// incoming argument stack.
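/// If it is, a sibling call can simply reuse the caller's slot instead of
/// storing the argument again.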
1680static
1681bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
1682                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
1683                         const TargetInstrInfo *TII) {
1684  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
1685  int FI = INT_MAX;
1686  if (Arg.getOpcode() == ISD::CopyFromReg) {
1687    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
1688    if (!TargetRegisterInfo::isVirtualRegister(VR))
1689      return false;
1690    MachineInstr *Def = MRI->getVRegDef(VR);
1691    if (!Def)
1692      return false;
1693    if (!Flags.isByVal()) {
1694      if (!TII->isLoadFromStackSlot(Def, FI))
1695        return false;
1696    } else {
1697      return false;
1698    }
1699  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
1700    if (Flags.isByVal())
1701      // ByVal argument is passed in as a pointer but it's now being
1702      // dereferenced. e.g.
1703      // define @foo(%struct.X* %A) {
1704      //   tail call @bar(%struct.X* byval %A)
1705      // }
1706      return false;
1707    SDValue Ptr = Ld->getBasePtr();
1708    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
1709    if (!FINode)
1710      return false;
1711    FI = FINode->getIndex();
1712  } else
1713    return false;
1714
1715  assert(FI != INT_MAX);
1716  if (!MFI->isFixedObjectIndex(FI))
1717    return false;
1718  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
1719}
1720
1721/// IsEligibleForTailCallOptimization - Check whether the call is eligible
1722/// for tail call optimization. Targets which want to do tail call
1723/// optimization should implement this function.
1724bool
1725ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
1726                                                     CallingConv::ID CalleeCC,
1727                                                     bool isVarArg,
1728                                                     bool isCalleeStructRet,
1729                                                     bool isCallerStructRet,
1730                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
1731                                    const SmallVectorImpl<SDValue> &OutVals,
1732                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1733                                                     SelectionDAG& DAG) const {
1734  const Function *CallerF = DAG.getMachineFunction().getFunction();
1735  CallingConv::ID CallerCC = CallerF->getCallingConv();
1736  bool CCMatch = CallerCC == CalleeCC;
1737
1738  // Look for obvious safe cases to perform tail call optimization that do not
1739  // require ABI changes. This is what gcc calls sibcall.
1740
1741  // Do not sibcall optimize vararg calls unless the call site passes no
1742  // arguments.
1743  if (isVarArg && !Outs.empty())
1744    return false;
1745
1746  // Also avoid sibcall optimization if either caller or callee uses struct
1747  // return semantics.
1748  if (isCalleeStructRet || isCallerStructRet)
1749    return false;
1750
1751  // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
1752  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
1753  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
1754  // support in the assembler and linker to be used. This would need to be
1755  // fixed to fully support tail calls in Thumb1.
1756  //
1757  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
1758// LR.  This means if we need to reload LR, it takes an extra instruction,
1759  // which outweighs the value of the tail call; but here we don't know yet
1760  // whether LR is going to be used.  Probably the right approach is to
1761  // generate the tail call here and turn it back into CALL/RET in
1762  // emitEpilogue if LR is used.
1763
1764  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
1765  // but we need to make sure there are enough registers; the only valid
1766  // registers are the 4 used for parameters.  We don't currently do this
1767  // case.
1768  if (Subtarget->isThumb1Only())
1769    return false;
1770
1771  // If the calling conventions do not match, then we'd better make sure the
1772  // results are returned in the same way as what the caller expects.
1773  if (!CCMatch) {
1774    SmallVector<CCValAssign, 16> RVLocs1;
1775    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
1776                       getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
1777    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
1778
1779    SmallVector<CCValAssign, 16> RVLocs2;
1780    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
1781                       getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
1782    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
1783
1784    if (RVLocs1.size() != RVLocs2.size())
1785      return false;
1786    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
1787      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
1788        return false;
1789      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
1790        return false;
1791      if (RVLocs1[i].isRegLoc()) {
1792        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
1793          return false;
1794      } else {
1795        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
1796          return false;
1797      }
1798    }
1799  }
1800
1801  // If the callee takes no arguments then go on to check the results of the
1802  // call.
1803  if (!Outs.empty()) {
1804    // Check if stack adjustment is needed. For now, do not do this if any
1805    // argument is passed on the stack.
1806    SmallVector<CCValAssign, 16> ArgLocs;
1807    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
1808                      getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
1809    CCInfo.AnalyzeCallOperands(Outs,
1810                               CCAssignFnForNode(CalleeCC, false, isVarArg));
1811    if (CCInfo.getNextStackOffset()) {
1812      MachineFunction &MF = DAG.getMachineFunction();
1813
1814      // Check if the arguments are already laid out in the right way as
1815      // the caller's fixed stack objects.
1816      MachineFrameInfo *MFI = MF.getFrameInfo();
1817      const MachineRegisterInfo *MRI = &MF.getRegInfo();
1818      const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
1819      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
1820           i != e;
1821           ++i, ++realArgIdx) {
1822        CCValAssign &VA = ArgLocs[i];
1823        EVT RegVT = VA.getLocVT();
1824        SDValue Arg = OutVals[realArgIdx];
1825        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
1826        if (VA.getLocInfo() == CCValAssign::Indirect)
1827          return false;
1828        if (VA.needsCustom()) {
1829          // f64 and vector types are split into multiple registers or
1830          // register/stack-slot combinations.  The types will not match
1831          // the registers; give up on memory f64 refs until we figure
1832          // out what to do about this.
1833          if (!VA.isRegLoc())
1834            return false;
1835          if (!ArgLocs[++i].isRegLoc())
1836            return false;
1837          if (RegVT == MVT::v2f64) {
1838            if (!ArgLocs[++i].isRegLoc())
1839              return false;
1840            if (!ArgLocs[++i].isRegLoc())
1841              return false;
1842          }
1843        } else if (!VA.isRegLoc()) {
1844          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
1845                                   MFI, MRI, TII))
1846            return false;
1847        }
1848      }
1849    }
1850  }
1851
1852  return true;
1853}
1854
1855SDValue
1856ARMTargetLowering::LowerReturn(SDValue Chain,
1857                               CallingConv::ID CallConv, bool isVarArg,
1858                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1859                               const SmallVectorImpl<SDValue> &OutVals,
1860                               DebugLoc dl, SelectionDAG &DAG) const {
1861
1862  // CCValAssign - represents the assignment of the return value to a location.
1863  SmallVector<CCValAssign, 16> RVLocs;
1864
1865  // CCState - Info about the registers and stack slots.
1866  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1867                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
1868
1869  // Analyze outgoing return values.
1870  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
1871                                               isVarArg));
1872
1873  // If this is the first return lowered for this function, add
1874  // the regs to the liveout set for the function.
1875  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1876    for (unsigned i = 0; i != RVLocs.size(); ++i)
1877      if (RVLocs[i].isRegLoc())
1878        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1879  }
1880
1881  SDValue Flag;
1882
1883  // Copy the result values into the output registers.
1884  for (unsigned i = 0, realRVLocIdx = 0;
1885       i != RVLocs.size();
1886       ++i, ++realRVLocIdx) {
1887    CCValAssign &VA = RVLocs[i];
1888    assert(VA.isRegLoc() && "Can only return in registers!");
1889
1890    SDValue Arg = OutVals[realRVLocIdx];
1891
1892    switch (VA.getLocInfo()) {
1893    default: llvm_unreachable("Unknown loc info!");
1894    case CCValAssign::Full: break;
1895    case CCValAssign::BCvt:
1896      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
1897      break;
1898    }
1899
1900    if (VA.needsCustom()) {
1901      if (VA.getLocVT() == MVT::v2f64) {
1902        // Extract the first half and return it in two registers.
1903        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1904                                   DAG.getConstant(0, MVT::i32));
1905        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
1906                                       DAG.getVTList(MVT::i32, MVT::i32), Half);
1907
1908        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
1909        Flag = Chain.getValue(1);
1910        VA = RVLocs[++i]; // skip ahead to next loc
1911        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
1912                                 HalfGPRs.getValue(1), Flag);
1913        Flag = Chain.getValue(1);
1914        VA = RVLocs[++i]; // skip ahead to next loc
1915
1916        // Extract the 2nd half and fall through to handle it as an f64 value.
1917        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1918                          DAG.getConstant(1, MVT::i32));
1919      }
1920      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
1921      // available.
1922      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1923                                  DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
1924      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
1925      Flag = Chain.getValue(1);
1926      VA = RVLocs[++i]; // skip ahead to next loc
1927      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
1928                               Flag);
1929    } else
1930      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
1931
1932    // Glue all of the emitted copies together so that nothing can be
1933    // scheduled in between them.
1934    Flag = Chain.getValue(1);
1935  }
1936
1937  SDValue result;
1938  if (Flag.getNode())
1939    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
1940  else // Return Void
1941    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain);
1942
1943  return result;
1944}
1945
1946bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
1947  if (N->getNumValues() != 1)
1948    return false;
1949  if (!N->hasNUsesOfValue(1, 0))
1950    return false;
1951
1952  SDValue TCChain = Chain;
1953  SDNode *Copy = *N->use_begin();
1954  if (Copy->getOpcode() == ISD::CopyToReg) {
1955    // If the copy has a glue operand, we conservatively assume it isn't safe to
1956    // perform a tail call.
1957    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
1958      return false;
1959    TCChain = Copy->getOperand(0);
1960  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
1961    SDNode *VMov = Copy;
1962    // f64 returned in a pair of GPRs.
1963    SmallPtrSet<SDNode*, 2> Copies;
1964    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
1965         UI != UE; ++UI) {
1966      if (UI->getOpcode() != ISD::CopyToReg)
1967        return false;
1968      Copies.insert(*UI);
1969    }
1970    if (Copies.size() > 2)
1971      return false;
1972
1973    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
1974         UI != UE; ++UI) {
1975      SDValue UseChain = UI->getOperand(0);
1976      if (Copies.count(UseChain.getNode()))
1977        // Second CopyToReg
1978        Copy = *UI;
1979      else
1980        // First CopyToReg
1981        TCChain = UseChain;
1982    }
1983  } else if (Copy->getOpcode() == ISD::BITCAST) {
1984    // f32 returned in a single GPR.
1985    if (!Copy->hasOneUse())
1986      return false;
1987    Copy = *Copy->use_begin();
1988    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
1989      return false;
1990    Chain = Copy->getOperand(0);
1991  } else {
1992    return false;
1993  }
1994
1995  bool HasRet = false;
1996  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
1997       UI != UE; ++UI) {
1998    if (UI->getOpcode() != ARMISD::RET_FLAG)
1999      return false;
2000    HasRet = true;
2001  }
2002
2003  if (!HasRet)
2004    return false;
2005
2006  Chain = TCChain;
2007  return true;
2008}
2009
2010bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2011  if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
2012    return false;
2013
2014  if (!CI->isTailCall())
2015    return false;
2016
2017  return !Subtarget->isThumb1Only();
2018}
2019
2020// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
2021// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
2022// one of the above-mentioned nodes. It has to be wrapped because otherwise
2023// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
2024// be used to form addressing modes. These wrapped nodes will be selected
2025// into MOVi.
2026static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
2027  EVT PtrVT = Op.getValueType();
2028  // FIXME there is no actual debug info here
2029  DebugLoc dl = Op.getDebugLoc();
2030  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2031  SDValue Res;
2032  if (CP->isMachineConstantPoolEntry())
2033    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2034                                    CP->getAlignment());
2035  else
2036    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2037                                    CP->getAlignment());
2038  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2039}
2040
2041unsigned ARMTargetLowering::getJumpTableEncoding() const {
2042  return MachineJumpTableInfo::EK_Inline;
2043}
2044
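// Lower a BlockAddress to a load from a constant-pool entry, adding a
// PIC_ADD of the pc-relative label when the relocation model is not static.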
2045SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2046                                             SelectionDAG &DAG) const {
2047  MachineFunction &MF = DAG.getMachineFunction();
2048  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2049  unsigned ARMPCLabelIndex = 0;
2050  DebugLoc DL = Op.getDebugLoc();
2051  EVT PtrVT = getPointerTy();
2052  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2053  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2054  SDValue CPAddr;
2055  if (RelocM == Reloc::Static) {
2056    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2057  } else {
2058    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2059    ARMPCLabelIndex = AFI->createPICLabelUId();
2060    ARMConstantPoolValue *CPV =
2061      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2062                                      ARMCP::CPBlockAddress, PCAdj);
2063    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2064  }
2065  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2066  SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
2067                               MachinePointerInfo::getConstantPool(),
2068                               false, false, false, 0);
2069  if (RelocM == Reloc::Static)
2070    return Result;
2071  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2072  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2073}
2074
2075// Lower ISD::GlobalTLSAddress using the "general dynamic" model
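// The address of the TLSGD constant-pool entry is materialized, adjusted by a
// PIC label, and passed to __tls_get_addr, which returns the variable's
// address.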
2076SDValue
2077ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
2078                                                 SelectionDAG &DAG) const {
2079  DebugLoc dl = GA->getDebugLoc();
2080  EVT PtrVT = getPointerTy();
2081  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2082  MachineFunction &MF = DAG.getMachineFunction();
2083  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2084  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2085  ARMConstantPoolValue *CPV =
2086    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2087                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
2088  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2089  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
2090  Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
2091                         MachinePointerInfo::getConstantPool(),
2092                         false, false, false, 0);
2093  SDValue Chain = Argument.getValue(1);
2094
2095  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2096  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
2097
2098  // call __tls_get_addr.
2099  ArgListTy Args;
2100  ArgListEntry Entry;
2101  Entry.Node = Argument;
2102  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
2103  Args.push_back(Entry);
2104  // FIXME: is there useful debug info available here?
2105  TargetLowering::CallLoweringInfo CLI(Chain,
2106                (Type *) Type::getInt32Ty(*DAG.getContext()),
2107                false, false, false, false,
2108                0, CallingConv::C, /*isTailCall=*/false,
2109                /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
2110                DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
2111  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2112  return CallResult.first;
2113}
2114
2115// Lower ISD::GlobalTLSAddress using the "initial exec" or
2116// "local exec" model.
2117SDValue
2118ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
2119                                        SelectionDAG &DAG,
2120                                        TLSModel::Model model) const {
2121  const GlobalValue *GV = GA->getGlobal();
2122  DebugLoc dl = GA->getDebugLoc();
2123  SDValue Offset;
2124  SDValue Chain = DAG.getEntryNode();
2125  EVT PtrVT = getPointerTy();
2126  // Get the Thread Pointer
2127  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2128
2129  if (model == TLSModel::InitialExec) {
2130    MachineFunction &MF = DAG.getMachineFunction();
2131    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2132    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2133    // Initial exec model.
2134    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
2135    ARMConstantPoolValue *CPV =
2136      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
2137                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
2138                                      true);
2139    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2140    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2141    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
2142                         MachinePointerInfo::getConstantPool(),
2143                         false, false, false, 0);
2144    Chain = Offset.getValue(1);
2145
2146    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2147    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
2148
2149    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
2150                         MachinePointerInfo::getConstantPool(),
2151                         false, false, false, 0);
2152  } else {
2153    // local exec model
2154    assert(model == TLSModel::LocalExec);
2155    ARMConstantPoolValue *CPV =
2156      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
2157    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2158    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
2159    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
2160                         MachinePointerInfo::getConstantPool(),
2161                         false, false, false, 0);
2162  }
2163
2164  // The address of the thread local variable is the add of the thread
2165  // pointer with the offset of the variable.
2166  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
2167}
2168
2169SDValue
2170ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
2171  // TODO: implement the "local dynamic" model
2172  assert(Subtarget->isTargetELF() &&
2173         "TLS not implemented for non-ELF targets");
2174  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
2175
2176  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
2177
2178  switch (model) {
2179    case TLSModel::GeneralDynamic:
2180    case TLSModel::LocalDynamic:
2181      return LowerToTLSGeneralDynamicModel(GA, DAG);
2182    case TLSModel::InitialExec:
2183    case TLSModel::LocalExec:
2184      return LowerToTLSExecModels(GA, DAG, model);
2185  }
2186  llvm_unreachable("bogus TLS model");
2187}
2188
2189SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
2190                                                 SelectionDAG &DAG) const {
2191  EVT PtrVT = getPointerTy();
2192  DebugLoc dl = Op.getDebugLoc();
2193  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2194  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2195  if (RelocM == Reloc::PIC_) {
2196    bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
2197    ARMConstantPoolValue *CPV =
2198      ARMConstantPoolConstant::Create(GV,
2199                                      UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
2200    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2201    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2202    SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
2203                                 CPAddr,
2204                                 MachinePointerInfo::getConstantPool(),
2205                                 false, false, false, 0);
2206    SDValue Chain = Result.getValue(1);
2207    SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
2208    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
2209    if (!UseGOTOFF)
2210      Result = DAG.getLoad(PtrVT, dl, Chain, Result,
2211                           MachinePointerInfo::getGOT(),
2212                           false, false, false, 0);
2213    return Result;
2214  }
2215
2216  // If we have T2 ops, we can materialize the address directly via movt/movw
2217  // pair. This is always cheaper.
2218  if (Subtarget->useMovt()) {
2219    ++NumMovwMovt;
2220    // FIXME: Once remat is capable of dealing with instructions with register
2221    // operands, expand this into two nodes.
2222    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
2223                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2224  } else {
2225    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
2226    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2227    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2228                       MachinePointerInfo::getConstantPool(),
2229                       false, false, false, 0);
2230  }
2231}
2232
2233SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
2234                                                    SelectionDAG &DAG) const {
2235  EVT PtrVT = getPointerTy();
2236  DebugLoc dl = Op.getDebugLoc();
2237  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
2238  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2239  MachineFunction &MF = DAG.getMachineFunction();
2240  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2241
2242  // FIXME: Enable this for static codegen when tool issues are fixed.  Also
2243  // update ARMFastISel::ARMMaterializeGV.
2244  if (Subtarget->useMovt() && RelocM != Reloc::Static) {
2245    ++NumMovwMovt;
2246    // FIXME: Once remat is capable of dealing with instructions with register
2247    // operands, expand this into two nodes.
2248    if (RelocM == Reloc::Static)
2249      return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
2250                                 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2251
2252    unsigned Wrapper = (RelocM == Reloc::PIC_)
2253      ? ARMISD::WrapperPIC : ARMISD::WrapperDYN;
2254    SDValue Result = DAG.getNode(Wrapper, dl, PtrVT,
2255                                 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
2256    if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
2257      Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
2258                           MachinePointerInfo::getGOT(),
2259                           false, false, false, 0);
2260    return Result;
2261  }
2262
2263  unsigned ARMPCLabelIndex = 0;
2264  SDValue CPAddr;
2265  if (RelocM == Reloc::Static) {
2266    CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
2267  } else {
2268    ARMPCLabelIndex = AFI->createPICLabelUId();
2269    unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
2270    ARMConstantPoolValue *CPV =
2271      ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue,
2272                                      PCAdj);
2273    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2274  }
2275  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2276
2277  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2278                               MachinePointerInfo::getConstantPool(),
2279                               false, false, false, 0);
2280  SDValue Chain = Result.getValue(1);
2281
2282  if (RelocM == Reloc::PIC_) {
2283    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2284    Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2285  }
2286
2287  if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
2288    Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(),
2289                         false, false, false, 0);
2290
2291  return Result;
2292}
2293
2294SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
2295                                                    SelectionDAG &DAG) const {
2296  assert(Subtarget->isTargetELF() &&
2297         "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
2298  MachineFunction &MF = DAG.getMachineFunction();
2299  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2300  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2301  EVT PtrVT = getPointerTy();
2302  DebugLoc dl = Op.getDebugLoc();
2303  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2304  ARMConstantPoolValue *CPV =
2305    ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
2306                                  ARMPCLabelIndex, PCAdj);
2307  SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2308  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2309  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2310                               MachinePointerInfo::getConstantPool(),
2311                               false, false, false, 0);
2312  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2313  return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2314}
2315
2316SDValue
2317ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
2318  DebugLoc dl = Op.getDebugLoc();
2319  SDValue Val = DAG.getConstant(0, MVT::i32);
2320  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
2321                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
2322                     Op.getOperand(1), Val);
2323}
2324
2325SDValue
2326ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
2327  DebugLoc dl = Op.getDebugLoc();
2328  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
2329                     Op.getOperand(1), DAG.getConstant(0, MVT::i32));
2330}
2331
2332SDValue
2333ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
2334                                          const ARMSubtarget *Subtarget) const {
2335  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2336  DebugLoc dl = Op.getDebugLoc();
2337  switch (IntNo) {
2338  default: return SDValue();    // Don't custom lower most intrinsics.
2339  case Intrinsic::arm_thread_pointer: {
2340    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2341    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
2342  }
2343  case Intrinsic::eh_sjlj_lsda: {
2344    MachineFunction &MF = DAG.getMachineFunction();
2345    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2346    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2347    EVT PtrVT = getPointerTy();
2348    DebugLoc dl = Op.getDebugLoc();
2349    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
2350    SDValue CPAddr;
2351    unsigned PCAdj = (RelocM != Reloc::PIC_)
2352      ? 0 : (Subtarget->isThumb() ? 4 : 8);
2353    ARMConstantPoolValue *CPV =
2354      ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
2355                                      ARMCP::CPLSDA, PCAdj);
2356    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2357    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2358    SDValue Result =
2359      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
2360                  MachinePointerInfo::getConstantPool(),
2361                  false, false, false, 0);
2362
2363    if (RelocM == Reloc::PIC_) {
2364      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
2365      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
2366    }
2367    return Result;
2368  }
2369  case Intrinsic::arm_neon_vmulls:
2370  case Intrinsic::arm_neon_vmullu: {
2371    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
2372      ? ARMISD::VMULLs : ARMISD::VMULLu;
2373    return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
2374                       Op.getOperand(1), Op.getOperand(2));
2375  }
2376  }
2377}
2378
2379static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
2380                               const ARMSubtarget *Subtarget) {
2381  DebugLoc dl = Op.getDebugLoc();
2382  if (!Subtarget->hasDataBarrier()) {
2383    // Some ARMv6 cpus can support data barriers with an mcr instruction.
2384    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
2385    // here.
2386    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
2387           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
2388    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
2389                       DAG.getConstant(0, MVT::i32));
2390  }
2391
2392  SDValue Op5 = Op.getOperand(5);
2393  bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0;
2394  unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
2395  unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2396  bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0);
2397
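  // Map the barrier kind onto a DMB option: device barriers use the
  // full-system SY/ST variants, normal barriers the inner-shareable
  // ISH/ISHST variants; the *ST forms order stores only.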
2398  ARM_MB::MemBOpt DMBOpt;
2399  if (isDeviceBarrier)
2400    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY;
2401  else
2402    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH;
2403  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
2404                     DAG.getConstant(DMBOpt, MVT::i32));
2405}
2406
2407
2408static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
2409                                 const ARMSubtarget *Subtarget) {
2410  // FIXME: handle "fence singlethread" more efficiently.
2411  DebugLoc dl = Op.getDebugLoc();
2412  if (!Subtarget->hasDataBarrier()) {
2413    // Some ARMv6 cpus can support data barriers with an mcr instruction.
2414    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
2415    // here.
2416    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
2417           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
2418    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
2419                       DAG.getConstant(0, MVT::i32));
2420  }
2421
2422  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
2423                     DAG.getConstant(ARM_MB::ISH, MVT::i32));
2424}
2425
2426static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
2427                             const ARMSubtarget *Subtarget) {
2428  // ARM prior to v5TE and Thumb1 do not have preload instructions.
2429  if (!(Subtarget->isThumb2() ||
2430        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
2431    // Just preserve the chain.
2432    return Op.getOperand(0);
2433
2434  DebugLoc dl = Op.getDebugLoc();
2435  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
2436  if (!isRead &&
2437      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
2438    // ARMv7 with MP extension has PLDW.
2439    return Op.getOperand(0);
2440
2441  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2442  if (Subtarget->isThumb()) {
2443    // Invert the bits.
2444    isRead = ~isRead & 1;
2445    isData = ~isData & 1;
2446  }
2447
2448  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
2449                     Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
2450                     DAG.getConstant(isData, MVT::i32));
2451}
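// Illustrative notes on the prefetch lowering above (a sketch, assuming an
// ARM-mode ARMv7 target with the MP extension): a read prefetch of data
// typically selects PLD, a write prefetch PLDW, and an instruction prefetch
// PLI.  In Thumb mode the isRead/isData bits are encoded inverted, which is
// why they are flipped above.  On targets lacking the required features only
// the chain operand is returned, so the prefetch becomes a no-op.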
2452
2453static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
2454  MachineFunction &MF = DAG.getMachineFunction();
2455  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
2456
2457  // vastart just stores the address of the VarArgsFrameIndex slot into the
2458  // memory location argument.
2459  DebugLoc dl = Op.getDebugLoc();
2460  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2461  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
2462  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2463  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
2464                      MachinePointerInfo(SV), false, false, 0);
2465}
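// A minimal example of what the va_start lowering above produces: for
//
//   call void @llvm.va_start(i8* %ap)
//
// the result is a single pointer-sized store of the VarArgsFrameIndex address
// through %ap; the later va_arg accesses are expanded by target-independent
// code and simply walk memory starting at that address.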
2466
2467SDValue
2468ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
2469                                        SDValue &Root, SelectionDAG &DAG,
2470                                        DebugLoc dl) const {
2471  MachineFunction &MF = DAG.getMachineFunction();
2472  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2473
2474  const TargetRegisterClass *RC;
2475  if (AFI->isThumb1OnlyFunction())
2476    RC = &ARM::tGPRRegClass;
2477  else
2478    RC = &ARM::GPRRegClass;
2479
2480  // Transform the arguments stored in physical registers into virtual ones.
2481  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2482  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
2483
2484  SDValue ArgValue2;
2485  if (NextVA.isMemLoc()) {
2486    MachineFrameInfo *MFI = MF.getFrameInfo();
2487    int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);
2488
2489    // Create load node to retrieve arguments from the stack.
2490    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2491    ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
2492                            MachinePointerInfo::getFixedStack(FI),
2493                            false, false, false, 0);
2494  } else {
2495    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2496    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
2497  }
2498
2499  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
2500}
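// Example for GetF64FormalArgument above (assuming a soft-float AAPCS-style
// convention): an f64 argument assigned to r2/r3 yields two CopyFromReg nodes
// that are recombined with ARMISD::VMOVDRR; if only r3 was available, the
// second half instead comes from a fixed stack object for the caller's stack
// slot, loaded as an i32 before the VMOVDRR.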
2501
2502void
2503ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
2504                                  unsigned &VARegSize, unsigned &VARegSaveSize)
2505  const {
2506  unsigned NumGPRs;
2507  if (CCInfo.isFirstByValRegValid())
2508    NumGPRs = ARM::R4 - CCInfo.getFirstByValReg();
2509  else {
2510    unsigned int firstUnalloced;
2511    firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
2512                                                sizeof(GPRArgRegs) /
2513                                                sizeof(GPRArgRegs[0]));
2514    NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
2515  }
2516
2517  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
2518  VARegSize = NumGPRs * 4;
2519  VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
2520}
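// Worked example for computeRegArea above: if r0 already holds a normal
// argument, the first unallocated register is r1, so NumGPRs == 3 and
// VARegSize == 12; assuming an 8-byte stack alignment, VARegSaveSize is
// rounded up to 16.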
2521
2522// The remaining GPRs hold either the beginning of variable-argument
2523// data, or the beginning of an aggregate passed by value (usually
2524// byval).  Either way, we allocate stack slots adjacent to the data
2525// provided by our caller, and store the unallocated registers there.
2526// If this is a variadic function, the va_list pointer will begin with
2527// these values; otherwise, this reassembles a (byval) structure that
2528// was split between registers and memory.
2529void
2530ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
2531                                        DebugLoc dl, SDValue &Chain,
2532                                        unsigned ArgOffset) const {
2533  MachineFunction &MF = DAG.getMachineFunction();
2534  MachineFrameInfo *MFI = MF.getFrameInfo();
2535  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2536  unsigned firstRegToSaveIndex;
2537  if (CCInfo.isFirstByValRegValid())
2538    firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0;
2539  else {
2540    firstRegToSaveIndex = CCInfo.getFirstUnallocated
2541      (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
2542  }
2543
2544  unsigned VARegSize, VARegSaveSize;
2545  computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
2546  if (VARegSaveSize) {
2547    // If this function is vararg, store any remaining integer argument regs
2548    // to their spots on the stack so that they may be loaded by dereferencing
2549    // the result of va_arg.
2550    AFI->setVarArgsRegSaveSize(VARegSaveSize);
2551    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize,
2552                                                     ArgOffset + VARegSaveSize
2553                                                     - VARegSize,
2554                                                     false));
2555    SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(),
2556                                    getPointerTy());
2557
2558    SmallVector<SDValue, 4> MemOps;
2559    for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) {
2560      const TargetRegisterClass *RC;
2561      if (AFI->isThumb1OnlyFunction())
2562        RC = &ARM::tGPRRegClass;
2563      else
2564        RC = &ARM::GPRRegClass;
2565
2566      unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC);
2567      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
2568      SDValue Store =
2569        DAG.getStore(Val.getValue(1), dl, Val, FIN,
2570                 MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()),
2571                     false, false, 0);
2572      MemOps.push_back(Store);
2573      FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
2574                        DAG.getConstant(4, getPointerTy()));
2575    }
2576    if (!MemOps.empty())
2577      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2578                          &MemOps[0], MemOps.size());
2579  } else
2580    // This will point to the next argument passed via stack.
2581    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
2582}
2583
2584SDValue
2585ARMTargetLowering::LowerFormalArguments(SDValue Chain,
2586                                        CallingConv::ID CallConv, bool isVarArg,
2587                                        const SmallVectorImpl<ISD::InputArg>
2588                                          &Ins,
2589                                        DebugLoc dl, SelectionDAG &DAG,
2590                                        SmallVectorImpl<SDValue> &InVals)
2591                                          const {
2592  MachineFunction &MF = DAG.getMachineFunction();
2593  MachineFrameInfo *MFI = MF.getFrameInfo();
2594
2595  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2596
2597  // Assign locations to all of the incoming arguments.
2598  SmallVector<CCValAssign, 16> ArgLocs;
2599  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
2600                    getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue);
2601  CCInfo.AnalyzeFormalArguments(Ins,
2602                                CCAssignFnForNode(CallConv, /* Return*/ false,
2603                                                  isVarArg));
2604
2605  SmallVector<SDValue, 16> ArgValues;
2606  int lastInsIndex = -1;
2607
2608  SDValue ArgValue;
2609  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2610    CCValAssign &VA = ArgLocs[i];
2611
2612    // Arguments stored in registers.
2613    if (VA.isRegLoc()) {
2614      EVT RegVT = VA.getLocVT();
2615
2616      if (VA.needsCustom()) {
2617        // f64 and vector types are split up into multiple registers or
2618        // combinations of registers and stack slots.
2619        if (VA.getLocVT() == MVT::v2f64) {
2620          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
2621                                                   Chain, DAG, dl);
2622          VA = ArgLocs[++i]; // skip ahead to next loc
2623          SDValue ArgValue2;
2624          if (VA.isMemLoc()) {
2625            int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
2626            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2627            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
2628                                    MachinePointerInfo::getFixedStack(FI),
2629                                    false, false, false, 0);
2630          } else {
2631            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
2632                                             Chain, DAG, dl);
2633          }
2634          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2635          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
2636                                 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
2637          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
2638                                 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
2639        } else
2640          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
2641
2642      } else {
2643        const TargetRegisterClass *RC;
2644
2645        if (RegVT == MVT::f32)
2646          RC = &ARM::SPRRegClass;
2647        else if (RegVT == MVT::f64)
2648          RC = &ARM::DPRRegClass;
2649        else if (RegVT == MVT::v2f64)
2650          RC = &ARM::QPRRegClass;
2651        else if (RegVT == MVT::i32)
2652          RC = AFI->isThumb1OnlyFunction() ?
2653            (const TargetRegisterClass*)&ARM::tGPRRegClass :
2654            (const TargetRegisterClass*)&ARM::GPRRegClass;
2655        else
2656          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
2657
2658        // Transform the arguments in physical registers into virtual ones.
2659        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2660        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2661      }
2662
2663      // If this is an 8 or 16-bit value, it is really passed promoted
2664      // to 32 bits.  Insert an assert[sz]ext to capture this, then
2665      // truncate to the right size.
2666      switch (VA.getLocInfo()) {
2667      default: llvm_unreachable("Unknown loc info!");
2668      case CCValAssign::Full: break;
2669      case CCValAssign::BCvt:
2670        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2671        break;
2672      case CCValAssign::SExt:
2673        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2674                               DAG.getValueType(VA.getValVT()));
2675        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2676        break;
2677      case CCValAssign::ZExt:
2678        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2679                               DAG.getValueType(VA.getValVT()));
2680        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2681        break;
2682      }
2683
2684      InVals.push_back(ArgValue);
2685
2686    } else { // VA.isRegLoc()
2687
2688      // sanity check
2689      assert(VA.isMemLoc());
2690      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
2691
2692      int index = ArgLocs[i].getValNo();
2693
2694      // Some Ins[] entries become multiple ArgLoc[] entries.
2695      // Process them only once.
2696      if (index != lastInsIndex)
2697        {
2698          ISD::ArgFlagsTy Flags = Ins[index].Flags;
2699          // FIXME: For now, all byval parameter objects are marked mutable.
2700          // This can be changed with more analysis.
2701          // In the case of tail call optimization, mark all arguments mutable,
2702          // since they could be overwritten by the lowering of arguments for
2703          // a tail call.
2704          if (Flags.isByVal()) {
2705            unsigned VARegSize, VARegSaveSize;
2706            computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
2707            VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0);
2708            unsigned Bytes = Flags.getByValSize() - VARegSize;
2709            if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2710            int FI = MFI->CreateFixedObject(Bytes,
2711                                            VA.getLocMemOffset(), false);
2712            InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
2713          } else {
2714            int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
2715                                            VA.getLocMemOffset(), true);
2716
2717            // Create load nodes to retrieve arguments from the stack.
2718            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2719            InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
2720                                         MachinePointerInfo::getFixedStack(FI),
2721                                         false, false, false, 0));
2722          }
2723          lastInsIndex = index;
2724        }
2725    }
2726  }
2727
2728  // varargs
2729  if (isVarArg)
2730    VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset());
2731
2732  return Chain;
2733}
2734
2735/// isFloatingPointZero - Return true if this is +0.0.
2736static bool isFloatingPointZero(SDValue Op) {
2737  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
2738    return CFP->getValueAPF().isPosZero();
2739  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
2740    // Maybe this has already been legalized into the constant pool?
2741    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
2742      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
2743      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
2744        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
2745          return CFP->getValueAPF().isPosZero();
2746    }
2747  }
2748  return false;
2749}
2750
2751/// Returns an appropriate ARM CMP (cmp) and corresponding condition code for
2752/// the given operands.
2753SDValue
2754ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2755                             SDValue &ARMcc, SelectionDAG &DAG,
2756                             DebugLoc dl) const {
2757  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2758    unsigned C = RHSC->getZExtValue();
2759    if (!isLegalICmpImmediate(C)) {
2760      // Constant does not fit, try adjusting it by one?
2761      switch (CC) {
2762      default: break;
2763      case ISD::SETLT:
2764      case ISD::SETGE:
2765        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
2766          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2767          RHS = DAG.getConstant(C-1, MVT::i32);
2768        }
2769        break;
2770      case ISD::SETULT:
2771      case ISD::SETUGE:
2772        if (C != 0 && isLegalICmpImmediate(C-1)) {
2773          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2774          RHS = DAG.getConstant(C-1, MVT::i32);
2775        }
2776        break;
2777      case ISD::SETLE:
2778      case ISD::SETGT:
2779        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
2780          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2781          RHS = DAG.getConstant(C+1, MVT::i32);
2782        }
2783        break;
2784      case ISD::SETULE:
2785      case ISD::SETUGT:
2786        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
2787          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2788          RHS = DAG.getConstant(C+1, MVT::i32);
2789        }
2790        break;
2791      }
2792    }
2793  }
2794
2795  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
2796  ARMISD::NodeType CompareType;
2797  switch (CondCode) {
2798  default:
2799    CompareType = ARMISD::CMP;
2800    break;
2801  case ARMCC::EQ:
2802  case ARMCC::NE:
2803    // Uses only Z Flag
2804    CompareType = ARMISD::CMPZ;
2805    break;
2806  }
2807  ARMcc = DAG.getConstant(CondCode, MVT::i32);
2808  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
2809}
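// Worked example for the immediate adjustment in getARMCmp above (illustrative,
// assuming the ARM modified-immediate rules): for "icmp slt i32 %x, 0x1001"
// the constant 0x1001 is not a legal compare immediate, but 0x1000 is, so the
// comparison is rewritten as x <= 0x1000, i.e. roughly
//
//   cmp rX, #0x1000    ; followed by an LE condition instead of LT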
2810
2811/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
2812SDValue
2813ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
2814                             DebugLoc dl) const {
2815  SDValue Cmp;
2816  if (!isFloatingPointZero(RHS))
2817    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
2818  else
2819    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
2820  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
2821}
2822
2823/// duplicateCmp - Glue values can have only one use, so this function
2824/// duplicates a comparison node.
2825SDValue
2826ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
2827  unsigned Opc = Cmp.getOpcode();
2828  DebugLoc DL = Cmp.getDebugLoc();
2829  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
2830    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
2831
2832  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
2833  Cmp = Cmp.getOperand(0);
2834  Opc = Cmp.getOpcode();
2835  if (Opc == ARMISD::CMPFP)
2836    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
2837  else {
2838    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
2839    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
2840  }
2841  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
2842}
2843
2844SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2845  SDValue Cond = Op.getOperand(0);
2846  SDValue SelectTrue = Op.getOperand(1);
2847  SDValue SelectFalse = Op.getOperand(2);
2848  DebugLoc dl = Op.getDebugLoc();
2849
2850  // Convert:
2851  //
2852  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
2853  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
2854  //
2855  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
2856    const ConstantSDNode *CMOVTrue =
2857      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
2858    const ConstantSDNode *CMOVFalse =
2859      dyn_cast<ConstantSDNode>(Cond.getOperand(1));
2860
2861    if (CMOVTrue && CMOVFalse) {
2862      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
2863      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
2864
2865      SDValue True;
2866      SDValue False;
2867      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
2868        True = SelectTrue;
2869        False = SelectFalse;
2870      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
2871        True = SelectFalse;
2872        False = SelectTrue;
2873      }
2874
2875      if (True.getNode() && False.getNode()) {
2876        EVT VT = Op.getValueType();
2877        SDValue ARMcc = Cond.getOperand(2);
2878        SDValue CCR = Cond.getOperand(3);
2879        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
2880        assert(True.getValueType() == VT);
2881        return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
2882      }
2883    }
2884  }
2885
2886  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
2887  // undefined bits before doing a full-word comparison with zero.
2888  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
2889                     DAG.getConstant(1, Cond.getValueType()));
2890
2891  return DAG.getSelectCC(dl, Cond,
2892                         DAG.getConstant(0, Cond.getValueType()),
2893                         SelectTrue, SelectFalse, ISD::SETNE);
2894}
2895
2896SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
2897  EVT VT = Op.getValueType();
2898  SDValue LHS = Op.getOperand(0);
2899  SDValue RHS = Op.getOperand(1);
2900  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
2901  SDValue TrueVal = Op.getOperand(2);
2902  SDValue FalseVal = Op.getOperand(3);
2903  DebugLoc dl = Op.getDebugLoc();
2904
2905  if (LHS.getValueType() == MVT::i32) {
2906    SDValue ARMcc;
2907    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
2908    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
2909    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
2910  }
2911
2912  ARMCC::CondCodes CondCode, CondCode2;
2913  FPCCToARMCC(CC, CondCode, CondCode2);
2914
2915  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
2916  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
2917  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
2918  SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
2919                               ARMcc, CCR, Cmp);
2920  if (CondCode2 != ARMCC::AL) {
2921    SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
2922    // FIXME: Needs another CMP because flag can have but one use.
2923    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
2924    Result = DAG.getNode(ARMISD::CMOV, dl, VT,
2925                         Result, TrueVal, ARMcc2, CCR, Cmp2);
2926  }
2927  return Result;
2928}
2929
2930/// canChangeToInt - Given the fp compare operand, return true if it is suitable
2931/// to morph to an integer compare sequence.
2932static bool canChangeToInt(SDValue Op, bool &SeenZero,
2933                           const ARMSubtarget *Subtarget) {
2934  SDNode *N = Op.getNode();
2935  if (!N->hasOneUse())
2936    // Otherwise it requires moving the value from fp to integer registers.
2937    return false;
2938  if (!N->getNumValues())
2939    return false;
2940  EVT VT = Op.getValueType();
2941  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
2942    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
2943    // vmrs are very slow, e.g. cortex-a8.
2944    return false;
2945
2946  if (isFloatingPointZero(Op)) {
2947    SeenZero = true;
2948    return true;
2949  }
2950  return ISD::isNormalLoad(N);
2951}
2952
2953static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
2954  if (isFloatingPointZero(Op))
2955    return DAG.getConstant(0, MVT::i32);
2956
2957  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
2958    return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
2959                       Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
2960                       Ld->isVolatile(), Ld->isNonTemporal(),
2961                       Ld->isInvariant(), Ld->getAlignment());
2962
2963  llvm_unreachable("Unknown VFP cmp argument!");
2964}
2965
2966static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
2967                           SDValue &RetVal1, SDValue &RetVal2) {
2968  if (isFloatingPointZero(Op)) {
2969    RetVal1 = DAG.getConstant(0, MVT::i32);
2970    RetVal2 = DAG.getConstant(0, MVT::i32);
2971    return;
2972  }
2973
2974  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
2975    SDValue Ptr = Ld->getBasePtr();
2976    RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
2977                          Ld->getChain(), Ptr,
2978                          Ld->getPointerInfo(),
2979                          Ld->isVolatile(), Ld->isNonTemporal(),
2980                          Ld->isInvariant(), Ld->getAlignment());
2981
2982    EVT PtrType = Ptr.getValueType();
2983    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
2984    SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(),
2985                                 PtrType, Ptr, DAG.getConstant(4, PtrType));
2986    RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
2987                          Ld->getChain(), NewPtr,
2988                          Ld->getPointerInfo().getWithOffset(4),
2989                          Ld->isVolatile(), Ld->isNonTemporal(),
2990                          Ld->isInvariant(), NewAlign);
2991    return;
2992  }
2993
2994  llvm_unreachable("Unknown VFP cmp argument!");
2995}
2996
2997/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
2998/// f32 and even f64 comparisons to integer ones.
2999SDValue
3000ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
3001  SDValue Chain = Op.getOperand(0);
3002  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3003  SDValue LHS = Op.getOperand(2);
3004  SDValue RHS = Op.getOperand(3);
3005  SDValue Dest = Op.getOperand(4);
3006  DebugLoc dl = Op.getDebugLoc();
3007
3008  bool LHSSeenZero = false;
3009  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
3010  bool RHSSeenZero = false;
3011  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
3012  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
3013    // If unsafe fp math optimization is enabled and there are no other uses of
3014    // the CMP operands, and the condition code is EQ or NE, we can optimize it
3015    // to an integer comparison.
3016    if (CC == ISD::SETOEQ)
3017      CC = ISD::SETEQ;
3018    else if (CC == ISD::SETUNE)
3019      CC = ISD::SETNE;
3020
3021    SDValue Mask = DAG.getConstant(0x7fffffff, MVT::i32);
3022    SDValue ARMcc;
3023    if (LHS.getValueType() == MVT::f32) {
3024      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
3025                        bitcastf32Toi32(LHS, DAG), Mask);
3026      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
3027                        bitcastf32Toi32(RHS, DAG), Mask);
3028      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3029      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3030      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
3031                         Chain, Dest, ARMcc, CCR, Cmp);
3032    }
3033
3034    SDValue LHS1, LHS2;
3035    SDValue RHS1, RHS2;
3036    expandf64Toi32(LHS, DAG, LHS1, LHS2);
3037    expandf64Toi32(RHS, DAG, RHS1, RHS2);
3038    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
3039    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
3040    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
3041    ARMcc = DAG.getConstant(CondCode, MVT::i32);
3042    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
3043    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
3044    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7);
3045  }
3046
3047  return SDValue();
3048}
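// Example for OptimizeVFPBrcond above: with unsafe FP math enabled,
// "br (fcmp oeq float %x, 0.0)" where %x comes from a load can be turned into
// an integer compare of the raw bits against zero.  Both sides are ANDed with
// 0x7fffffff first, so a negative zero (bit pattern 0x80000000) still compares
// equal to +0.0.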
3049
3050SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
3051  SDValue Chain = Op.getOperand(0);
3052  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3053  SDValue LHS = Op.getOperand(2);
3054  SDValue RHS = Op.getOperand(3);
3055  SDValue Dest = Op.getOperand(4);
3056  DebugLoc dl = Op.getDebugLoc();
3057
3058  if (LHS.getValueType() == MVT::i32) {
3059    SDValue ARMcc;
3060    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
3061    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3062    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
3063                       Chain, Dest, ARMcc, CCR, Cmp);
3064  }
3065
3066  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
3067
3068  if (getTargetMachine().Options.UnsafeFPMath &&
3069      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
3070       CC == ISD::SETNE || CC == ISD::SETUNE)) {
3071    SDValue Result = OptimizeVFPBrcond(Op, DAG);
3072    if (Result.getNode())
3073      return Result;
3074  }
3075
3076  ARMCC::CondCodes CondCode, CondCode2;
3077  FPCCToARMCC(CC, CondCode, CondCode2);
3078
3079  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
3080  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
3081  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3082  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
3083  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
3084  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
3085  if (CondCode2 != ARMCC::AL) {
3086    ARMcc = DAG.getConstant(CondCode2, MVT::i32);
3087    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
3088    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
3089  }
3090  return Res;
3091}
3092
3093SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3094  SDValue Chain = Op.getOperand(0);
3095  SDValue Table = Op.getOperand(1);
3096  SDValue Index = Op.getOperand(2);
3097  DebugLoc dl = Op.getDebugLoc();
3098
3099  EVT PTy = getPointerTy();
3100  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
3101  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3102  SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
3103  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
3104  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
3105  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
3106  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
3107  if (Subtarget->isThumb2()) {
3108    // Thumb2 uses a two-level jump. That is, it jumps into the jump table
3109    // which does another jump to the destination. This also makes it easier
3110    // to translate it to TBB / TBH later.
3111    // FIXME: This might not work if the function is extremely large.
3112    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
3113                       Addr, Op.getOperand(2), JTI, UId);
3114  }
3115  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
3116    Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
3117                       MachinePointerInfo::getJumpTable(),
3118                       false, false, false, 0);
3119    Chain = Addr.getValue(1);
3120    Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
3121    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
3122  } else {
3123    Addr = DAG.getLoad(PTy, dl, Chain, Addr,
3124                       MachinePointerInfo::getJumpTable(),
3125                       false, false, false, 0);
3126    Chain = Addr.getValue(1);
3127    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
3128  }
3129}
3130
3131static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
3132  EVT VT = Op.getValueType();
3133  DebugLoc dl = Op.getDebugLoc();
3134
3135  if (Op.getValueType().getVectorElementType() == MVT::i32) {
3136    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
3137      return Op;
3138    return DAG.UnrollVectorOp(Op.getNode());
3139  }
3140
3141  assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
3142         "Invalid type for custom lowering!");
3143  if (VT != MVT::v4i16)
3144    return DAG.UnrollVectorOp(Op.getNode());
3145
3146  Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
3147  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
3148}
3149
3150static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
3151  EVT VT = Op.getValueType();
3152  if (VT.isVector())
3153    return LowerVectorFP_TO_INT(Op, DAG);
3154
3155  DebugLoc dl = Op.getDebugLoc();
3156  unsigned Opc;
3157
3158  switch (Op.getOpcode()) {
3159  default: llvm_unreachable("Invalid opcode!");
3160  case ISD::FP_TO_SINT:
3161    Opc = ARMISD::FTOSI;
3162    break;
3163  case ISD::FP_TO_UINT:
3164    Opc = ARMISD::FTOUI;
3165    break;
3166  }
3167  Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
3168  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
3169}
3170
3171static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
3172  EVT VT = Op.getValueType();
3173  DebugLoc dl = Op.getDebugLoc();
3174
3175  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
3176    if (VT.getVectorElementType() == MVT::f32)
3177      return Op;
3178    return DAG.UnrollVectorOp(Op.getNode());
3179  }
3180
3181  assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
3182         "Invalid type for custom lowering!");
3183  if (VT != MVT::v4f32)
3184    return DAG.UnrollVectorOp(Op.getNode());
3185
3186  unsigned CastOpc;
3187  unsigned Opc;
3188  switch (Op.getOpcode()) {
3189  default: llvm_unreachable("Invalid opcode!");
3190  case ISD::SINT_TO_FP:
3191    CastOpc = ISD::SIGN_EXTEND;
3192    Opc = ISD::SINT_TO_FP;
3193    break;
3194  case ISD::UINT_TO_FP:
3195    CastOpc = ISD::ZERO_EXTEND;
3196    Opc = ISD::UINT_TO_FP;
3197    break;
3198  }
3199
3200  Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
3201  return DAG.getNode(Opc, dl, VT, Op);
3202}
3203
3204static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
3205  EVT VT = Op.getValueType();
3206  if (VT.isVector())
3207    return LowerVectorINT_TO_FP(Op, DAG);
3208
3209  DebugLoc dl = Op.getDebugLoc();
3210  unsigned Opc;
3211
3212  switch (Op.getOpcode()) {
3213  default: llvm_unreachable("Invalid opcode!");
3214  case ISD::SINT_TO_FP:
3215    Opc = ARMISD::SITOF;
3216    break;
3217  case ISD::UINT_TO_FP:
3218    Opc = ARMISD::UITOF;
3219    break;
3220  }
3221
3222  Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
3223  return DAG.getNode(Opc, dl, VT, Op);
3224}
3225
3226SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
3227  // Implement fcopysign with a fabs and a conditional fneg.
3228  SDValue Tmp0 = Op.getOperand(0);
3229  SDValue Tmp1 = Op.getOperand(1);
3230  DebugLoc dl = Op.getDebugLoc();
3231  EVT VT = Op.getValueType();
3232  EVT SrcVT = Tmp1.getValueType();
3233  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
3234    Tmp0.getOpcode() == ARMISD::VMOVDRR;
3235  bool UseNEON = !InGPR && Subtarget->hasNEON();
3236
3237  if (UseNEON) {
3238    // Use VBSL to copy the sign bit.
3239    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
3240    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
3241                               DAG.getTargetConstant(EncodedVal, MVT::i32));
3242    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
3243    if (VT == MVT::f64)
3244      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
3245                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
3246                         DAG.getConstant(32, MVT::i32));
3247    else /*if (VT == MVT::f32)*/
3248      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
3249    if (SrcVT == MVT::f32) {
3250      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
3251      if (VT == MVT::f64)
3252        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
3253                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
3254                           DAG.getConstant(32, MVT::i32));
3255    } else if (VT == MVT::f32)
3256      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
3257                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
3258                         DAG.getConstant(32, MVT::i32));
3259    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
3260    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
3261
3262    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
3263                                            MVT::i32);
3264    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
3265    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
3266                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
3267
3268    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
3269                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
3270                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
3271    if (VT == MVT::f32) {
3272      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
3273      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
3274                        DAG.getConstant(0, MVT::i32));
3275    } else {
3276      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
3277    }
3278
3279    return Res;
3280  }
3281
3282  // Bitcast operand 1 to i32.
3283  if (SrcVT == MVT::f64)
3284    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
3285                       &Tmp1, 1).getValue(1);
3286  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
3287
3288  // Or in the signbit with integer operations.
3289  SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
3290  SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
3291  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
3292  if (VT == MVT::f32) {
3293    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
3294                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
3295    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
3296                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
3297  }
3298
3299  // f64: Or the high part with signbit and then combine two parts.
3300  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
3301                     &Tmp0, 1);
3302  SDValue Lo = Tmp0.getValue(0);
3303  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
3304  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
3305  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
3306}
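// A compact way to read the integer (non-NEON) path of LowerFCOPYSIGN above,
// for f32 (illustrative):
//
//   result = bitcast<f32>((bits(Tmp0) & 0x7fffffff) | (bits(Tmp1) & 0x80000000))
//
// i.e. keep the magnitude of the first operand and take only the sign bit of
// the second.  The f64 path does the same on the high word and then rebuilds
// the double with VMOVDRR.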
3307
3308SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
3309  MachineFunction &MF = DAG.getMachineFunction();
3310  MachineFrameInfo *MFI = MF.getFrameInfo();
3311  MFI->setReturnAddressIsTaken(true);
3312
3313  EVT VT = Op.getValueType();
3314  DebugLoc dl = Op.getDebugLoc();
3315  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3316  if (Depth) {
3317    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
3318    SDValue Offset = DAG.getConstant(4, MVT::i32);
3319    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
3320                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
3321                       MachinePointerInfo(), false, false, false, 0);
3322  }
3323
3324  // Return LR, which contains the return address. Mark it an implicit live-in.
3325  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3326  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
3327}
3328
3329SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
3330  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3331  MFI->setFrameAddressIsTaken(true);
3332
3333  EVT VT = Op.getValueType();
3334  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
3335  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3336  unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
3337    ? ARM::R7 : ARM::R11;
3338  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
3339  while (Depth--)
3340    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
3341                            MachinePointerInfo(),
3342                            false, false, false, 0);
3343  return FrameAddr;
3344}
3345
3346/// ExpandBITCAST - If the target supports VFP, this function is called to
3347/// expand a bit convert where either the source or destination type is i64 to
3348/// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
3349/// operand type is illegal (e.g., v2f32 for a target that doesn't support
3350/// vectors), since the legalizer won't know what to do with that.
3351static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
3352  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3353  DebugLoc dl = N->getDebugLoc();
3354  SDValue Op = N->getOperand(0);
3355
3356  // This function is only supposed to be called for i64 types, either as the
3357  // source or destination of the bit convert.
3358  EVT SrcVT = Op.getValueType();
3359  EVT DstVT = N->getValueType(0);
3360  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
3361         "ExpandBITCAST called for non-i64 type");
3362
3363  // Turn i64->f64 into VMOVDRR.
3364  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
3365    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
3366                             DAG.getConstant(0, MVT::i32));
3367    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
3368                             DAG.getConstant(1, MVT::i32));
3369    return DAG.getNode(ISD::BITCAST, dl, DstVT,
3370                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
3371  }
3372
3373  // Turn f64->i64 into VMOVRRD.
3374  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
3375    SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
3376                              DAG.getVTList(MVT::i32, MVT::i32), &Op, 1);
3377    // Merge the pieces into a single i64 value.
3378    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
3379  }
3380
3381  return SDValue();
3382}
3383
3384/// getZeroVector - Returns a vector of specified type with all zero elements.
3385/// Zero vectors are used to represent vector negation and in those cases
3386/// will be implemented with the NEON VNEG instruction.  However, VNEG does
3387/// not support i64 elements, so sometimes the zero vectors will need to be
3388/// explicitly constructed.  Regardless, use a canonical VMOV to create the
3389/// zero vector.
3390static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3391  assert(VT.isVector() && "Expected a vector type");
3392  // The canonical modified immediate encoding of a zero vector is....0!
3393  SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
3394  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
3395  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
3396  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
3397}
3398
3399/// LowerShiftRightParts - Lower SRA_PARTS (and SRL_PARTS), which takes a
3400/// 2 x i32 value to shift plus a shift amount and returns two i32 values.
3401SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
3402                                                SelectionDAG &DAG) const {
3403  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3404  EVT VT = Op.getValueType();
3405  unsigned VTBits = VT.getSizeInBits();
3406  DebugLoc dl = Op.getDebugLoc();
3407  SDValue ShOpLo = Op.getOperand(0);
3408  SDValue ShOpHi = Op.getOperand(1);
3409  SDValue ShAmt  = Op.getOperand(2);
3410  SDValue ARMcc;
3411  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
3412
3413  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
3414
3415  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
3416                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
3417  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
3418  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
3419                                   DAG.getConstant(VTBits, MVT::i32));
3420  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
3421  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3422  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
3423
3424  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3425  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
3426                          ARMcc, DAG, dl);
3427  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
3428  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
3429                           CCR, Cmp);
3430
3431  SDValue Ops[2] = { Lo, Hi };
3432  return DAG.getMergeValues(Ops, 2, dl);
3433}
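// Sketch of what LowerShiftRightParts above computes for a 64-bit logical
// shift right by s (illustrative):
//
//   Lo = (s - 32 >= 0) ? (ShOpHi >> (s - 32))
//                      : ((ShOpLo >> s) | (ShOpHi << (32 - s)))
//   Hi = ShOpHi >> s
//
// The choice between the two Lo candidates is made with an ARMISD::CMOV on the
// sign of (s - 32); the Hi computation relies on ARM register-shift behavior
// for amounts >= 32 (zero for LSR, sign bits for ASR).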
3434
3435/// LowerShiftLeftParts - Lower SHL_PARTS, which takes a 2 x i32 value to
3436/// shift plus a shift amount and returns two i32 values.
3437SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
3438                                               SelectionDAG &DAG) const {
3439  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3440  EVT VT = Op.getValueType();
3441  unsigned VTBits = VT.getSizeInBits();
3442  DebugLoc dl = Op.getDebugLoc();
3443  SDValue ShOpLo = Op.getOperand(0);
3444  SDValue ShOpHi = Op.getOperand(1);
3445  SDValue ShAmt  = Op.getOperand(2);
3446  SDValue ARMcc;
3447
3448  assert(Op.getOpcode() == ISD::SHL_PARTS);
3449  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
3450                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
3451  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
3452  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
3453                                   DAG.getConstant(VTBits, MVT::i32));
3454  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
3455  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
3456
3457  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3458  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3459  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
3460                          ARMcc, DAG, dl);
3461  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
3462  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
3463                           CCR, Cmp);
3464
3465  SDValue Ops[2] = { Lo, Hi };
3466  return DAG.getMergeValues(Ops, 2, dl);
3467}
3468
3469SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3470                                            SelectionDAG &DAG) const {
3471  // The rounding mode is in bits 23:22 of the FPSCR.
3472  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
3473  // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
3474  // so that the shift and the AND get folded into a bitfield extract.
3475  DebugLoc dl = Op.getDebugLoc();
3476  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
3477                              DAG.getConstant(Intrinsic::arm_get_fpscr,
3478                                              MVT::i32));
3479  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
3480                                  DAG.getConstant(1U << 22, MVT::i32));
3481  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
3482                              DAG.getConstant(22, MVT::i32));
3483  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
3484                     DAG.getConstant(3, MVT::i32));
3485}
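// Worked example for LowerFLT_ROUNDS_ above: if FPSCR[23:22] is 0b11
// (round toward zero), then ((FPSCR + (1 << 22)) >> 22) & 3 evaluates to
// (3 + 1) & 3 == 0, which is the FLT_ROUNDS value for "toward zero"; 0b00
// (round to nearest) maps to 1, matching the 0->1, 1->2, 2->3, 3->0 table.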
3486
3487static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
3488                         const ARMSubtarget *ST) {
3489  EVT VT = N->getValueType(0);
3490  DebugLoc dl = N->getDebugLoc();
3491
3492  if (!ST->hasV6T2Ops())
3493    return SDValue();
3494
3495  SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
3496  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
3497}
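// Worked example for the CTTZ lowering above: cttz(8) on a v6T2+ target is
// computed as ctlz(rbit(8)).  rbit(0x00000008) == 0x10000000, and
// ctlz(0x10000000) == 3, which is indeed the number of trailing zeros of 8.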
3498
3499static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
3500                          const ARMSubtarget *ST) {
3501  EVT VT = N->getValueType(0);
3502  DebugLoc dl = N->getDebugLoc();
3503
3504  if (!VT.isVector())
3505    return SDValue();
3506
3507  // Lower vector shifts on NEON to use VSHL.
3508  assert(ST->hasNEON() && "unexpected vector shift");
3509
3510  // Left shifts translate directly to the vshiftu intrinsic.
3511  if (N->getOpcode() == ISD::SHL)
3512    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
3513                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
3514                       N->getOperand(0), N->getOperand(1));
3515
3516  assert((N->getOpcode() == ISD::SRA ||
3517          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
3518
3519  // NEON uses the same intrinsics for both left and right shifts.  For
3520  // right shifts, the shift amounts are negative, so negate the vector of
3521  // shift amounts.
3522  EVT ShiftVT = N->getOperand(1).getValueType();
3523  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
3524                                     getZeroVector(ShiftVT, DAG, dl),
3525                                     N->getOperand(1));
3526  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
3527                             Intrinsic::arm_neon_vshifts :
3528                             Intrinsic::arm_neon_vshiftu);
3529  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
3530                     DAG.getConstant(vshiftInt, MVT::i32),
3531                     N->getOperand(0), NegatedCount);
3532}
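// Example for the vector shift lowering above: a right shift such as
// "lshr <4 x i32> %v, <i32 8, i32 8, i32 8, i32 8>" becomes a call to the
// arm_neon_vshiftu intrinsic with the per-lane shift amounts negated
// (<i32 -8, ...>), since the NEON register-shift form shifts right when given
// a negative count.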
3533
3534static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
3535                                const ARMSubtarget *ST) {
3536  EVT VT = N->getValueType(0);
3537  DebugLoc dl = N->getDebugLoc();
3538
3539  // We can get here for a node like i32 = ISD::SHL i32, i64
3540  if (VT != MVT::i64)
3541    return SDValue();
3542
3543  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
3544         "Unknown shift to lower!");
3545
3546  // We only lower SRA, SRL of 1 here, all others use generic lowering.
3547  if (!isa<ConstantSDNode>(N->getOperand(1)) ||
3548      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
3549    return SDValue();
3550
3551  // If we are in Thumb1 mode, we don't have RRX (Thumb2 does).
3552  if (ST->isThumb1Only()) return SDValue();
3553
3554  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
3555  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
3556                           DAG.getConstant(0, MVT::i32));
3557  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
3558                           DAG.getConstant(1, MVT::i32));
3559
3560  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
3561  // captures the result into a carry flag.
3562  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
3563  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1);
3564
3565  // The low part is an ARMISD::RRX operand, which shifts the carry in.
3566  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
3567
3568  // Merge the pieces into a single i64 value.
3569 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
3570}
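// Example for Expand64BitShift above: "lshr i64 %x, 1" in ARM mode becomes
// roughly
//
//   lsrs  hi, hi, #1    ; ARMISD::SRL_FLAG - shifted-out bit lands in carry
//   rrx   lo, lo        ; ARMISD::RRX      - carry is rotated into bit 31
//
// so the 64-bit shift by one costs two instructions and no temporary.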
3571
3572static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
3573  SDValue TmpOp0, TmpOp1;
3574  bool Invert = false;
3575  bool Swap = false;
3576  unsigned Opc = 0;
3577
3578  SDValue Op0 = Op.getOperand(0);
3579  SDValue Op1 = Op.getOperand(1);
3580  SDValue CC = Op.getOperand(2);
3581  EVT VT = Op.getValueType();
3582  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
3583  DebugLoc dl = Op.getDebugLoc();
3584
3585  if (Op.getOperand(1).getValueType().isFloatingPoint()) {
3586    switch (SetCCOpcode) {
3587    default: llvm_unreachable("Illegal FP comparison");
3588    case ISD::SETUNE:
3589    case ISD::SETNE:  Invert = true; // Fallthrough
3590    case ISD::SETOEQ:
3591    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
3592    case ISD::SETOLT:
3593    case ISD::SETLT: Swap = true; // Fallthrough
3594    case ISD::SETOGT:
3595    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
3596    case ISD::SETOLE:
3597    case ISD::SETLE:  Swap = true; // Fallthrough
3598    case ISD::SETOGE:
3599    case ISD::SETGE: Opc = ARMISD::VCGE; break;
3600    case ISD::SETUGE: Swap = true; // Fallthrough
3601    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
3602    case ISD::SETUGT: Swap = true; // Fallthrough
3603    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
3604    case ISD::SETUEQ: Invert = true; // Fallthrough
3605    case ISD::SETONE:
3606      // Expand this to (OLT | OGT).
3607      TmpOp0 = Op0;
3608      TmpOp1 = Op1;
3609      Opc = ISD::OR;
3610      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
3611      Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
3612      break;
3613    case ISD::SETUO: Invert = true; // Fallthrough
3614    case ISD::SETO:
3615      // Expand this to (OLT | OGE).
3616      TmpOp0 = Op0;
3617      TmpOp1 = Op1;
3618      Opc = ISD::OR;
3619      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
3620      Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
3621      break;
3622    }
3623  } else {
3624    // Integer comparisons.
3625    switch (SetCCOpcode) {
3626    default: llvm_unreachable("Illegal integer comparison");
3627    case ISD::SETNE:  Invert = true;
3628    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
3629    case ISD::SETLT:  Swap = true;
3630    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
3631    case ISD::SETLE:  Swap = true;
3632    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
3633    case ISD::SETULT: Swap = true;
3634    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
3635    case ISD::SETULE: Swap = true;
3636    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
3637    }
3638
3639    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
3640    if (Opc == ARMISD::VCEQ) {
3641
3642      SDValue AndOp;
3643      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
3644        AndOp = Op0;
3645      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
3646        AndOp = Op1;
3647
3648      // Ignore bitconvert.
3649      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
3650        AndOp = AndOp.getOperand(0);
3651
3652      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
3653        Opc = ARMISD::VTST;
3654        Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
3655        Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
3656        Invert = !Invert;
3657      }
3658    }
3659  }
3660
3661  if (Swap)
3662    std::swap(Op0, Op1);
3663
3664  // If one of the operands is a constant vector zero, attempt to fold the
3665  // comparison to a specialized compare-against-zero form.
3666  SDValue SingleOp;
3667  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
3668    SingleOp = Op0;
3669  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
3670    if (Opc == ARMISD::VCGE)
3671      Opc = ARMISD::VCLEZ;
3672    else if (Opc == ARMISD::VCGT)
3673      Opc = ARMISD::VCLTZ;
3674    SingleOp = Op1;
3675  }
3676
3677  SDValue Result;
3678  if (SingleOp.getNode()) {
3679    switch (Opc) {
3680    case ARMISD::VCEQ:
3681      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
3682    case ARMISD::VCGE:
3683      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
3684    case ARMISD::VCLEZ:
3685      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
3686    case ARMISD::VCGT:
3687      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
3688    case ARMISD::VCLTZ:
3689      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
3690    default:
3691      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
3692    }
3693  } else {
3694     Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
3695  }
3696
3697  if (Invert)
3698    Result = DAG.getNOT(dl, Result, VT);
3699
3700  return Result;
3701}
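// A few illustrative mappings for the vector setcc lowering above:
//   fcmp une              -> VCEQ followed by a vector NOT (Invert)
//   icmp slt              -> operands swapped, then VCGT
//   icmp sgt x, 0         -> VCGTZ (compare-against-zero form)
//   icmp ne (and x, y), 0 -> VTST x, y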
3702
3703/// isNEONModifiedImm - Check if the specified splat value corresponds to a
3704/// valid vector constant for a NEON instruction with a "modified immediate"
3705/// operand (e.g., VMOV).  If so, return the encoded value.
3706static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
3707                                 unsigned SplatBitSize, SelectionDAG &DAG,
3708                                 EVT &VT, bool is128Bits, NEONModImmType type) {
3709  unsigned OpCmode, Imm;
3710
3711  // SplatBitSize is set to the smallest size that splats the vector, so a
3712  // zero vector will always have SplatBitSize == 8.  However, NEON modified
3713  // immediate instructions other than VMOV do not support the 8-bit encoding
3714  // of a zero vector, and the default encoding of zero is supposed to be the
3715  // 32-bit version.
3716  if (SplatBits == 0)
3717    SplatBitSize = 32;
3718
3719  switch (SplatBitSize) {
3720  case 8:
3721    if (type != VMOVModImm)
3722      return SDValue();
3723    // Any 1-byte value is OK.  Op=0, Cmode=1110.
3724    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
3725    OpCmode = 0xe;
3726    Imm = SplatBits;
3727    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
3728    break;
3729
3730  case 16:
3731    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
3732    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
3733    if ((SplatBits & ~0xff) == 0) {
3734      // Value = 0x00nn: Op=x, Cmode=100x.
3735      OpCmode = 0x8;
3736      Imm = SplatBits;
3737      break;
3738    }
3739    if ((SplatBits & ~0xff00) == 0) {
3740      // Value = 0xnn00: Op=x, Cmode=101x.
3741      OpCmode = 0xa;
3742      Imm = SplatBits >> 8;
3743      break;
3744    }
3745    return SDValue();
3746
3747  case 32:
3748    // NEON's 32-bit VMOV supports splat values where:
3749    // * only one byte is nonzero, or
3750    // * the least significant byte is 0xff and the second byte is nonzero, or
3751    // * the least significant 2 bytes are 0xff and the third is nonzero.
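    // For example, the splat value 0x00ab0000 is encoded with Cmode=010x and
    // Imm=0xab, while 0x0000abff uses Cmode=1100 with Imm=0xab (the low 0xff
    // byte is implied by that Cmode).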
3752    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
3753    if ((SplatBits & ~0xff) == 0) {
3754      // Value = 0x000000nn: Op=x, Cmode=000x.
3755      OpCmode = 0;
3756      Imm = SplatBits;
3757      break;
3758    }
3759    if ((SplatBits & ~0xff00) == 0) {
3760      // Value = 0x0000nn00: Op=x, Cmode=001x.
3761      OpCmode = 0x2;
3762      Imm = SplatBits >> 8;
3763      break;
3764    }
3765    if ((SplatBits & ~0xff0000) == 0) {
3766      // Value = 0x00nn0000: Op=x, Cmode=010x.
3767      OpCmode = 0x4;
3768      Imm = SplatBits >> 16;
3769      break;
3770    }
3771    if ((SplatBits & ~0xff000000) == 0) {
3772      // Value = 0xnn000000: Op=x, Cmode=011x.
3773      OpCmode = 0x6;
3774      Imm = SplatBits >> 24;
3775      break;
3776    }
3777
3778    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
3779    if (type == OtherModImm) return SDValue();
3780
3781    if ((SplatBits & ~0xffff) == 0 &&
3782        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
3783      // Value = 0x0000nnff: Op=x, Cmode=1100.
3784      OpCmode = 0xc;
3785      Imm = SplatBits >> 8;
3786      SplatBits |= 0xff;
3787      break;
3788    }
3789
3790    if ((SplatBits & ~0xffffff) == 0 &&
3791        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
3792      // Value = 0x00nnffff: Op=x, Cmode=1101.
3793      OpCmode = 0xd;
3794      Imm = SplatBits >> 16;
3795      SplatBits |= 0xffff;
3796      break;
3797    }
3798
3799    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
3800    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
3801    // VMOV.I32.  A (very) minor optimization would be to replicate the value
3802    // and fall through here to test for a valid 64-bit splat.  But, then the
3803    // caller would also need to check and handle the change in size.
3804    return SDValue();
3805
3806  case 64: {
3807    if (type != VMOVModImm)
3808      return SDValue();
3809    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
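    // For example, the splat value 0x00ff00ff00ff00ff is encoded with Imm=0x55
    // (one immediate bit per byte, least significant byte first).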
3810    uint64_t BitMask = 0xff;
3811    uint64_t Val = 0;
3812    unsigned ImmMask = 1;
3813    Imm = 0;
3814    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
3815      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
3816        Val |= BitMask;
3817        Imm |= ImmMask;
3818      } else if ((SplatBits & BitMask) != 0) {
3819        return SDValue();
3820      }
3821      BitMask <<= 8;
3822      ImmMask <<= 1;
3823    }
3824    // Op=1, Cmode=1110.
3825    OpCmode = 0x1e;
3826    SplatBits = Val;
3827    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
3828    break;
3829  }
3830
3831  default:
3832    llvm_unreachable("unexpected size for isNEONModifiedImm");
3833  }
3834
3835  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
3836  return DAG.getTargetConstant(EncodedVal, MVT::i32);
3837}
3838
3839SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
3840                                           const ARMSubtarget *ST) const {
3841  if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16())
3842    return SDValue();
3843
3844  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
3845  assert(Op.getValueType() == MVT::f32 &&
3846         "ConstantFP custom lowering should only occur for f32.");
3847
3848  // Try splatting with a VMOV.f32...
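  // VMOV.f32 can only encode a small set of immediates (roughly +/- n/16 * 2^r
  // with n in [16,31] and r in [-3,4], e.g. 1.0, 0.5, -2.0); getFP32Imm
  // returns -1 for anything outside that set.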
3849  APFloat FPVal = CFP->getValueAPF();
3850  int ImmVal = ARM_AM::getFP32Imm(FPVal);
3851  if (ImmVal != -1) {
3852    DebugLoc DL = Op.getDebugLoc();
3853    SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
3854    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
3855                                      NewVal);
3856    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
3857                       DAG.getConstant(0, MVT::i32));
3858  }
3859
3860  // If that fails, try a VMOV.i32
3861  EVT VMovVT;
3862  unsigned iVal = FPVal.bitcastToAPInt().getZExtValue();
3863  SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false,
3864                                     VMOVModImm);
3865  if (NewVal != SDValue()) {
3866    DebugLoc DL = Op.getDebugLoc();
3867    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
3868                                      NewVal);
3869    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
3870                                       VecConstant);
3871    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
3872                       DAG.getConstant(0, MVT::i32));
3873  }
3874
3875  // Finally, try a VMVN.i32
3876  NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false,
3877                             VMVNModImm);
3878  if (NewVal != SDValue()) {
3879    DebugLoc DL = Op.getDebugLoc();
3880    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
3881    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
3882                                       VecConstant);
3883    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
3884                       DAG.getConstant(0, MVT::i32));
3885  }
3886
3887  return SDValue();
3888}
3889
3890
3891static bool isVEXTMask(ArrayRef<int> M, EVT VT,
3892                       bool &ReverseVEXT, unsigned &Imm) {
3893  unsigned NumElts = VT.getVectorNumElements();
3894  ReverseVEXT = false;
3895
3896  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
3897  if (M[0] < 0)
3898    return false;
3899
3900  Imm = M[0];
3901
3902  // If this is a VEXT shuffle, the immediate value is the index of the first
3903  // element.  The other shuffle indices must be the successive elements after
3904  // the first one.
3905  unsigned ExpectedElt = Imm;
3906  for (unsigned i = 1; i < NumElts; ++i) {
3907    // Increment the expected index.  If it wraps around, it may still be
3908    // a VEXT but the source vectors must be swapped.
3909    ExpectedElt += 1;
3910    if (ExpectedElt == NumElts * 2) {
3911      ExpectedElt = 0;
3912      ReverseVEXT = true;
3913    }
3914
3915    if (M[i] < 0) continue; // ignore UNDEF indices
3916    if (ExpectedElt != static_cast<unsigned>(M[i]))
3917      return false;
3918  }
3919
3920  // Adjust the index value if the source operands will be swapped.
3921  if (ReverseVEXT)
3922    Imm -= NumElts;
3923
3924  return true;
3925}
3926
3927/// isVREVMask - Check if a vector shuffle corresponds to a VREV
3928/// instruction with the specified blocksize.  (The order of the elements
3929/// within each block of the vector is reversed.)
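/// For example, on v8i8 a VREV64 corresponds to mask <7,6,5,4,3,2,1,0> and a
/// VREV32 to mask <3,2,1,0,7,6,5,4>.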
3930static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
3931  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
3932         "Only possible block sizes for VREV are: 16, 32, 64");
3933
3934  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3935  if (EltSz == 64)
3936    return false;
3937
3938  unsigned NumElts = VT.getVectorNumElements();
3939  unsigned BlockElts = M[0] + 1;
3940  // If the first shuffle index is UNDEF, be optimistic.
3941  if (M[0] < 0)
3942    BlockElts = BlockSize / EltSz;
3943
3944  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
3945    return false;
3946
3947  for (unsigned i = 0; i < NumElts; ++i) {
3948    if (M[i] < 0) continue; // ignore UNDEF indices
3949    if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
3950      return false;
3951  }
3952
3953  return true;
3954}
3955
3956static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
3957  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
3958  // range, then 0 is placed into the resulting vector. So pretty much any mask
3959  // of 8 elements can work here.
3960  return VT == MVT::v8i8 && M.size() == 8;
3961}
3962
3963static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
3964  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3965  if (EltSz == 64)
3966    return false;
3967
3968  unsigned NumElts = VT.getVectorNumElements();
3969  WhichResult = (M[0] == 0 ? 0 : 1);
3970  for (unsigned i = 0; i < NumElts; i += 2) {
3971    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
3972        (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult))
3973      return false;
3974  }
3975  return true;
3976}
3977
3978/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
3979/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
3980/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
3981static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
3982  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3983  if (EltSz == 64)
3984    return false;
3985
3986  unsigned NumElts = VT.getVectorNumElements();
3987  WhichResult = (M[0] == 0 ? 0 : 1);
3988  for (unsigned i = 0; i < NumElts; i += 2) {
3989    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
3990        (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult))
3991      return false;
3992  }
3993  return true;
3994}
3995
3996static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
3997  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3998  if (EltSz == 64)
3999    return false;
4000
4001  unsigned NumElts = VT.getVectorNumElements();
4002  WhichResult = (M[0] == 0 ? 0 : 1);
4003  for (unsigned i = 0; i != NumElts; ++i) {
4004    if (M[i] < 0) continue; // ignore UNDEF indices
4005    if ((unsigned) M[i] != 2 * i + WhichResult)
4006      return false;
4007  }
4008
4009  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4010  if (VT.is64BitVector() && EltSz == 32)
4011    return false;
4012
4013  return true;
4014}
4015
4016/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
4017/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4018/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
4019static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
4020  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4021  if (EltSz == 64)
4022    return false;
4023
4024  unsigned Half = VT.getVectorNumElements() / 2;
4025  WhichResult = (M[0] == 0 ? 0 : 1);
4026  for (unsigned j = 0; j != 2; ++j) {
4027    unsigned Idx = WhichResult;
4028    for (unsigned i = 0; i != Half; ++i) {
4029      int MIdx = M[i + j * Half];
4030      if (MIdx >= 0 && (unsigned) MIdx != Idx)
4031        return false;
4032      Idx += 2;
4033    }
4034  }
4035
4036  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4037  if (VT.is64BitVector() && EltSz == 32)
4038    return false;
4039
4040  return true;
4041}
4042
4043static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
4044  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4045  if (EltSz == 64)
4046    return false;
4047
4048  unsigned NumElts = VT.getVectorNumElements();
4049  WhichResult = (M[0] == 0 ? 0 : 1);
4050  unsigned Idx = WhichResult * NumElts / 2;
4051  for (unsigned i = 0; i != NumElts; i += 2) {
4052    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
4053        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts))
4054      return false;
4055    Idx += 1;
4056  }
4057
4058  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4059  if (VT.is64BitVector() && EltSz == 32)
4060    return false;
4061
4062  return true;
4063}
4064
4065/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
4066/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
4067/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
4068static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
4069  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
4070  if (EltSz == 64)
4071    return false;
4072
4073  unsigned NumElts = VT.getVectorNumElements();
4074  WhichResult = (M[0] == 0 ? 0 : 1);
4075  unsigned Idx = WhichResult * NumElts / 2;
4076  for (unsigned i = 0; i != NumElts; i += 2) {
4077    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
4078        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx))
4079      return false;
4080    Idx += 1;
4081  }
4082
4083  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
4084  if (VT.is64BitVector() && EltSz == 32)
4085    return false;
4086
4087  return true;
4088}
4089
4090// If N is an integer constant that can be moved into a register in one
4091// instruction, return an SDValue of such a constant (will become a MOV
4092// instruction).  Otherwise return null.
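// For example, on Thumb1 this accepts values that fit in 8 bits (or whose
// bitwise complement does); in ARM mode it accepts any so_imm, i.e. an 8-bit
// value rotated right by an even amount (such as 0x0000ff00), or the
// complement of one.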
4093static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
4094                                     const ARMSubtarget *ST, DebugLoc dl) {
4095  uint64_t Val;
4096  if (!isa<ConstantSDNode>(N))
4097    return SDValue();
4098  Val = cast<ConstantSDNode>(N)->getZExtValue();
4099
4100  if (ST->isThumb1Only()) {
4101    if (Val <= 255 || ~Val <= 255)
4102      return DAG.getConstant(Val, MVT::i32);
4103  } else {
4104    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
4105      return DAG.getConstant(Val, MVT::i32);
4106  }
4107  return SDValue();
4108}
4109
4110// If this is a case we can't handle, return null and let the default
4111// expansion code take care of it.
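// The overall strategy: try the NEON modified-immediate forms first (VMOV/VMVN
// and, for f32 splats, VMOVFPIMM), then SCALAR_TO_VECTOR or VDUP when only one
// element or value is used; remaining all-constant vectors fall back to the
// constant pool, and non-constant ones are built via shuffle reconstruction or
// an ARMISD::BUILD_VECTOR of 32/64-bit elements.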
4112SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
4113                                             const ARMSubtarget *ST) const {
4114  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
4115  DebugLoc dl = Op.getDebugLoc();
4116  EVT VT = Op.getValueType();
4117
4118  APInt SplatBits, SplatUndef;
4119  unsigned SplatBitSize;
4120  bool HasAnyUndefs;
4121  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
4122    if (SplatBitSize <= 64) {
4123      // Check if an immediate VMOV works.
4124      EVT VmovVT;
4125      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
4126                                      SplatUndef.getZExtValue(), SplatBitSize,
4127                                      DAG, VmovVT, VT.is128BitVector(),
4128                                      VMOVModImm);
4129      if (Val.getNode()) {
4130        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
4131        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4132      }
4133
4134      // Try an immediate VMVN.
4135      uint64_t NegatedImm = (~SplatBits).getZExtValue();
4136      Val = isNEONModifiedImm(NegatedImm,
4137                                      SplatUndef.getZExtValue(), SplatBitSize,
4138                                      DAG, VmovVT, VT.is128BitVector(),
4139                                      VMVNModImm);
4140      if (Val.getNode()) {
4141        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
4142        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
4143      }
4144
4145      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
4146      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
4147        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
4148        if (ImmVal != -1) {
4149          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
4150          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
4151        }
4152      }
4153    }
4154  }
4155
4156  // Scan through the operands to see if only one value is used.
4157  unsigned NumElts = VT.getVectorNumElements();
4158  bool isOnlyLowElement = true;
4159  bool usesOnlyOneValue = true;
4160  bool isConstant = true;
4161  SDValue Value;
4162  for (unsigned i = 0; i < NumElts; ++i) {
4163    SDValue V = Op.getOperand(i);
4164    if (V.getOpcode() == ISD::UNDEF)
4165      continue;
4166    if (i > 0)
4167      isOnlyLowElement = false;
4168    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
4169      isConstant = false;
4170
4171    if (!Value.getNode())
4172      Value = V;
4173    else if (V != Value)
4174      usesOnlyOneValue = false;
4175  }
4176
4177  if (!Value.getNode())
4178    return DAG.getUNDEF(VT);
4179
4180  if (isOnlyLowElement)
4181    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
4182
4183  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4184
4185  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
4186  // i32 and try again.
4187  if (usesOnlyOneValue && EltSize <= 32) {
4188    if (!isConstant)
4189      return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
4190    if (VT.getVectorElementType().isFloatingPoint()) {
4191      SmallVector<SDValue, 8> Ops;
4192      for (unsigned i = 0; i < NumElts; ++i)
4193        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
4194                                  Op.getOperand(i)));
4195      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
4196      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
4197      Val = LowerBUILD_VECTOR(Val, DAG, ST);
4198      if (Val.getNode())
4199        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
4200    }
4201    SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
4202    if (Val.getNode())
4203      return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
4204  }
4205
4206  // If all elements are constants and the case above didn't get hit, fall back
4207  // to the default expansion, which will generate a load from the constant
4208  // pool.
4209  if (isConstant)
4210    return SDValue();
4211
4212  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
4213  if (NumElts >= 4) {
4214    SDValue shuffle = ReconstructShuffle(Op, DAG);
4215    if (shuffle != SDValue())
4216      return shuffle;
4217  }
4218
4219  // Vectors with 32- or 64-bit elements can be built by directly assigning
4220  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
4221  // will be legalized.
4222  if (EltSize >= 32) {
4223    // Do the expansion with floating-point types, since that is what the VFP
4224    // registers are defined to use, and since i64 is not legal.
4225    EVT EltVT = EVT::getFloatingPointVT(EltSize);
4226    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
4227    SmallVector<SDValue, 8> Ops;
4228    for (unsigned i = 0; i < NumElts; ++i)
4229      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
4230    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
4231    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
4232  }
4233
4234  return SDValue();
4235}
4236
4237// Gather data to see if the operation can be modelled as a
4238// shuffle in combination with VEXTs.
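// For example, a v4i16 BUILD_VECTOR whose operands are elements 3..6 of a
// single v8i16 source becomes a VEXT of that source's two halves (offset 3)
// feeding a vector_shuffle, provided the resulting mask is legal.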
4239SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
4240                                              SelectionDAG &DAG) const {
4241  DebugLoc dl = Op.getDebugLoc();
4242  EVT VT = Op.getValueType();
4243  unsigned NumElts = VT.getVectorNumElements();
4244
4245  SmallVector<SDValue, 2> SourceVecs;
4246  SmallVector<unsigned, 2> MinElts;
4247  SmallVector<unsigned, 2> MaxElts;
4248
4249  for (unsigned i = 0; i < NumElts; ++i) {
4250    SDValue V = Op.getOperand(i);
4251    if (V.getOpcode() == ISD::UNDEF)
4252      continue;
4253    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
4254      // A shuffle can only come from building a vector from various
4255      // elements of other vectors.
4256      return SDValue();
4257    } else if (V.getOperand(0).getValueType().getVectorElementType() !=
4258               VT.getVectorElementType()) {
4259      // This code doesn't know how to handle shuffles where the vector
4260      // element types do not match (this happens because type legalization
4261      // promotes the return type of EXTRACT_VECTOR_ELT).
4262      // FIXME: It might be appropriate to extend this code to handle
4263      // mismatched types.
4264      return SDValue();
4265    }
4266
4267    // Record this extraction against the appropriate vector if possible...
4268    SDValue SourceVec = V.getOperand(0);
4269    // If the element number isn't a constant, we can't effectively
4270    // analyze what's going on.
4271    if (!isa<ConstantSDNode>(V.getOperand(1)))
4272      return SDValue();
4273    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
4274    bool FoundSource = false;
4275    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
4276      if (SourceVecs[j] == SourceVec) {
4277        if (MinElts[j] > EltNo)
4278          MinElts[j] = EltNo;
4279        if (MaxElts[j] < EltNo)
4280          MaxElts[j] = EltNo;
4281        FoundSource = true;
4282        break;
4283      }
4284    }
4285
4286    // Or record a new source if not...
4287    if (!FoundSource) {
4288      SourceVecs.push_back(SourceVec);
4289      MinElts.push_back(EltNo);
4290      MaxElts.push_back(EltNo);
4291    }
4292  }
4293
4294  // Currently we only do something sane when at most two source vectors
4295  // are involved.
4296  if (SourceVecs.size() > 2)
4297    return SDValue();
4298
4299  SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
4300  int VEXTOffsets[2] = {0, 0};
4301
4302  // This loop extracts the usage patterns of the source vectors
4303  // and prepares appropriate SDValues for a shuffle if possible.
4304  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
4305    if (SourceVecs[i].getValueType() == VT) {
4306      // No VEXT necessary
4307      ShuffleSrcs[i] = SourceVecs[i];
4308      VEXTOffsets[i] = 0;
4309      continue;
4310    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
4311      // It probably isn't worth padding out a smaller vector just to
4312      // break it down again in a shuffle.
4313      return SDValue();
4314    }
4315
4316    // Since only 64-bit and 128-bit vectors are legal on ARM and
4317    // we've eliminated the other cases...
4318    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
4319           "unexpected vector sizes in ReconstructShuffle");
4320
4321    if (MaxElts[i] - MinElts[i] >= NumElts) {
4322      // Span too large for a VEXT to cope
4323      return SDValue();
4324    }
4325
4326    if (MinElts[i] >= NumElts) {
4327      // The extraction can just take the second half
4328      VEXTOffsets[i] = NumElts;
4329      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
4330                                   SourceVecs[i],
4331                                   DAG.getIntPtrConstant(NumElts));
4332    } else if (MaxElts[i] < NumElts) {
4333      // The extraction can just take the first half
4334      VEXTOffsets[i] = 0;
4335      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
4336                                   SourceVecs[i],
4337                                   DAG.getIntPtrConstant(0));
4338    } else {
4339      // An actual VEXT is needed
4340      VEXTOffsets[i] = MinElts[i];
4341      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
4342                                     SourceVecs[i],
4343                                     DAG.getIntPtrConstant(0));
4344      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
4345                                     SourceVecs[i],
4346                                     DAG.getIntPtrConstant(NumElts));
4347      ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
4348                                   DAG.getConstant(VEXTOffsets[i], MVT::i32));
4349    }
4350  }
4351
4352  SmallVector<int, 8> Mask;
4353
4354  for (unsigned i = 0; i < NumElts; ++i) {
4355    SDValue Entry = Op.getOperand(i);
4356    if (Entry.getOpcode() == ISD::UNDEF) {
4357      Mask.push_back(-1);
4358      continue;
4359    }
4360
4361    SDValue ExtractVec = Entry.getOperand(0);
4362    int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
4363                                          .getOperand(1))->getSExtValue();
4364    if (ExtractVec == SourceVecs[0]) {
4365      Mask.push_back(ExtractElt - VEXTOffsets[0]);
4366    } else {
4367      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
4368    }
4369  }
4370
4371  // Final check before we try to produce nonsense...
4372  if (isShuffleMaskLegal(Mask, VT))
4373    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
4374                                &Mask[0]);
4375
4376  return SDValue();
4377}
4378
4379/// isShuffleMaskLegal - Targets can use this to indicate that they only
4380/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
4381/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
4382/// are assumed to be legal.
4383bool
4384ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
4385                                      EVT VT) const {
4386  if (VT.getVectorNumElements() == 4 &&
4387      (VT.is128BitVector() || VT.is64BitVector())) {
4388    unsigned PFIndexes[4];
4389    for (unsigned i = 0; i != 4; ++i) {
4390      if (M[i] < 0)
4391        PFIndexes[i] = 8;
4392      else
4393        PFIndexes[i] = M[i];
4394    }
4395
4396    // Compute the index in the perfect shuffle table.
4397    unsigned PFTableIndex =
4398      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
4399    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
4400    unsigned Cost = (PFEntry >> 30);
4401
4402    if (Cost <= 4)
4403      return true;
4404  }
4405
4406  bool ReverseVEXT;
4407  unsigned Imm, WhichResult;
4408
4409  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4410  return (EltSize >= 32 ||
4411          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
4412          isVREVMask(M, VT, 64) ||
4413          isVREVMask(M, VT, 32) ||
4414          isVREVMask(M, VT, 16) ||
4415          isVEXTMask(M, VT, ReverseVEXT, Imm) ||
4416          isVTBLMask(M, VT) ||
4417          isVTRNMask(M, VT, WhichResult) ||
4418          isVUZPMask(M, VT, WhichResult) ||
4419          isVZIPMask(M, VT, WhichResult) ||
4420          isVTRN_v_undef_Mask(M, VT, WhichResult) ||
4421          isVUZP_v_undef_Mask(M, VT, WhichResult) ||
4422          isVZIP_v_undef_Mask(M, VT, WhichResult));
4423}
4424
4425/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
4426/// the specified operations to build the shuffle.
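/// Each table entry packs a cost in bits [31:30], an opcode in bits [29:26],
/// and two 13-bit operand IDs; an ID encodes a 4-element mask in base 9 as
/// ((i0*9+i1)*9+i2)*9+i3, with 8 standing for an undef index.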
4427static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
4428                                      SDValue RHS, SelectionDAG &DAG,
4429                                      DebugLoc dl) {
4430  unsigned OpNum = (PFEntry >> 26) & 0x0F;
4431  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
4432  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
4433
4434  enum {
4435    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
4436    OP_VREV,
4437    OP_VDUP0,
4438    OP_VDUP1,
4439    OP_VDUP2,
4440    OP_VDUP3,
4441    OP_VEXT1,
4442    OP_VEXT2,
4443    OP_VEXT3,
4444    OP_VUZPL, // VUZP, left result
4445    OP_VUZPR, // VUZP, right result
4446    OP_VZIPL, // VZIP, left result
4447    OP_VZIPR, // VZIP, right result
4448    OP_VTRNL, // VTRN, left result
4449    OP_VTRNR  // VTRN, right result
4450  };
4451
4452  if (OpNum == OP_COPY) {
4453    if (LHSID == (1*9+2)*9+3) return LHS;
4454    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
4455    return RHS;
4456  }
4457
4458  SDValue OpLHS, OpRHS;
4459  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
4460  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
4461  EVT VT = OpLHS.getValueType();
4462
4463  switch (OpNum) {
4464  default: llvm_unreachable("Unknown shuffle opcode!");
4465  case OP_VREV:
4466    // VREV divides the vector in half and swaps within the half.
4467    if (VT.getVectorElementType() == MVT::i32 ||
4468        VT.getVectorElementType() == MVT::f32)
4469      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
4470    // vrev <4 x i16> -> VREV32
4471    if (VT.getVectorElementType() == MVT::i16)
4472      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
4473    // vrev <4 x i8> -> VREV16
4474    assert(VT.getVectorElementType() == MVT::i8);
4475    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
4476  case OP_VDUP0:
4477  case OP_VDUP1:
4478  case OP_VDUP2:
4479  case OP_VDUP3:
4480    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
4481                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
4482  case OP_VEXT1:
4483  case OP_VEXT2:
4484  case OP_VEXT3:
4485    return DAG.getNode(ARMISD::VEXT, dl, VT,
4486                       OpLHS, OpRHS,
4487                       DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
4488  case OP_VUZPL:
4489  case OP_VUZPR:
4490    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
4491                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
4492  case OP_VZIPL:
4493  case OP_VZIPR:
4494    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
4495                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
4496  case OP_VTRNL:
4497  case OP_VTRNR:
4498    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
4499                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
4500  }
4501}
4502
4503static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
4504                                       ArrayRef<int> ShuffleMask,
4505                                       SelectionDAG &DAG) {
4506  // Check to see if we can use the VTBL instruction.
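  // VTBL selects result bytes by index from the table register(s) and yields 0
  // for out-of-range indices, so the shuffle mask can be used directly as the
  // index vector (indices 8..15 select bytes from V2 when VTBL2 is used).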
4507  SDValue V1 = Op.getOperand(0);
4508  SDValue V2 = Op.getOperand(1);
4509  DebugLoc DL = Op.getDebugLoc();
4510
4511  SmallVector<SDValue, 8> VTBLMask;
4512  for (ArrayRef<int>::iterator
4513         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
4514    VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
4515
4516  if (V2.getNode()->getOpcode() == ISD::UNDEF)
4517    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
4518                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
4519                                   &VTBLMask[0], 8));
4520
4521  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
4522                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
4523                                 &VTBLMask[0], 8));
4524}
4525
4526static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
4527  SDValue V1 = Op.getOperand(0);
4528  SDValue V2 = Op.getOperand(1);
4529  DebugLoc dl = Op.getDebugLoc();
4530  EVT VT = Op.getValueType();
4531  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
4532
4533  // Convert shuffles that are directly supported on NEON to target-specific
4534  // DAG nodes, instead of keeping them as shuffles and matching them again
4535  // during code selection.  This is more efficient and avoids the possibility
4536  // of inconsistencies between legalization and selection.
4537  // FIXME: floating-point vectors should be canonicalized to integer vectors
4538  // of the same size so that they get CSEd properly.
4539  ArrayRef<int> ShuffleMask = SVN->getMask();
4540
4541  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4542  if (EltSize <= 32) {
4543    if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
4544      int Lane = SVN->getSplatIndex();
4545      // If this is an undef splat, generate it via a plain VDUP, if possible.
4546      if (Lane == -1) Lane = 0;
4547
4548      // Test if V1 is a SCALAR_TO_VECTOR.
4549      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4550        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
4551      }
4552      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
4553      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
4554      // reaches it).
4555      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
4556          !isa<ConstantSDNode>(V1.getOperand(0))) {
4557        bool IsScalarToVector = true;
4558        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
4559          if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
4560            IsScalarToVector = false;
4561            break;
4562          }
4563        if (IsScalarToVector)
4564          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
4565      }
4566      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
4567                         DAG.getConstant(Lane, MVT::i32));
4568    }
4569
4570    bool ReverseVEXT;
4571    unsigned Imm;
4572    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
4573      if (ReverseVEXT)
4574        std::swap(V1, V2);
4575      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
4576                         DAG.getConstant(Imm, MVT::i32));
4577    }
4578
4579    if (isVREVMask(ShuffleMask, VT, 64))
4580      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
4581    if (isVREVMask(ShuffleMask, VT, 32))
4582      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
4583    if (isVREVMask(ShuffleMask, VT, 16))
4584      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
4585
4586    // Check for Neon shuffles that modify both input vectors in place.
4587    // If both results are used, i.e., if there are two shuffles with the same
4588    // source operands and with masks corresponding to both results of one of
4589    // these operations, DAG memoization will ensure that a single node is
4590    // used for both shuffles.
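    // For example, two v4i32 shuffles of the same operands with masks
    // <0,4,2,6> and <1,5,3,7> both map to a single VTRN node and use result
    // values 0 and 1, respectively.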
4591    unsigned WhichResult;
4592    if (isVTRNMask(ShuffleMask, VT, WhichResult))
4593      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
4594                         V1, V2).getValue(WhichResult);
4595    if (isVUZPMask(ShuffleMask, VT, WhichResult))
4596      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
4597                         V1, V2).getValue(WhichResult);
4598    if (isVZIPMask(ShuffleMask, VT, WhichResult))
4599      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
4600                         V1, V2).getValue(WhichResult);
4601
4602    if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
4603      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
4604                         V1, V1).getValue(WhichResult);
4605    if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
4606      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
4607                         V1, V1).getValue(WhichResult);
4608    if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
4609      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
4610                         V1, V1).getValue(WhichResult);
4611  }
4612
4613  // If the shuffle is not directly supported and it has 4 elements, use
4614  // the PerfectShuffle-generated table to synthesize it from other shuffles.
4615  unsigned NumElts = VT.getVectorNumElements();
4616  if (NumElts == 4) {
4617    unsigned PFIndexes[4];
4618    for (unsigned i = 0; i != 4; ++i) {
4619      if (ShuffleMask[i] < 0)
4620        PFIndexes[i] = 8;
4621      else
4622        PFIndexes[i] = ShuffleMask[i];
4623    }
4624
4625    // Compute the index in the perfect shuffle table.
4626    unsigned PFTableIndex =
4627      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
4628    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
4629    unsigned Cost = (PFEntry >> 30);
4630
4631    if (Cost <= 4)
4632      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
4633  }
4634
4635  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
4636  if (EltSize >= 32) {
4637    // Do the expansion with floating-point types, since that is what the VFP
4638    // registers are defined to use, and since i64 is not legal.
4639    EVT EltVT = EVT::getFloatingPointVT(EltSize);
4640    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
4641    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
4642    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
4643    SmallVector<SDValue, 8> Ops;
4644    for (unsigned i = 0; i < NumElts; ++i) {
4645      if (ShuffleMask[i] < 0)
4646        Ops.push_back(DAG.getUNDEF(EltVT));
4647      else
4648        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
4649                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
4650                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
4651                                                  MVT::i32)));
4652    }
4653    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
4654    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
4655  }
4656
4657  if (VT == MVT::v8i8) {
4658    SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
4659    if (NewOp.getNode())
4660      return NewOp;
4661  }
4662
4663  return SDValue();
4664}
4665
4666static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4667  // INSERT_VECTOR_ELT is legal only for immediate indexes.
4668  SDValue Lane = Op.getOperand(2);
4669  if (!isa<ConstantSDNode>(Lane))
4670    return SDValue();
4671
4672  return Op;
4673}
4674
4675static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4676  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
4677  SDValue Lane = Op.getOperand(1);
4678  if (!isa<ConstantSDNode>(Lane))
4679    return SDValue();
4680
4681  SDValue Vec = Op.getOperand(0);
4682  if (Op.getValueType() == MVT::i32 &&
4683      Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
4684    DebugLoc dl = Op.getDebugLoc();
4685    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
4686  }
4687
4688  return Op;
4689}
4690
4691static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
4692  // The only time a CONCAT_VECTORS operation can have legal types is when
4693  // two 64-bit vectors are concatenated to a 128-bit vector.
4694  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
4695         "unexpected CONCAT_VECTORS");
4696  DebugLoc dl = Op.getDebugLoc();
4697  SDValue Val = DAG.getUNDEF(MVT::v2f64);
4698  SDValue Op0 = Op.getOperand(0);
4699  SDValue Op1 = Op.getOperand(1);
4700  if (Op0.getOpcode() != ISD::UNDEF)
4701    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
4702                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
4703                      DAG.getIntPtrConstant(0));
4704  if (Op1.getOpcode() != ISD::UNDEF)
4705    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
4706                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
4707                      DAG.getIntPtrConstant(1));
4708  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
4709}
4710
4711/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
4712/// element has been zero/sign-extended, depending on the isSigned parameter,
4713/// from an integer type half its size.
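/// For example, a v4i32 BUILD_VECTOR whose constants all fit in 16 bits
/// (unsigned, or signed when isSigned is set) counts as extended from v4i16.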
4714static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
4715                                   bool isSigned) {
4716  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
4717  EVT VT = N->getValueType(0);
4718  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
4719    SDNode *BVN = N->getOperand(0).getNode();
4720    if (BVN->getValueType(0) != MVT::v4i32 ||
4721        BVN->getOpcode() != ISD::BUILD_VECTOR)
4722      return false;
4723    unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
4724    unsigned HiElt = 1 - LoElt;
4725    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
4726    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
4727    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
4728    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
4729    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
4730      return false;
4731    if (isSigned) {
4732      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
4733          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
4734        return true;
4735    } else {
4736      if (Hi0->isNullValue() && Hi1->isNullValue())
4737        return true;
4738    }
4739    return false;
4740  }
4741
4742  if (N->getOpcode() != ISD::BUILD_VECTOR)
4743    return false;
4744
4745  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
4746    SDNode *Elt = N->getOperand(i).getNode();
4747    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4748      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4749      unsigned HalfSize = EltSize / 2;
4750      if (isSigned) {
4751        if (!isIntN(HalfSize, C->getSExtValue()))
4752          return false;
4753      } else {
4754        if (!isUIntN(HalfSize, C->getZExtValue()))
4755          return false;
4756      }
4757      continue;
4758    }
4759    return false;
4760  }
4761
4762  return true;
4763}
4764
4765/// isSignExtended - Check if a node is a vector value that is sign-extended
4766/// or a constant BUILD_VECTOR with sign-extended elements.
4767static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
4768  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
4769    return true;
4770  if (isExtendedBUILD_VECTOR(N, DAG, true))
4771    return true;
4772  return false;
4773}
4774
4775/// isZeroExtended - Check if a node is a vector value that is zero-extended
4776/// or a constant BUILD_VECTOR with zero-extended elements.
4777static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
4778  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
4779    return true;
4780  if (isExtendedBUILD_VECTOR(N, DAG, false))
4781    return true;
4782  return false;
4783}
4784
4785/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending
4786/// load, or BUILD_VECTOR with extended elements, return the unextended value.
4787static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
4788  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
4789    return N->getOperand(0);
4790  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
4791    return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
4792                       LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
4793                       LD->isNonTemporal(), LD->isInvariant(),
4794                       LD->getAlignment());
4795  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
4796  // have been legalized as a BITCAST from v4i32.
4797  if (N->getOpcode() == ISD::BITCAST) {
4798    SDNode *BVN = N->getOperand(0).getNode();
4799    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
4800           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
4801    unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
4802    return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32,
4803                       BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
4804  }
4805  // Construct a new BUILD_VECTOR with elements truncated to half the size.
4806  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4807  EVT VT = N->getValueType(0);
4808  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
4809  unsigned NumElts = VT.getVectorNumElements();
4810  MVT TruncVT = MVT::getIntegerVT(EltSize);
4811  SmallVector<SDValue, 8> Ops;
4812  for (unsigned i = 0; i != NumElts; ++i) {
4813    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
4814    const APInt &CInt = C->getAPIntValue();
4815    // Element types smaller than 32 bits are not legal, so use i32 elements.
4816    // The values are implicitly truncated so sext vs. zext doesn't matter.
4817    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
4818  }
4819  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
4820                     MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
4821}
4822
4823static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
4824  unsigned Opcode = N->getOpcode();
4825  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4826    SDNode *N0 = N->getOperand(0).getNode();
4827    SDNode *N1 = N->getOperand(1).getNode();
4828    return N0->hasOneUse() && N1->hasOneUse() &&
4829      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4830  }
4831  return false;
4832}
4833
4834static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
4835  unsigned Opcode = N->getOpcode();
4836  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4837    SDNode *N0 = N->getOperand(0).getNode();
4838    SDNode *N1 = N->getOperand(1).getNode();
4839    return N0->hasOneUse() && N1->hasOneUse() &&
4840      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4841  }
4842  return false;
4843}
4844
4845static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
4846  // Multiplications are only custom-lowered for 128-bit vectors so that
4847  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
4848  EVT VT = Op.getValueType();
4849  assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL");
4850  SDNode *N0 = Op.getOperand(0).getNode();
4851  SDNode *N1 = Op.getOperand(1).getNode();
4852  unsigned NewOpc = 0;
4853  bool isMLA = false;
4854  bool isN0SExt = isSignExtended(N0, DAG);
4855  bool isN1SExt = isSignExtended(N1, DAG);
4856  if (isN0SExt && isN1SExt)
4857    NewOpc = ARMISD::VMULLs;
4858  else {
4859    bool isN0ZExt = isZeroExtended(N0, DAG);
4860    bool isN1ZExt = isZeroExtended(N1, DAG);
4861    if (isN0ZExt && isN1ZExt)
4862      NewOpc = ARMISD::VMULLu;
4863    else if (isN1SExt || isN1ZExt) {
4864      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4865      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4866      if (isN1SExt && isAddSubSExt(N0, DAG)) {
4867        NewOpc = ARMISD::VMULLs;
4868        isMLA = true;
4869      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
4870        NewOpc = ARMISD::VMULLu;
4871        isMLA = true;
4872      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
4873        std::swap(N0, N1);
4874        NewOpc = ARMISD::VMULLu;
4875        isMLA = true;
4876      }
4877    }
4878
4879    if (!NewOpc) {
4880      if (VT == MVT::v2i64)
4881        // Fall through to expand this.  It is not legal.
4882        return SDValue();
4883      else
4884        // Other vector multiplications are legal.
4885        return Op;
4886    }
4887  }
4888
4889  // Legalize to a VMULL instruction.
4890  DebugLoc DL = Op.getDebugLoc();
4891  SDValue Op0;
4892  SDValue Op1 = SkipExtension(N1, DAG);
4893  if (!isMLA) {
4894    Op0 = SkipExtension(N0, DAG);
4895    assert(Op0.getValueType().is64BitVector() &&
4896           Op1.getValueType().is64BitVector() &&
4897           "unexpected types for extended operands to VMULL");
4898    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
4899  }
4900
4901  // Optimize (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
4902  // isel lowering to take advantage of no-stall back-to-back vmul + vmla.
4903  //   vmull q0, d4, d6
4904  //   vmlal q0, d5, d6
4905  // is faster than
4906  //   vaddl q0, d4, d5
4907  //   vmovl q1, d6
4908  //   vmul  q0, q0, q1
4909  SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG);
4910  SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG);
4911  EVT Op1VT = Op1.getValueType();
4912  return DAG.getNode(N0->getOpcode(), DL, VT,
4913                     DAG.getNode(NewOpc, DL, VT,
4914                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
4915                     DAG.getNode(NewOpc, DL, VT,
4916                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
4917}
4918
4919static SDValue
4920LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
4921  // Convert to float
4922  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
4923  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
4924  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
4925  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
4926  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
4927  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
4928  // Get reciprocal estimate.
4929  // float4 recip = vrecpeq_f32(yf);
4930  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
4931                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
4932  // Because char has a smaller range than uchar, we can actually get away
4933  // without any Newton steps.  This requires that we use a weird bias
4934  // of 0xb000, however (again, this has been exhaustively tested).
4935  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
4936  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
4937  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
4938  Y = DAG.getConstant(0xb000, MVT::i32);
4939  Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
4940  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
4941  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
4942  // Convert back to short.
4943  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
4944  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
4945  return X;
4946}
4947
4948static SDValue
4949LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
4950  SDValue N2;
4951  // Convert to float.
4952  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
4953  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
4954  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
4955  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
4956  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
4957  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
4958
4959  // Use reciprocal estimate and one refinement step.
4960  // float4 recip = vrecpeq_f32(yf);
4961  // recip *= vrecpsq_f32(yf, recip);
4962  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
4963                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
4964  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
4965                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
4966                   N1, N2);
4967  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
4968  // Because short has a smaller range than ushort, we can actually get away
4969  // with only a single Newton step.  This requires that we use a weird bias
4970  // of 0x89, however (again, this has been exhaustively tested).
4971  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
4972  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
4973  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
4974  N1 = DAG.getConstant(0x89, MVT::i32);
4975  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
4976  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
4977  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
4978  // Convert back to integer and return.
4979  // return vmovn_s32(vcvt_s32_f32(result));
4980  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
4981  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
4982  return N0;
4983}
4984
4985static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
4986  EVT VT = Op.getValueType();
4987  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
4988         "unexpected type for custom-lowering ISD::SDIV");
4989
4990  DebugLoc dl = Op.getDebugLoc();
4991  SDValue N0 = Op.getOperand(0);
4992  SDValue N1 = Op.getOperand(1);
4993  SDValue N2, N3;
4994
4995  if (VT == MVT::v8i8) {
4996    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
4997    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
4998
4999    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5000                     DAG.getIntPtrConstant(4));
5001    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5002                     DAG.getIntPtrConstant(4));
5003    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5004                     DAG.getIntPtrConstant(0));
5005    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5006                     DAG.getIntPtrConstant(0));
5007
5008    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
5009    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
5010
5011    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
5012    N0 = LowerCONCAT_VECTORS(N0, DAG);
5013
5014    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
5015    return N0;
5016  }
5017  return LowerSDIV_v4i16(N0, N1, dl, DAG);
5018}
5019
5020static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
5021  EVT VT = Op.getValueType();
5022  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
5023         "unexpected type for custom-lowering ISD::UDIV");
5024
5025  DebugLoc dl = Op.getDebugLoc();
5026  SDValue N0 = Op.getOperand(0);
5027  SDValue N1 = Op.getOperand(1);
5028  SDValue N2, N3;
5029
5030  if (VT == MVT::v8i8) {
5031    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
5032    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
5033
5034    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5035                     DAG.getIntPtrConstant(4));
5036    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5037                     DAG.getIntPtrConstant(4));
5038    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
5039                     DAG.getIntPtrConstant(0));
5040    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
5041                     DAG.getIntPtrConstant(0));
5042
5043    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
5044    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
5045
5046    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
5047    N0 = LowerCONCAT_VECTORS(N0, DAG);
5048
5049    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
5050                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
5051                     N0);
5052    return N0;
5053  }
5054
5055  // v4i16 udiv ... Convert to float.
5056  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
5057  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
5058  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
5059  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
5060  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
5061  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
5062
5063  // Use reciprocal estimate and two refinement steps.
5064  // float4 recip = vrecpeq_f32(yf);
5065  // recip *= vrecpsq_f32(yf, recip);
5066  // recip *= vrecpsq_f32(yf, recip);
5067  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5068                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1);
5069  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5070                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
5071                   BN1, N2);
5072  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
5073  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
5074                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
5075                   BN1, N2);
5076  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
5077  // Simply multiplying by the reciprocal estimate can leave us a few ulps
5078  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
5079  // and that it will never cause us to return an answer too large).
5080  // float4 result = as_float4(as_int4(xf*recip) + 2);
5081  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
5082  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
5083  N1 = DAG.getConstant(2, MVT::i32);
5084  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
5085  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
5086  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
5087  // Convert back to integer and return.
5088  // return vmovn_u32(vcvt_s32_f32(result));
5089  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
5090  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
5091  return N0;
5092}
5093
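// ADDC/ADDE and SUBC/SUBE are what i64 add/sub are legalized into on a 32-bit
// target: the "C" node produces the carry/borrow as a second i32 result and
// the "E" node consumes it as an extra operand, which maps onto ARM's
// adds/adc and subs/sbc.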
5094static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
5095  EVT VT = Op.getNode()->getValueType(0);
5096  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
5097
5098  unsigned Opc;
5099  bool ExtraOp = false;
5100  switch (Op.getOpcode()) {
5101  default: llvm_unreachable("Invalid code");
5102  case ISD::ADDC: Opc = ARMISD::ADDC; break;
5103  case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
5104  case ISD::SUBC: Opc = ARMISD::SUBC; break;
5105  case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
5106  }
5107
5108  if (!ExtraOp)
5109    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
5110                       Op.getOperand(1));
5111  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
5112                     Op.getOperand(1), Op.getOperand(2));
5113}
5114
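// Atomic loads and stores with monotonic (or weaker) ordering need no memory
// barrier, so they are left as-is and selected like ordinary loads and stores.
// For acquire/release/seq_cst orderings we return SDValue() to signal that the
// operation was not lowered here, and the legalizer expands it instead.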
5115static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
5116  // Monotonic load/store is legal for all targets
5117  if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
5118    return Op;
5119
5120  // Acquire/Release load/store is not legal for targets without a
5121  // dmb or equivalent available.
5122  return SDValue();
5123}
5124
5125
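// i64 is not a legal type on ARM, so a 64-bit atomic operation is rebuilt here
// as a target-specific memory intrinsic (ARMISD::ATOM*64_DAG) whose value
// operands are split into 32-bit halves.  The intrinsic produces two i32
// results plus a chain, and the halves are recombined with BUILD_PAIR.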
5126static void
5127ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
5128                    SelectionDAG &DAG, unsigned NewOp) {
5129  DebugLoc dl = Node->getDebugLoc();
5130  assert (Node->getValueType(0) == MVT::i64 &&
5131          "Only know how to expand i64 atomics");
5132
5133  SmallVector<SDValue, 6> Ops;
5134  Ops.push_back(Node->getOperand(0)); // Chain
5135  Ops.push_back(Node->getOperand(1)); // Ptr
5136  // Low part of Val1
5137  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5138                            Node->getOperand(2), DAG.getIntPtrConstant(0)));
5139  // High part of Val1
5140  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5141                            Node->getOperand(2), DAG.getIntPtrConstant(1)));
5142  if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) {
5143    // Low part of Val2
5144    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5145                              Node->getOperand(3), DAG.getIntPtrConstant(0)));
5146    // High part of Val2
5147    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5148                              Node->getOperand(3), DAG.getIntPtrConstant(1)));
5149  }
5150  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
5151  SDValue Result =
5152    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64,
5153                            cast<MemSDNode>(Node)->getMemOperand());
5154  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
5155  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
5156  Results.push_back(Result.getValue(2));
5157}
5158
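// Dispatch every operation that was marked Custom to its lowering helper.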
5159SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5160  switch (Op.getOpcode()) {
5161  default: llvm_unreachable("Don't know how to custom lower this!");
5162  case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
5163  case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
5164  case ISD::GlobalAddress:
5165    return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
5166      LowerGlobalAddressELF(Op, DAG);
5167  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
5168  case ISD::SELECT:        return LowerSELECT(Op, DAG);
5169  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
5170  case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
5171  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
5172  case ISD::VASTART:       return LowerVASTART(Op, DAG);
5173  case ISD::MEMBARRIER:    return LowerMEMBARRIER(Op, DAG, Subtarget);
5174  case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
5175  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
5176  case ISD::SINT_TO_FP:
5177  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
5178  case ISD::FP_TO_SINT:
5179  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
5180  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
5181  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
5182  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
5183  case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
5184  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
5185  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
5186  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
5187                                                               Subtarget);
5188  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
5189  case ISD::SHL:
5190  case ISD::SRL:
5191  case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
5192  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
5193  case ISD::SRL_PARTS:
5194  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
5195  case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
5196  case ISD::SETCC:         return LowerVSETCC(Op, DAG);
5197  case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
5198  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
5199  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
5200  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
5201  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
5202  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
5203  case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
5204  case ISD::MUL:           return LowerMUL(Op, DAG);
5205  case ISD::SDIV:          return LowerSDIV(Op, DAG);
5206  case ISD::UDIV:          return LowerUDIV(Op, DAG);
5207  case ISD::ADDC:
5208  case ISD::ADDE:
5209  case ISD::SUBC:
5210  case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
5211  case ISD::ATOMIC_LOAD:
5212  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
5213  }
5214}
5215
5216/// ReplaceNodeResults - Replace the results of node with an illegal result
5217/// type with new values built out of custom code.
5218void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
5219                                           SmallVectorImpl<SDValue>&Results,
5220                                           SelectionDAG &DAG) const {
5221  SDValue Res;
5222  switch (N->getOpcode()) {
5223  default:
5224    llvm_unreachable("Don't know how to custom expand this!");
5225  case ISD::BITCAST:
5226    Res = ExpandBITCAST(N, DAG);
5227    break;
5228  case ISD::SRL:
5229  case ISD::SRA:
5230    Res = Expand64BitShift(N, DAG, Subtarget);
5231    break;
5232  case ISD::ATOMIC_LOAD_ADD:
5233    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
5234    return;
5235  case ISD::ATOMIC_LOAD_AND:
5236    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
5237    return;
5238  case ISD::ATOMIC_LOAD_NAND:
5239    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
5240    return;
5241  case ISD::ATOMIC_LOAD_OR:
5242    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
5243    return;
5244  case ISD::ATOMIC_LOAD_SUB:
5245    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
5246    return;
5247  case ISD::ATOMIC_LOAD_XOR:
5248    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
5249    return;
5250  case ISD::ATOMIC_SWAP:
5251    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
5252    return;
5253  case ISD::ATOMIC_CMP_SWAP:
5254    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
5255    return;
5256  }
5257  if (Res.getNode())
5258    Results.push_back(Res);
5259}
5260
5261//===----------------------------------------------------------------------===//
5262//                           ARM Scheduler Hooks
5263//===----------------------------------------------------------------------===//
5264
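// Expand an atomic compare-and-swap pseudo into an explicit ldrex/strex loop:
// loop1 reloads the current value and exits if it is not the expected one;
// loop2 attempts the store-exclusive and branches back to loop1 when strex
// reports failure (a non-zero status in the scratch register).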
5265MachineBasicBlock *
5266ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
5267                                     MachineBasicBlock *BB,
5268                                     unsigned Size) const {
5269  unsigned dest    = MI->getOperand(0).getReg();
5270  unsigned ptr     = MI->getOperand(1).getReg();
5271  unsigned oldval  = MI->getOperand(2).getReg();
5272  unsigned newval  = MI->getOperand(3).getReg();
5273  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
5274  DebugLoc dl = MI->getDebugLoc();
5275  bool isThumb2 = Subtarget->isThumb2();
5276
5277  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5278  unsigned scratch = MRI.createVirtualRegister(isThumb2 ?
5279    (const TargetRegisterClass*)&ARM::rGPRRegClass :
5280    (const TargetRegisterClass*)&ARM::GPRRegClass);
5281
5282  if (isThumb2) {
5283    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
5284    MRI.constrainRegClass(oldval, &ARM::rGPRRegClass);
5285    MRI.constrainRegClass(newval, &ARM::rGPRRegClass);
5286  }
5287
5288  unsigned ldrOpc, strOpc;
5289  switch (Size) {
5290  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
5291  case 1:
5292    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
5293    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
5294    break;
5295  case 2:
5296    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
5297    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
5298    break;
5299  case 4:
5300    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
5301    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
5302    break;
5303  }
5304
5305  MachineFunction *MF = BB->getParent();
5306  const BasicBlock *LLVM_BB = BB->getBasicBlock();
5307  MachineFunction::iterator It = BB;
5308  ++It; // insert the new blocks after the current block
5309
5310  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
5311  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
5312  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
5313  MF->insert(It, loop1MBB);
5314  MF->insert(It, loop2MBB);
5315  MF->insert(It, exitMBB);
5316
5317  // Transfer the remainder of BB and its successor edges to exitMBB.
5318  exitMBB->splice(exitMBB->begin(), BB,
5319                  llvm::next(MachineBasicBlock::iterator(MI)),
5320                  BB->end());
5321  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
5322
5323  //  thisMBB:
5324  //   ...
5325  //   fallthrough --> loop1MBB
5326  BB->addSuccessor(loop1MBB);
5327
5328  // loop1MBB:
5329  //   ldrex dest, [ptr]
5330  //   cmp dest, oldval
5331  //   bne exitMBB
5332  BB = loop1MBB;
5333  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
5334  if (ldrOpc == ARM::t2LDREX)
5335    MIB.addImm(0);
5336  AddDefaultPred(MIB);
5337  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
5338                 .addReg(dest).addReg(oldval));
5339  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
5340    .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
5341  BB->addSuccessor(loop2MBB);
5342  BB->addSuccessor(exitMBB);
5343
5344  // loop2MBB:
5345  //   strex scratch, newval, [ptr]
5346  //   cmp scratch, #0
5347  //   bne loop1MBB
5348  BB = loop2MBB;
5349  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
5350  if (strOpc == ARM::t2STREX)
5351    MIB.addImm(0);
5352  AddDefaultPred(MIB);
5353  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
5354                 .addReg(scratch).addImm(0));
5355  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
5356    .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
5357  BB->addSuccessor(loop1MBB);
5358  BB->addSuccessor(exitMBB);
5359
5360  //  exitMBB:
5361  //   ...
5362  BB = exitMBB;
5363
5364  MI->eraseFromParent();   // The instruction is gone now.
5365
5366  return BB;
5367}
5368
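// Expand an atomic read-modify-write pseudo into a ldrex / <binop> / strex
// retry loop.  A BinOpcode of zero means plain ATOMIC_SWAP, in which case the
// incoming value is stored back directly without applying any operation.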
5369MachineBasicBlock *
5370ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
5371                                    unsigned Size, unsigned BinOpcode) const {
5372  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
5373  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
5374
5375  const BasicBlock *LLVM_BB = BB->getBasicBlock();
5376  MachineFunction *MF = BB->getParent();
5377  MachineFunction::iterator It = BB;
5378  ++It;
5379
5380  unsigned dest = MI->getOperand(0).getReg();
5381  unsigned ptr = MI->getOperand(1).getReg();
5382  unsigned incr = MI->getOperand(2).getReg();
5383  DebugLoc dl = MI->getDebugLoc();
5384  bool isThumb2 = Subtarget->isThumb2();
5385
5386  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5387  if (isThumb2) {
5388    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
5389    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
5390  }
5391
5392  unsigned ldrOpc, strOpc;
5393  switch (Size) {
5394  default: llvm_unreachable("unsupported size for AtomicBinary!");
5395  case 1:
5396    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
5397    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
5398    break;
5399  case 2:
5400    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
5401    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
5402    break;
5403  case 4:
5404    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
5405    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
5406    break;
5407  }
5408
5409  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
5410  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
5411  MF->insert(It, loopMBB);
5412  MF->insert(It, exitMBB);
5413
5414  // Transfer the remainder of BB and its successor edges to exitMBB.
5415  exitMBB->splice(exitMBB->begin(), BB,
5416                  llvm::next(MachineBasicBlock::iterator(MI)),
5417                  BB->end());
5418  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
5419
5420  const TargetRegisterClass *TRC = isThumb2 ?
5421    (const TargetRegisterClass*)&ARM::rGPRRegClass :
5422    (const TargetRegisterClass*)&ARM::GPRRegClass;
5423  unsigned scratch = MRI.createVirtualRegister(TRC);
5424  unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
5425
5426  //  thisMBB:
5427  //   ...
5428  //   fallthrough --> loopMBB
5429  BB->addSuccessor(loopMBB);
5430
5431  //  loopMBB:
5432  //   ldrex dest, ptr
5433  //   <binop> scratch2, dest, incr
5434  //   strex scratch, scratch2, ptr
5435  //   cmp scratch, #0
5436  //   bne- loopMBB
5437  //   fallthrough --> exitMBB
5438  BB = loopMBB;
5439  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
5440  if (ldrOpc == ARM::t2LDREX)
5441    MIB.addImm(0);
5442  AddDefaultPred(MIB);
5443  if (BinOpcode) {
5444    // operand order needs to go the other way for NAND
5445    if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
5446      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
5447                     addReg(incr).addReg(dest)).addReg(0);
5448    else
5449      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
5450                     addReg(dest).addReg(incr)).addReg(0);
5451  }
5452
5453  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
5454  if (strOpc == ARM::t2STREX)
5455    MIB.addImm(0);
5456  AddDefaultPred(MIB);
5457  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
5458                 .addReg(scratch).addImm(0));
5459  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
5460    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
5461
5462  BB->addSuccessor(loopMBB);
5463  BB->addSuccessor(exitMBB);
5464
5465  //  exitMBB:
5466  //   ...
5467  BB = exitMBB;
5468
5469  MI->eraseFromParent();   // The instruction is gone now.
5470
5471  return BB;
5472}
5473
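// Expand atomic min/max (signed and unsigned): inside the ldrex/strex loop the
// loaded value is compared with the operand and a predicated MOVCC selects the
// value to store back.  When signExtend is set, sub-word values are
// sign-extended first so the comparison operates on the full-width value.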
5474MachineBasicBlock *
5475ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
5476                                          MachineBasicBlock *BB,
5477                                          unsigned Size,
5478                                          bool signExtend,
5479                                          ARMCC::CondCodes Cond) const {
5480  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
5481
5482  const BasicBlock *LLVM_BB = BB->getBasicBlock();
5483  MachineFunction *MF = BB->getParent();
5484  MachineFunction::iterator It = BB;
5485  ++It;
5486
5487  unsigned dest = MI->getOperand(0).getReg();
5488  unsigned ptr = MI->getOperand(1).getReg();
5489  unsigned incr = MI->getOperand(2).getReg();
5490  unsigned oldval = dest;
5491  DebugLoc dl = MI->getDebugLoc();
5492  bool isThumb2 = Subtarget->isThumb2();
5493
5494  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5495  if (isThumb2) {
5496    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
5497    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
5498  }
5499
5500  unsigned ldrOpc, strOpc, extendOpc;
5501  switch (Size) {
5502  default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!");
5503  case 1:
5504    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
5505    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
5506    extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
5507    break;
5508  case 2:
5509    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
5510    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
5511    extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
5512    break;
5513  case 4:
5514    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
5515    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
5516    extendOpc = 0;
5517    break;
5518  }
5519
5520  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
5521  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
5522  MF->insert(It, loopMBB);
5523  MF->insert(It, exitMBB);
5524
5525  // Transfer the remainder of BB and its successor edges to exitMBB.
5526  exitMBB->splice(exitMBB->begin(), BB,
5527                  llvm::next(MachineBasicBlock::iterator(MI)),
5528                  BB->end());
5529  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
5530
5531  const TargetRegisterClass *TRC = isThumb2 ?
5532    (const TargetRegisterClass*)&ARM::rGPRRegClass :
5533    (const TargetRegisterClass*)&ARM::GPRRegClass;
5534  unsigned scratch = MRI.createVirtualRegister(TRC);
5535  unsigned scratch2 = MRI.createVirtualRegister(TRC);
5536
5537  //  thisMBB:
5538  //   ...
5539  //   fallthrough --> loopMBB
5540  BB->addSuccessor(loopMBB);
5541
5542  //  loopMBB:
5543  //   ldrex dest, ptr
5544  //   (sign extend dest, if required)
5545  //   cmp dest, incr
5546  //   cmov.cond scratch2, dest, incr
5547  //   strex scratch, scratch2, ptr
5548  //   cmp scratch, #0
5549  //   bne- loopMBB
5550  //   fallthrough --> exitMBB
5551  BB = loopMBB;
5552  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
5553  if (ldrOpc == ARM::t2LDREX)
5554    MIB.addImm(0);
5555  AddDefaultPred(MIB);
5556
5557  // Sign extend the value, if necessary.
5558  if (signExtend && extendOpc) {
5559    oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
5560    AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
5561                     .addReg(dest)
5562                     .addImm(0));
5563  }
5564
5565  // Build compare and cmov instructions.
5566  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
5567                 .addReg(oldval).addReg(incr));
5568  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
5569         .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR);
5570
5571  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
5572  if (strOpc == ARM::t2STREX)
5573    MIB.addImm(0);
5574  AddDefaultPred(MIB);
5575  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
5576                 .addReg(scratch).addImm(0));
5577  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
5578    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
5579
5580  BB->addSuccessor(loopMBB);
5581  BB->addSuccessor(exitMBB);
5582
5583  //  exitMBB:
5584  //   ...
5585  BB = exitMBB;
5586
5587  MI->eraseFromParent();   // The instruction is gone now.
5588
5589  return BB;
5590}
5591
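// Expand 64-bit atomic operations around ldrexd/strexd.  The loaded and stored
// halves are pinned to the r2:r3 and r0:r1 register pairs (see the note in the
// loop body below); the cmpxchg form adds two extra blocks that compare the
// low and high halves and exit early on a mismatch.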
5592MachineBasicBlock *
5593ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
5594                                      unsigned Op1, unsigned Op2,
5595                                      bool NeedsCarry, bool IsCmpxchg) const {
5596  // This also handles ATOMIC_SWAP, indicated by Op1==0.
5597  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
5598
5599  const BasicBlock *LLVM_BB = BB->getBasicBlock();
5600  MachineFunction *MF = BB->getParent();
5601  MachineFunction::iterator It = BB;
5602  ++It;
5603
5604  unsigned destlo = MI->getOperand(0).getReg();
5605  unsigned desthi = MI->getOperand(1).getReg();
5606  unsigned ptr = MI->getOperand(2).getReg();
5607  unsigned vallo = MI->getOperand(3).getReg();
5608  unsigned valhi = MI->getOperand(4).getReg();
5609  DebugLoc dl = MI->getDebugLoc();
5610  bool isThumb2 = Subtarget->isThumb2();
5611
5612  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5613  if (isThumb2) {
5614    MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
5615    MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
5616    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
5617  }
5618
5619  unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD;
5620  unsigned strOpc = isThumb2 ? ARM::t2STREXD : ARM::STREXD;
5621
5622  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
5623  MachineBasicBlock *contBB = 0, *cont2BB = 0;
5624  if (IsCmpxchg) {
5625    contBB = MF->CreateMachineBasicBlock(LLVM_BB);
5626    cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
5627  }
5628  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
5629  MF->insert(It, loopMBB);
5630  if (IsCmpxchg) {
5631    MF->insert(It, contBB);
5632    MF->insert(It, cont2BB);
5633  }
5634  MF->insert(It, exitMBB);
5635
5636  // Transfer the remainder of BB and its successor edges to exitMBB.
5637  exitMBB->splice(exitMBB->begin(), BB,
5638                  llvm::next(MachineBasicBlock::iterator(MI)),
5639                  BB->end());
5640  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
5641
5642  const TargetRegisterClass *TRC = isThumb2 ?
5643    (const TargetRegisterClass*)&ARM::tGPRRegClass :
5644    (const TargetRegisterClass*)&ARM::GPRRegClass;
5645  unsigned storesuccess = MRI.createVirtualRegister(TRC);
5646
5647  //  thisMBB:
5648  //   ...
5649  //   fallthrough --> loopMBB
5650  BB->addSuccessor(loopMBB);
5651
5652  //  loopMBB:
5653  //   ldrexd r2, r3, ptr
5654  //   <binopa> r0, r2, incr
5655  //   <binopb> r1, r3, incr
5656  //   strexd storesuccess, r0, r1, ptr
5657  //   cmp storesuccess, #0
5658  //   bne- loopMBB
5659  //   fallthrough --> exitMBB
5660  //
5661  // Note that the registers are explicitly specified because there is no way
5662  // to force the register allocator to allocate a register pair.
5663  //
5664  // FIXME: The hardcoded registers are not necessary for Thumb2, but we
5665  // need to properly enforce the restriction that the two output registers
5666  // for ldrexd must be different.
5667  BB = loopMBB;
5668  // Load
5669  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
5670                 .addReg(ARM::R2, RegState::Define)
5671                 .addReg(ARM::R3, RegState::Define).addReg(ptr));
5672  // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
5673  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo).addReg(ARM::R2);
5674  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi).addReg(ARM::R3);
5675
5676  if (IsCmpxchg) {
5677    // Add early exit
5678    for (unsigned i = 0; i < 2; i++) {
5679      AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr :
5680                                                         ARM::CMPrr))
5681                     .addReg(i == 0 ? destlo : desthi)
5682                     .addReg(i == 0 ? vallo : valhi));
5683      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
5684        .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
5685      BB->addSuccessor(exitMBB);
5686      BB->addSuccessor(i == 0 ? contBB : cont2BB);
5687      BB = (i == 0 ? contBB : cont2BB);
5688    }
5689
5690    // Copy to physregs for strexd
5691    unsigned setlo = MI->getOperand(5).getReg();
5692    unsigned sethi = MI->getOperand(6).getReg();
5693    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(setlo);
5694    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(sethi);
5695  } else if (Op1) {
5696    // Perform binary operation
5697    AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), ARM::R0)
5698                   .addReg(destlo).addReg(vallo))
5699        .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
5700    AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), ARM::R1)
5701                   .addReg(desthi).addReg(valhi)).addReg(0);
5702  } else {
5703    // Copy to physregs for strexd
5704    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(vallo);
5705    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(valhi);
5706  }
5707
5708  // Store
5709  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
5710                 .addReg(ARM::R0).addReg(ARM::R1).addReg(ptr));
5711  // Cmp+jump
5712  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
5713                 .addReg(storesuccess).addImm(0));
5714  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
5715    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
5716
5717  BB->addSuccessor(loopMBB);
5718  BB->addSuccessor(exitMBB);
5719
5720  //  exitMBB:
5721  //   ...
5722  BB = exitMBB;
5723
5724  MI->eraseFromParent();   // The instruction is gone now.
5725
5726  return BB;
5727}
5728
5729/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
5730/// registers the function context.
5731void ARMTargetLowering::
5732SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
5733                       MachineBasicBlock *DispatchBB, int FI) const {
5734  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
5735  DebugLoc dl = MI->getDebugLoc();
5736  MachineFunction *MF = MBB->getParent();
5737  MachineRegisterInfo *MRI = &MF->getRegInfo();
5738  MachineConstantPool *MCP = MF->getConstantPool();
5739  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
5740  const Function *F = MF->getFunction();
5741
5742  bool isThumb = Subtarget->isThumb();
5743  bool isThumb2 = Subtarget->isThumb2();
5744
5745  unsigned PCLabelId = AFI->createPICLabelUId();
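  // The PC is read 8 bytes ahead of the current instruction in ARM mode and 4
  // bytes ahead in Thumb mode, so the PC-relative fixup is adjusted accordingly.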
5746  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
5747  ARMConstantPoolValue *CPV =
5748    ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
5749  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
5750
5751  const TargetRegisterClass *TRC = isThumb ?
5752    (const TargetRegisterClass*)&ARM::tGPRRegClass :
5753    (const TargetRegisterClass*)&ARM::GPRRegClass;
5754
5755  // Grab constant pool and fixed stack memory operands.
5756  MachineMemOperand *CPMMO =
5757    MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(),
5758                             MachineMemOperand::MOLoad, 4, 4);
5759
5760  MachineMemOperand *FIMMOSt =
5761    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
5762                             MachineMemOperand::MOStore, 4, 4);
5763
5764  // Load the address of the dispatch MBB into the jump buffer.
5765  if (isThumb2) {
5766    // Incoming value: jbuf
5767    //   ldr.n  r5, LCPI1_1
5768    //   orr    r5, r5, #1
5769    //   add    r5, pc
5770    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
5771    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
5772    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
5773                   .addConstantPoolIndex(CPI)
5774                   .addMemOperand(CPMMO));
5775    // Set the low bit because of thumb mode.
5776    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
5777    AddDefaultCC(
5778      AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
5779                     .addReg(NewVReg1, RegState::Kill)
5780                     .addImm(0x01)));
5781    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
5782    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
5783      .addReg(NewVReg2, RegState::Kill)
5784      .addImm(PCLabelId);
5785    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
5786                   .addReg(NewVReg3, RegState::Kill)
5787                   .addFrameIndex(FI)
5788                   .addImm(36)  // &jbuf[1] :: pc
5789                   .addMemOperand(FIMMOSt));
5790  } else if (isThumb) {
5791    // Incoming value: jbuf
5792    //   ldr.n  r1, LCPI1_4
5793    //   add    r1, pc
5794    //   mov    r2, #1
5795    //   orrs   r1, r2
5796    //   add    r2, $jbuf, #+4 ; &jbuf[1]
5797    //   str    r1, [r2]
5798    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
5799    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
5800                   .addConstantPoolIndex(CPI)
5801                   .addMemOperand(CPMMO));
5802    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
5803    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
5804      .addReg(NewVReg1, RegState::Kill)
5805      .addImm(PCLabelId);
5806    // Set the low bit because of thumb mode.
5807    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
5808    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
5809                   .addReg(ARM::CPSR, RegState::Define)
5810                   .addImm(1));
5811    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
5812    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
5813                   .addReg(ARM::CPSR, RegState::Define)
5814                   .addReg(NewVReg2, RegState::Kill)
5815                   .addReg(NewVReg3, RegState::Kill));
5816    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
5817    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
5818                   .addFrameIndex(FI)
5819                   .addImm(36)); // &jbuf[1] :: pc
5820    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
5821                   .addReg(NewVReg4, RegState::Kill)
5822                   .addReg(NewVReg5, RegState::Kill)
5823                   .addImm(0)
5824                   .addMemOperand(FIMMOSt));
5825  } else {
5826    // Incoming value: jbuf
5827    //   ldr  r1, LCPI1_1
5828    //   add  r1, pc, r1
5829    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
5830    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
5831    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
5832                   .addConstantPoolIndex(CPI)
5833                   .addImm(0)
5834                   .addMemOperand(CPMMO));
5835    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
5836    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
5837                   .addReg(NewVReg1, RegState::Kill)
5838                   .addImm(PCLabelId));
5839    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
5840                   .addReg(NewVReg2, RegState::Kill)
5841                   .addFrameIndex(FI)
5842                   .addImm(36)  // &jbuf[1] :: pc
5843                   .addMemOperand(FIMMOSt));
5844  }
5845}
5846
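// Build the SjLj exception dispatch block.  Landing pads are gathered per call
// site, an inline jump table over them is created, and the dispatch block
// loads the call-site index from the function context, range-checks it against
// the table (trapping if it is out of bounds), and branches through the table.
// Each invoke block is then rerouted to the dispatch block, which becomes the
// function's only landing pad.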
5847MachineBasicBlock *ARMTargetLowering::
5848EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
5849  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
5850  DebugLoc dl = MI->getDebugLoc();
5851  MachineFunction *MF = MBB->getParent();
5852  MachineRegisterInfo *MRI = &MF->getRegInfo();
5853  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
5854  MachineFrameInfo *MFI = MF->getFrameInfo();
5855  int FI = MFI->getFunctionContextIndex();
5856
5857  const TargetRegisterClass *TRC = Subtarget->isThumb() ?
5858    (const TargetRegisterClass*)&ARM::tGPRRegClass :
5859    (const TargetRegisterClass*)&ARM::GPRnopcRegClass;
5860
5861  // Get a mapping of the call site numbers to all of the landing pads they're
5862  // associated with.
5863  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
5864  unsigned MaxCSNum = 0;
5865  MachineModuleInfo &MMI = MF->getMMI();
5866  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
5867       ++BB) {
5868    if (!BB->isLandingPad()) continue;
5869
5870    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
5871    // pad.
5872    for (MachineBasicBlock::iterator
5873           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
5874      if (!II->isEHLabel()) continue;
5875
5876      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
5877      if (!MMI.hasCallSiteLandingPad(Sym)) continue;
5878
5879      SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
5880      for (SmallVectorImpl<unsigned>::iterator
5881             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
5882           CSI != CSE; ++CSI) {
5883        CallSiteNumToLPad[*CSI].push_back(BB);
5884        MaxCSNum = std::max(MaxCSNum, *CSI);
5885      }
5886      break;
5887    }
5888  }
5889
5890  // Get an ordered list of the machine basic blocks for the jump table.
5891  std::vector<MachineBasicBlock*> LPadList;
5892  SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
5893  LPadList.reserve(CallSiteNumToLPad.size());
5894  for (unsigned I = 1; I <= MaxCSNum; ++I) {
5895    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
5896    for (SmallVectorImpl<MachineBasicBlock*>::iterator
5897           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
5898      LPadList.push_back(*II);
5899      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
5900    }
5901  }
5902
5903  assert(!LPadList.empty() &&
5904         "No landing pad destinations for the dispatch jump table!");
5905
5906  // Create the jump table and associated information.
5907  MachineJumpTableInfo *JTI =
5908    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
5909  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
5910  unsigned UId = AFI->createJumpTableUId();
5911
5912  // Create the MBBs for the dispatch code.
5913
5914  // Shove the dispatch's address into the return slot in the function context.
5915  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
5916  DispatchBB->setIsLandingPad();
5917
5918  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5919  BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
5920  DispatchBB->addSuccessor(TrapBB);
5921
5922  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
5923  DispatchBB->addSuccessor(DispContBB);
5924
5925  // Insert the MBBs.
5926  MF->insert(MF->end(), DispatchBB);
5927  MF->insert(MF->end(), DispContBB);
5928  MF->insert(MF->end(), TrapBB);
5929
5930  // Insert code into the entry block that creates and registers the function
5931  // context.
5932  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
5933
5934  MachineMemOperand *FIMMOLd =
5935    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
5936                             MachineMemOperand::MOLoad |
5937                             MachineMemOperand::MOVolatile, 4, 4);
5938
5939  if (AFI->isThumb1OnlyFunction())
5940    BuildMI(DispatchBB, dl, TII->get(ARM::tInt_eh_sjlj_dispatchsetup));
5941  else if (!Subtarget->hasVFP2())
5942    BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup_nofp));
5943  else
5944    BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
5945
5946  unsigned NumLPads = LPadList.size();
5947  if (Subtarget->isThumb2()) {
5948    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
5949    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
5950                   .addFrameIndex(FI)
5951                   .addImm(4)
5952                   .addMemOperand(FIMMOLd));
5953
5954    if (NumLPads < 256) {
5955      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
5956                     .addReg(NewVReg1)
5957                     .addImm(LPadList.size()));
5958    } else {
5959      unsigned VReg1 = MRI->createVirtualRegister(TRC);
5960      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
5961                     .addImm(NumLPads & 0xFFFF));
5962
5963      unsigned VReg2 = VReg1;
5964      if ((NumLPads & 0xFFFF0000) != 0) {
5965        VReg2 = MRI->createVirtualRegister(TRC);
5966        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
5967                       .addReg(VReg1)
5968                       .addImm(NumLPads >> 16));
5969      }
5970
5971      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
5972                     .addReg(NewVReg1)
5973                     .addReg(VReg2));
5974    }
5975
5976    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
5977      .addMBB(TrapBB)
5978      .addImm(ARMCC::HI)
5979      .addReg(ARM::CPSR);
5980
5981    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
5982    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
5983                   .addJumpTableIndex(MJTI)
5984                   .addImm(UId));
5985
5986    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
5987    AddDefaultCC(
5988      AddDefaultPred(
5989        BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
5990        .addReg(NewVReg3, RegState::Kill)
5991        .addReg(NewVReg1)
5992        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
5993
5994    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
5995      .addReg(NewVReg4, RegState::Kill)
5996      .addReg(NewVReg1)
5997      .addJumpTableIndex(MJTI)
5998      .addImm(UId);
5999  } else if (Subtarget->isThumb()) {
6000    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6001    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
6002                   .addFrameIndex(FI)
6003                   .addImm(1)
6004                   .addMemOperand(FIMMOLd));
6005
6006    if (NumLPads < 256) {
6007      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
6008                     .addReg(NewVReg1)
6009                     .addImm(NumLPads));
6010    } else {
6011      MachineConstantPool *ConstantPool = MF->getConstantPool();
6012      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
6013      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
6014
6015      // MachineConstantPool wants an explicit alignment.
6016      unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
6017      if (Align == 0)
6018        Align = getTargetData()->getTypeAllocSize(C->getType());
6019      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
6020
6021      unsigned VReg1 = MRI->createVirtualRegister(TRC);
6022      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
6023                     .addReg(VReg1, RegState::Define)
6024                     .addConstantPoolIndex(Idx));
6025      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
6026                     .addReg(NewVReg1)
6027                     .addReg(VReg1));
6028    }
6029
6030    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
6031      .addMBB(TrapBB)
6032      .addImm(ARMCC::HI)
6033      .addReg(ARM::CPSR);
6034
6035    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
6036    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
6037                   .addReg(ARM::CPSR, RegState::Define)
6038                   .addReg(NewVReg1)
6039                   .addImm(2));
6040
6041    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6042    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
6043                   .addJumpTableIndex(MJTI)
6044                   .addImm(UId));
6045
6046    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6047    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
6048                   .addReg(ARM::CPSR, RegState::Define)
6049                   .addReg(NewVReg2, RegState::Kill)
6050                   .addReg(NewVReg3));
6051
6052    MachineMemOperand *JTMMOLd =
6053      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
6054                               MachineMemOperand::MOLoad, 4, 4);
6055
6056    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
6057    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
6058                   .addReg(NewVReg4, RegState::Kill)
6059                   .addImm(0)
6060                   .addMemOperand(JTMMOLd));
6061
6062    unsigned NewVReg6 = MRI->createVirtualRegister(TRC);
6063    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
6064                   .addReg(ARM::CPSR, RegState::Define)
6065                   .addReg(NewVReg5, RegState::Kill)
6066                   .addReg(NewVReg3));
6067
6068    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
6069      .addReg(NewVReg6, RegState::Kill)
6070      .addJumpTableIndex(MJTI)
6071      .addImm(UId);
6072  } else {
6073    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
6074    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
6075                   .addFrameIndex(FI)
6076                   .addImm(4)
6077                   .addMemOperand(FIMMOLd));
6078
6079    if (NumLPads < 256) {
6080      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
6081                     .addReg(NewVReg1)
6082                     .addImm(NumLPads));
6083    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
6084      unsigned VReg1 = MRI->createVirtualRegister(TRC);
6085      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
6086                     .addImm(NumLPads & 0xFFFF));
6087
6088      unsigned VReg2 = VReg1;
6089      if ((NumLPads & 0xFFFF0000) != 0) {
6090        VReg2 = MRI->createVirtualRegister(TRC);
6091        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
6092                       .addReg(VReg1)
6093                       .addImm(NumLPads >> 16));
6094      }
6095
6096      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
6097                     .addReg(NewVReg1)
6098                     .addReg(VReg2));
6099    } else {
6100      MachineConstantPool *ConstantPool = MF->getConstantPool();
6101      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
6102      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
6103
6104      // MachineConstantPool wants an explicit alignment.
6105      unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
6106      if (Align == 0)
6107        Align = getTargetData()->getTypeAllocSize(C->getType());
6108      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
6109
6110      unsigned VReg1 = MRI->createVirtualRegister(TRC);
6111      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
6112                     .addReg(VReg1, RegState::Define)
6113                     .addConstantPoolIndex(Idx)
6114                     .addImm(0));
6115      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
6116                     .addReg(NewVReg1)
6117                     .addReg(VReg1, RegState::Kill));
6118    }
6119
6120    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
6121      .addMBB(TrapBB)
6122      .addImm(ARMCC::HI)
6123      .addReg(ARM::CPSR);
6124
6125    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
6126    AddDefaultCC(
6127      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
6128                     .addReg(NewVReg1)
6129                     .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
6130    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
6131    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
6132                   .addJumpTableIndex(MJTI)
6133                   .addImm(UId));
6134
6135    MachineMemOperand *JTMMOLd =
6136      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
6137                               MachineMemOperand::MOLoad, 4, 4);
6138    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
6139    AddDefaultPred(
6140      BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
6141      .addReg(NewVReg3, RegState::Kill)
6142      .addReg(NewVReg4)
6143      .addImm(0)
6144      .addMemOperand(JTMMOLd));
6145
6146    BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
6147      .addReg(NewVReg5, RegState::Kill)
6148      .addReg(NewVReg4)
6149      .addJumpTableIndex(MJTI)
6150      .addImm(UId);
6151  }
6152
6153  // Add the jump table entries as successors to the MBB.
6154  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
6155  for (std::vector<MachineBasicBlock*>::iterator
6156         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
6157    MachineBasicBlock *CurMBB = *I;
6158    if (SeenMBBs.insert(CurMBB))
6159      DispContBB->addSuccessor(CurMBB);
6160  }
6161
6162  // N.B. the order the invoke BBs are processed in doesn't matter here.
6163  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
6164  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
6165  const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF);
6166  SmallVector<MachineBasicBlock*, 64> MBBLPads;
6167  for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
6168         I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) {
6169    MachineBasicBlock *BB = *I;
6170
6171    // Remove the landing pad successor from the invoke block and replace it
6172    // with the new dispatch block.
6173    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
6174                                                  BB->succ_end());
6175    while (!Successors.empty()) {
6176      MachineBasicBlock *SMBB = Successors.pop_back_val();
6177      if (SMBB->isLandingPad()) {
6178        BB->removeSuccessor(SMBB);
6179        MBBLPads.push_back(SMBB);
6180      }
6181    }
6182
6183    BB->addSuccessor(DispatchBB);
6184
6185    // Find the invoke call and mark all of the callee-saved registers as
6186    // 'implicit defined' so that they're spilled. This prevents code-motion
6187    // passes from moving instructions to before the EH block, where they would
6188    // never be executed.
6189    for (MachineBasicBlock::reverse_iterator
6190           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
6191      if (!II->isCall()) continue;
6192
6193      DenseMap<unsigned, bool> DefRegs;
6194      for (MachineInstr::mop_iterator
6195             OI = II->operands_begin(), OE = II->operands_end();
6196           OI != OE; ++OI) {
6197        if (!OI->isReg()) continue;
6198        DefRegs[OI->getReg()] = true;
6199      }
6200
6201      MachineInstrBuilder MIB(&*II);
6202
6203      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
6204        unsigned Reg = SavedRegs[i];
6205        if (Subtarget->isThumb2() &&
6206            !ARM::tGPRRegClass.contains(Reg) &&
6207            !ARM::hGPRRegClass.contains(Reg))
6208          continue;
6209        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
6210          continue;
6211        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
6212          continue;
6213        if (!DefRegs[Reg])
6214          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
6215      }
6216
6217      break;
6218    }
6219  }
6220
6221  // Mark all former landing pads as non-landing pads. The dispatch is the only
6222  // landing pad now.
6223  for (SmallVectorImpl<MachineBasicBlock*>::iterator
6224         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
6225    (*I)->setIsLandingPad(false);
6226
6227  // The instruction is gone now.
6228  MI->eraseFromParent();
6229
6230  return MBB;
6231}
6232
6233static
6234MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
6235  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
6236       E = MBB->succ_end(); I != E; ++I)
6237    if (*I != Succ)
6238      return *I;
6239  llvm_unreachable("Expecting a BB with two successors!");
6240}
6241
6242MachineBasicBlock *ARMTargetLowering::
6243EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
6244  // This pseudo instruction has 4 operands: dst, src, size, alignment
6245  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
6246  // Otherwise, we will generate unrolled scalar copies.
6247  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6248  const BasicBlock *LLVM_BB = BB->getBasicBlock();
6249  MachineFunction::iterator It = BB;
6250  ++It;
6251
6252  unsigned dest = MI->getOperand(0).getReg();
6253  unsigned src = MI->getOperand(1).getReg();
6254  unsigned SizeVal = MI->getOperand(2).getImm();
6255  unsigned Align = MI->getOperand(3).getImm();
6256  DebugLoc dl = MI->getDebugLoc();
6257
6258  bool isThumb2 = Subtarget->isThumb2();
6259  MachineFunction *MF = BB->getParent();
6260  MachineRegisterInfo &MRI = MF->getRegInfo();
6261  unsigned ldrOpc, strOpc, UnitSize = 0;
6262
6263  const TargetRegisterClass *TRC = isThumb2 ?
6264    (const TargetRegisterClass*)&ARM::tGPRRegClass :
6265    (const TargetRegisterClass*)&ARM::GPRRegClass;
6266  const TargetRegisterClass *TRC_Vec = 0;
6267
6268  if (Align & 1) {
6269    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
6270    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
6271    UnitSize = 1;
6272  } else if (Align & 2) {
6273    ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
6274    strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
6275    UnitSize = 2;
6276  } else {
6277    // Check whether we can use NEON instructions.
6278    if (!MF->getFunction()->hasFnAttr(Attribute::NoImplicitFloat) &&
6279        Subtarget->hasNEON()) {
6280      if ((Align % 16 == 0) && SizeVal >= 16) {
6281        ldrOpc = ARM::VLD1q32wb_fixed;
6282        strOpc = ARM::VST1q32wb_fixed;
6283        UnitSize = 16;
6284        TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
6285      }
6286      else if ((Align % 8 == 0) && SizeVal >= 8) {
6287        ldrOpc = ARM::VLD1d32wb_fixed;
6288        strOpc = ARM::VST1d32wb_fixed;
6289        UnitSize = 8;
6290        TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
6291      }
6292    }
6293    // Can't use NEON instructions.
6294    if (UnitSize == 0) {
6295      ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
6296      strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
6297      UnitSize = 4;
6298    }
6299  }
6300
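  // Copy LoopSize bytes in UnitSize chunks; the remaining BytesLeft
  // (SizeVal % UnitSize) bytes are copied afterwards one byte at a time.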
6301  unsigned BytesLeft = SizeVal % UnitSize;
6302  unsigned LoopSize = SizeVal - BytesLeft;
6303
6304  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
6305    // Use LDR and STR to copy.
6306    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
6307    // [destOut] = STR_POST(scratch, destIn, UnitSize)
6308    unsigned srcIn = src;
6309    unsigned destIn = dest;
6310    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
6311      unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
6312      unsigned srcOut = MRI.createVirtualRegister(TRC);
6313      unsigned destOut = MRI.createVirtualRegister(TRC);
6314      if (UnitSize >= 8) {
6315        AddDefaultPred(BuildMI(*BB, MI, dl,
6316          TII->get(ldrOpc), scratch)
6317          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
6318
6319        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
6320          .addReg(destIn).addImm(0).addReg(scratch));
6321      } else if (isThumb2) {
6322        AddDefaultPred(BuildMI(*BB, MI, dl,
6323          TII->get(ldrOpc), scratch)
6324          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
6325
6326        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
6327          .addReg(scratch).addReg(destIn)
6328          .addImm(UnitSize));
6329      } else {
6330        AddDefaultPred(BuildMI(*BB, MI, dl,
6331          TII->get(ldrOpc), scratch)
6332          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
6333          .addImm(UnitSize));
6334
6335        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
6336          .addReg(scratch).addReg(destIn)
6337          .addReg(0).addImm(UnitSize));
6338      }
6339      srcIn = srcOut;
6340      destIn = destOut;
6341    }
6342
6343    // Handle the leftover bytes with LDRB and STRB.
6344    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
6345    // [destOut] = STRB_POST(scratch, destIn, 1)
6346    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
6347    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
6348    for (unsigned i = 0; i < BytesLeft; i++) {
6349      unsigned scratch = MRI.createVirtualRegister(TRC);
6350      unsigned srcOut = MRI.createVirtualRegister(TRC);
6351      unsigned destOut = MRI.createVirtualRegister(TRC);
6352      if (isThumb2) {
6353        AddDefaultPred(BuildMI(*BB, MI, dl,
6354          TII->get(ldrOpc),scratch)
6355          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
6356
6357        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
6358          .addReg(scratch).addReg(destIn)
6359          .addReg(0).addImm(1));
6360      } else {
6361        AddDefaultPred(BuildMI(*BB, MI, dl,
6362          TII->get(ldrOpc),scratch)
6363          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
6364
6365        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
6366          .addReg(scratch).addReg(destIn)
6367          .addReg(0).addImm(1));
6368      }
6369      srcIn = srcOut;
6370      destIn = destOut;
6371    }
6372    MI->eraseFromParent();   // The instruction is gone now.
6373    return BB;
6374  }
6375
6376  // Expand the pseudo op to a loop.
6377  // thisMBB:
6378  //   ...
6379  //   movw varEnd, # --> with thumb2
6380  //   movt varEnd, #
6381  //   ldrcp varEnd, idx --> without thumb2
6382  //   fallthrough --> loopMBB
6383  // loopMBB:
6384  //   PHI varPhi, varEnd, varLoop
6385  //   PHI srcPhi, src, srcLoop
6386  //   PHI destPhi, dst, destLoop
6387  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
6388  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
6389  //   subs varLoop, varPhi, #UnitSize
6390  //   bne loopMBB
6391  //   fallthrough --> exitMBB
6392  // exitMBB:
6393  //   epilogue to handle left-over bytes
6394  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
6395  //   [destOut] = STRB_POST(scratch, destLoop, 1)
6396  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6397  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
6398  MF->insert(It, loopMBB);
6399  MF->insert(It, exitMBB);
6400
6401  // Transfer the remainder of BB and its successor edges to exitMBB.
6402  exitMBB->splice(exitMBB->begin(), BB,
6403                  llvm::next(MachineBasicBlock::iterator(MI)),
6404                  BB->end());
6405  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
6406
6407  // Load an immediate to varEnd.
6408  unsigned varEnd = MRI.createVirtualRegister(TRC);
6409  if (isThumb2) {
6410    unsigned VReg1 = varEnd;
6411    if ((LoopSize & 0xFFFF0000) != 0)
6412      VReg1 = MRI.createVirtualRegister(TRC);
6413    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
6414                   .addImm(LoopSize & 0xFFFF));
6415
6416    if ((LoopSize & 0xFFFF0000) != 0)
6417      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
6418                     .addReg(VReg1)
6419                     .addImm(LoopSize >> 16));
6420  } else {
6421    MachineConstantPool *ConstantPool = MF->getConstantPool();
6422    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
6423    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
6424
6425    // MachineConstantPool wants an explicit alignment.
6426    unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
6427    if (Align == 0)
6428      Align = getTargetData()->getTypeAllocSize(C->getType());
6429    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
6430
6431    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
6432                   .addReg(varEnd, RegState::Define)
6433                   .addConstantPoolIndex(Idx)
6434                   .addImm(0));
6435  }
6436  BB->addSuccessor(loopMBB);
6437
6438  // Generate the loop body:
6439  //   varPhi = PHI(varLoop, varEnd)
6440  //   srcPhi = PHI(srcLoop, src)
6441  //   destPhi = PHI(destLoop, dst)
6442  MachineBasicBlock *entryBB = BB;
6443  BB = loopMBB;
6444  unsigned varLoop = MRI.createVirtualRegister(TRC);
6445  unsigned varPhi = MRI.createVirtualRegister(TRC);
6446  unsigned srcLoop = MRI.createVirtualRegister(TRC);
6447  unsigned srcPhi = MRI.createVirtualRegister(TRC);
6448  unsigned destLoop = MRI.createVirtualRegister(TRC);
6449  unsigned destPhi = MRI.createVirtualRegister(TRC);
6450
6451  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
6452    .addReg(varLoop).addMBB(loopMBB)
6453    .addReg(varEnd).addMBB(entryBB);
6454  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
6455    .addReg(srcLoop).addMBB(loopMBB)
6456    .addReg(src).addMBB(entryBB);
6457  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
6458    .addReg(destLoop).addMBB(loopMBB)
6459    .addReg(dest).addMBB(entryBB);
6460
6461  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
6462  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
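  // Copies of 8 or more bytes per iteration go through a NEON scratch
  // register (TRC_Vec); smaller unit sizes use an integer register (TRC).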
6463  unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
6464  if (UnitSize >= 8) {
6465    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
6466      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
6467
6468    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
6469      .addReg(destPhi).addImm(0).addReg(scratch));
6470  } else if (isThumb2) {
6471    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
6472      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
6473
6474    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
6475      .addReg(scratch).addReg(destPhi)
6476      .addImm(UnitSize));
6477  } else {
6478    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
6479      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
6480      .addImm(UnitSize));
6481
6482    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
6483      .addReg(scratch).addReg(destPhi)
6484      .addReg(0).addImm(UnitSize));
6485  }
6486
6487  // Decrement loop variable by UnitSize.
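  // AddDefaultCC adds the optional cc_out operand as operand 5; rewrite it to
  // a CPSR def so this becomes a flag-setting SUBS that feeds the conditional
  // branch below.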
6488  MachineInstrBuilder MIB = BuildMI(BB, dl,
6489    TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
6490  AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
6491  MIB->getOperand(5).setReg(ARM::CPSR);
6492  MIB->getOperand(5).setIsDef(true);
6493
6494  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6495    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
6496
6497  // loopMBB can loop back to loopMBB or fall through to exitMBB.
6498  BB->addSuccessor(loopMBB);
6499  BB->addSuccessor(exitMBB);
6500
6501  // Add epilogue to handle BytesLeft.
6502  BB = exitMBB;
6503  MachineInstr *StartOfExit = exitMBB->begin();
6504  ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
6505  strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
6506
6507  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
6508  //   [destOut] = STRB_POST(scratch, destLoop, 1)
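  // BytesLeft is whatever remainder did not fit a full UnitSize iteration of
  // the loop; copy it one byte at a time with post-incremented byte accesses.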
6509  unsigned srcIn = srcLoop;
6510  unsigned destIn = destLoop;
6511  for (unsigned i = 0; i < BytesLeft; i++) {
6512    unsigned scratch = MRI.createVirtualRegister(TRC);
6513    unsigned srcOut = MRI.createVirtualRegister(TRC);
6514    unsigned destOut = MRI.createVirtualRegister(TRC);
6515    if (isThumb2) {
6516      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
6517        TII->get(ldrOpc), scratch)
6518        .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
6519
6520      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
6521        .addReg(scratch).addReg(destIn)
6522        .addImm(1));
6523    } else {
6524      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
6525        TII->get(ldrOpc), scratch)
6526        .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
6527
6528      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
6529        .addReg(scratch).addReg(destIn)
6530        .addReg(0).addImm(1));
6531    }
6532    srcIn = srcOut;
6533    destIn = destOut;
6534  }
6535
6536  MI->eraseFromParent();   // The instruction is gone now.
6537  return BB;
6538}
6539
6540MachineBasicBlock *
6541ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
6542                                               MachineBasicBlock *BB) const {
6543  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
6544  DebugLoc dl = MI->getDebugLoc();
6545  bool isThumb2 = Subtarget->isThumb2();
6546  switch (MI->getOpcode()) {
6547  default: {
6548    MI->dump();
6549    llvm_unreachable("Unexpected instr type to insert");
6550  }
6551  // The Thumb2 pre-indexed stores have the same MI operands; they are just
6552  // defined differently in the .td files than in the isel patterns, so
6553  // they need pseudos.
6554  case ARM::t2STR_preidx:
6555    MI->setDesc(TII->get(ARM::t2STR_PRE));
6556    return BB;
6557  case ARM::t2STRB_preidx:
6558    MI->setDesc(TII->get(ARM::t2STRB_PRE));
6559    return BB;
6560  case ARM::t2STRH_preidx:
6561    MI->setDesc(TII->get(ARM::t2STRH_PRE));
6562    return BB;
6563
6564  case ARM::STRi_preidx:
6565  case ARM::STRBi_preidx: {
6566    unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
6567      ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
6568    // Decode the offset.
6569    unsigned Offset = MI->getOperand(4).getImm();
6570    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
6571    Offset = ARM_AM::getAM2Offset(Offset);
6572    if (isSub)
6573      Offset = -Offset;
6574
6575    MachineMemOperand *MMO = *MI->memoperands_begin();
6576    BuildMI(*BB, MI, dl, TII->get(NewOpc))
6577      .addOperand(MI->getOperand(0))  // Rn_wb
6578      .addOperand(MI->getOperand(1))  // Rt
6579      .addOperand(MI->getOperand(2))  // Rn
6580      .addImm(Offset)                 // offset (skip GPR==zero_reg)
6581      .addOperand(MI->getOperand(5))  // pred
6582      .addOperand(MI->getOperand(6))
6583      .addMemOperand(MMO);
6584    MI->eraseFromParent();
6585    return BB;
6586  }
6587  case ARM::STRr_preidx:
6588  case ARM::STRBr_preidx:
6589  case ARM::STRH_preidx: {
6590    unsigned NewOpc;
6591    switch (MI->getOpcode()) {
6592    default: llvm_unreachable("unexpected opcode!");
6593    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
6594    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
6595    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
6596    }
6597    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
6598    for (unsigned i = 0; i < MI->getNumOperands(); ++i)
6599      MIB.addOperand(MI->getOperand(i));
6600    MI->eraseFromParent();
6601    return BB;
6602  }
6603  case ARM::ATOMIC_LOAD_ADD_I8:
6604     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
6605  case ARM::ATOMIC_LOAD_ADD_I16:
6606     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
6607  case ARM::ATOMIC_LOAD_ADD_I32:
6608     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
6609
6610  case ARM::ATOMIC_LOAD_AND_I8:
6611     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
6612  case ARM::ATOMIC_LOAD_AND_I16:
6613     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
6614  case ARM::ATOMIC_LOAD_AND_I32:
6615     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
6616
6617  case ARM::ATOMIC_LOAD_OR_I8:
6618     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
6619  case ARM::ATOMIC_LOAD_OR_I16:
6620     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
6621  case ARM::ATOMIC_LOAD_OR_I32:
6622     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
6623
6624  case ARM::ATOMIC_LOAD_XOR_I8:
6625     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
6626  case ARM::ATOMIC_LOAD_XOR_I16:
6627     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
6628  case ARM::ATOMIC_LOAD_XOR_I32:
6629     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
6630
6631  case ARM::ATOMIC_LOAD_NAND_I8:
6632     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
6633  case ARM::ATOMIC_LOAD_NAND_I16:
6634     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
6635  case ARM::ATOMIC_LOAD_NAND_I32:
6636     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
6637
6638  case ARM::ATOMIC_LOAD_SUB_I8:
6639     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
6640  case ARM::ATOMIC_LOAD_SUB_I16:
6641     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
6642  case ARM::ATOMIC_LOAD_SUB_I32:
6643     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
6644
6645  case ARM::ATOMIC_LOAD_MIN_I8:
6646     return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT);
6647  case ARM::ATOMIC_LOAD_MIN_I16:
6648     return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT);
6649  case ARM::ATOMIC_LOAD_MIN_I32:
6650     return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT);
6651
6652  case ARM::ATOMIC_LOAD_MAX_I8:
6653     return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT);
6654  case ARM::ATOMIC_LOAD_MAX_I16:
6655     return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT);
6656  case ARM::ATOMIC_LOAD_MAX_I32:
6657     return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT);
6658
6659  case ARM::ATOMIC_LOAD_UMIN_I8:
6660     return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO);
6661  case ARM::ATOMIC_LOAD_UMIN_I16:
6662     return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO);
6663  case ARM::ATOMIC_LOAD_UMIN_I32:
6664     return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO);
6665
6666  case ARM::ATOMIC_LOAD_UMAX_I8:
6667     return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI);
6668  case ARM::ATOMIC_LOAD_UMAX_I16:
6669     return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI);
6670  case ARM::ATOMIC_LOAD_UMAX_I32:
6671     return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI);
6672
6673  case ARM::ATOMIC_SWAP_I8:  return EmitAtomicBinary(MI, BB, 1, 0);
6674  case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0);
6675  case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0);
6676
6677  case ARM::ATOMIC_CMP_SWAP_I8:  return EmitAtomicCmpSwap(MI, BB, 1);
6678  case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
6679  case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
6680
6681
6682  case ARM::ATOMADD6432:
6683    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
6684                              isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
6685                              /*NeedsCarry*/ true);
6686  case ARM::ATOMSUB6432:
6687    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
6688                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
6689                              /*NeedsCarry*/ true);
6690  case ARM::ATOMOR6432:
6691    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
6692                              isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
6693  case ARM::ATOMXOR6432:
6694    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
6695                              isThumb2 ? ARM::t2EORrr : ARM::EORrr);
6696  case ARM::ATOMAND6432:
6697    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
6698                              isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
6699  case ARM::ATOMSWAP6432:
6700    return EmitAtomicBinary64(MI, BB, 0, 0, false);
6701  case ARM::ATOMCMPXCHG6432:
6702    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
6703                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
6704                              /*NeedsCarry*/ false, /*IsCmpxchg*/true);
6705
6706  case ARM::tMOVCCr_pseudo: {
6707    // To "insert" a SELECT_CC instruction, we actually have to insert the
6708    // diamond control-flow pattern.  The incoming instruction knows the
6709    // destination vreg to set, the condition code register to branch on, the
6710    // true/false values to select between, and a branch opcode to use.
6711    const BasicBlock *LLVM_BB = BB->getBasicBlock();
6712    MachineFunction::iterator It = BB;
6713    ++It;
6714
6715    //  thisMBB:
6716    //  ...
6717    //   TrueVal = ...
6718    //   cmpTY ccX, r1, r2
6719    //   bCC copy1MBB
6720    //   fallthrough --> copy0MBB
6721    MachineBasicBlock *thisMBB  = BB;
6722    MachineFunction *F = BB->getParent();
6723    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
6724    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
6725    F->insert(It, copy0MBB);
6726    F->insert(It, sinkMBB);
6727
6728    // Transfer the remainder of BB and its successor edges to sinkMBB.
6729    sinkMBB->splice(sinkMBB->begin(), BB,
6730                    llvm::next(MachineBasicBlock::iterator(MI)),
6731                    BB->end());
6732    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
6733
6734    BB->addSuccessor(copy0MBB);
6735    BB->addSuccessor(sinkMBB);
6736
6737    BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
6738      .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
6739
6740    //  copy0MBB:
6741    //   %FalseValue = ...
6742    //   # fallthrough to sinkMBB
6743    BB = copy0MBB;
6744
6745    // Update machine-CFG edges
6746    BB->addSuccessor(sinkMBB);
6747
6748    //  sinkMBB:
6749    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
6750    //  ...
6751    BB = sinkMBB;
6752    BuildMI(*BB, BB->begin(), dl,
6753            TII->get(ARM::PHI), MI->getOperand(0).getReg())
6754      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
6755      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
6756
6757    MI->eraseFromParent();   // The pseudo instruction is gone now.
6758    return BB;
6759  }
6760
6761  case ARM::BCCi64:
6762  case ARM::BCCZi64: {
6763    // If there is an unconditional branch to the other successor, remove it.
6764    BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
6765
6766    // Compare both parts that make up the double comparison separately for
6767    // equality.
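    // The second compare is predicated on EQ, so CPSR reads as EQ at the end
    // only if both halves of the 64-bit values compared equal.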
6768    bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
6769
6770    unsigned LHS1 = MI->getOperand(1).getReg();
6771    unsigned LHS2 = MI->getOperand(2).getReg();
6772    if (RHSisZero) {
6773      AddDefaultPred(BuildMI(BB, dl,
6774                             TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6775                     .addReg(LHS1).addImm(0));
6776      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6777        .addReg(LHS2).addImm(0)
6778        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
6779    } else {
6780      unsigned RHS1 = MI->getOperand(3).getReg();
6781      unsigned RHS2 = MI->getOperand(4).getReg();
6782      AddDefaultPred(BuildMI(BB, dl,
6783                             TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
6784                     .addReg(LHS1).addReg(RHS1));
6785      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
6786        .addReg(LHS2).addReg(RHS2)
6787        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
6788    }
6789
6790    MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
6791    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
6792    if (MI->getOperand(0).getImm() == ARMCC::NE)
6793      std::swap(destMBB, exitMBB);
6794
6795    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
6796      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
6797    if (isThumb2)
6798      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
6799    else
6800      BuildMI(BB, dl, TII->get(ARM::B)).addMBB(exitMBB);
6801
6802    MI->eraseFromParent();   // The pseudo instruction is gone now.
6803    return BB;
6804  }
6805
6806  case ARM::Int_eh_sjlj_setjmp:
6807  case ARM::Int_eh_sjlj_setjmp_nofp:
6808  case ARM::tInt_eh_sjlj_setjmp:
6809  case ARM::t2Int_eh_sjlj_setjmp:
6810  case ARM::t2Int_eh_sjlj_setjmp_nofp:
6811    EmitSjLjDispatchBlock(MI, BB);
6812    return BB;
6813
6814  case ARM::ABS:
6815  case ARM::t2ABS: {
6816    // To lower an ABS instruction, we have to insert the
6817    // diamond control-flow pattern.  The incoming instruction knows the
6818    // source vreg to test against 0 and the destination vreg to set.
6821    // It transforms
6822    //     V1 = ABS V0
6823    // into
6824    //     V2 = MOVS V0
6825    //     BCC                      (branch to SinkBB if V0 >= 0)
6826    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
6827    //     SinkBB: V1 = PHI(V2, V3)
6828    const BasicBlock *LLVM_BB = BB->getBasicBlock();
6829    MachineFunction::iterator BBI = BB;
6830    ++BBI;
6831    MachineFunction *Fn = BB->getParent();
6832    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
6833    MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
6834    Fn->insert(BBI, RSBBB);
6835    Fn->insert(BBI, SinkBB);
6836
6837    unsigned int ABSSrcReg = MI->getOperand(1).getReg();
6838    unsigned int ABSDstReg = MI->getOperand(0).getReg();
6839    bool isThumb2 = Subtarget->isThumb2();
6840    MachineRegisterInfo &MRI = Fn->getRegInfo();
6841    // In Thumb mode S must not be specified if the source register is SP/PC
6842    // or if the destination register is SP, so restrict the register class.
6843    unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ?
6844      (const TargetRegisterClass*)&ARM::rGPRRegClass :
6845      (const TargetRegisterClass*)&ARM::GPRRegClass);
6846
6847    // Transfer the remainder of BB and its successor edges to sinkMBB.
6848    SinkBB->splice(SinkBB->begin(), BB,
6849      llvm::next(MachineBasicBlock::iterator(MI)),
6850      BB->end());
6851    SinkBB->transferSuccessorsAndUpdatePHIs(BB);
6852
6853    BB->addSuccessor(RSBBB);
6854    BB->addSuccessor(SinkBB);
6855
6856    // RSBBB falls through to SinkBB
6857    RSBBB->addSuccessor(SinkBB);
6858
6859    // insert a cmp at the end of BB
6860    AddDefaultPred(BuildMI(BB, dl,
6861                           TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
6862                   .addReg(ABSSrcReg).addImm(0));
6863
6864    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
6865    BuildMI(BB, dl,
6866      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
6867      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
6868
6869    // insert rsbri in RSBBB
6870    // Note: BCC and rsbri will be converted into a predicated rsbmi
6871    // by the if-conversion pass.
6872    BuildMI(*RSBBB, RSBBB->begin(), dl,
6873      TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
6874      .addReg(ABSSrcReg, RegState::Kill)
6875      .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
6876
6877    // insert PHI in SinkBB,
6878    // reusing ABSDstReg so that uses of the ABS instruction are unchanged
6879    BuildMI(*SinkBB, SinkBB->begin(), dl,
6880      TII->get(ARM::PHI), ABSDstReg)
6881      .addReg(NewRsbDstReg).addMBB(RSBBB)
6882      .addReg(ABSSrcReg).addMBB(BB);
6883
6884    // remove ABS instruction
6885    MI->eraseFromParent();
6886
6887    // return last added BB
6888    return SinkBB;
6889  }
6890  case ARM::COPY_STRUCT_BYVAL_I32:
6891    ++NumLoopByVals;
6892    return EmitStructByval(MI, BB);
6893  }
6894}
6895
6896void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
6897                                                      SDNode *Node) const {
6898  if (!MI->hasPostISelHook()) {
6899    assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
6900           "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
6901    return;
6902  }
6903
6904  const MCInstrDesc *MCID = &MI->getDesc();
6905  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
6906  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
6907  // operand is still set to noreg. If needed, set the optional operand's
6908  // register to CPSR, and remove the redundant implicit def.
6909  //
6910  // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
6911
6912  // Rename pseudo opcodes.
6913  unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
6914  if (NewOpc) {
6915    const ARMBaseInstrInfo *TII =
6916      static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
6917    MCID = &TII->get(NewOpc);
6918
6919    assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
6920           "converted opcode should be the same except for cc_out");
6921
6922    MI->setDesc(*MCID);
6923
6924    // Add the optional cc_out operand
6925    MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
6926  }
6927  unsigned ccOutIdx = MCID->getNumOperands() - 1;
6928
6929  // Any ARM instruction that sets the 's' bit should specify an optional
6930  // "cc_out" operand in the last operand position.
6931  if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
6932    assert(!NewOpc && "Optional cc_out operand required");
6933    return;
6934  }
6935  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
6936  // since we already have an optional CPSR def.
6937  bool definesCPSR = false;
6938  bool deadCPSR = false;
6939  for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
6940       i != e; ++i) {
6941    const MachineOperand &MO = MI->getOperand(i);
6942    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
6943      definesCPSR = true;
6944      if (MO.isDead())
6945        deadCPSR = true;
6946      MI->RemoveOperand(i);
6947      break;
6948    }
6949  }
6950  if (!definesCPSR) {
6951    assert(!NewOpc && "Optional cc_out operand required");
6952    return;
6953  }
6954  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
6955  if (deadCPSR) {
6956    assert(!MI->getOperand(ccOutIdx).getReg() &&
6957           "expect uninitialized optional cc_out operand");
6958    return;
6959  }
6960
6961  // If this instruction was defined with an optional CPSR def and its dag node
6962  // had a live implicit CPSR def, then activate the optional CPSR def.
6963  MachineOperand &MO = MI->getOperand(ccOutIdx);
6964  MO.setReg(ARM::CPSR);
6965  MO.setIsDef(true);
6966}
6967
6968//===----------------------------------------------------------------------===//
6969//                           ARM Optimization Hooks
6970//===----------------------------------------------------------------------===//
6971
6972// Helper function that checks if N is a null or all ones constant.
6973static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
6974  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
6975  if (!C)
6976    return false;
6977  return AllOnes ? C->isAllOnesValue() : C->isNullValue();
6978}
6979
6980// Return true if N is conditionally 0 or all ones.
6981// Detects these expressions where cc is an i1 value:
6982//
6983//   (select cc 0, y)   [AllOnes=0]
6984//   (select cc y, 0)   [AllOnes=0]
6985//   (zext cc)          [AllOnes=0]
6986//   (sext cc)          [AllOnes=0/1]
6987//   (select cc -1, y)  [AllOnes=1]
6988//   (select cc y, -1)  [AllOnes=1]
6989//
6990  // Invert is set when N is the null/all-ones constant only when CC is false.
6991// OtherOp is set to the alternative value of N.
6992static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
6993                                       SDValue &CC, bool &Invert,
6994                                       SDValue &OtherOp,
6995                                       SelectionDAG &DAG) {
6996  switch (N->getOpcode()) {
6997  default: return false;
6998  case ISD::SELECT: {
6999    CC = N->getOperand(0);
7000    SDValue N1 = N->getOperand(1);
7001    SDValue N2 = N->getOperand(2);
7002    if (isZeroOrAllOnes(N1, AllOnes)) {
7003      Invert = false;
7004      OtherOp = N2;
7005      return true;
7006    }
7007    if (isZeroOrAllOnes(N2, AllOnes)) {
7008      Invert = true;
7009      OtherOp = N1;
7010      return true;
7011    }
7012    return false;
7013  }
7014  case ISD::ZERO_EXTEND:
7015    // (zext cc) can never be the all ones value.
7016    if (AllOnes)
7017      return false;
7018    // Fall through.
7019  case ISD::SIGN_EXTEND: {
7020    EVT VT = N->getValueType(0);
7021    CC = N->getOperand(0);
7022    if (CC.getValueType() != MVT::i1)
7023      return false;
7024    Invert = !AllOnes;
7025    if (AllOnes)
7026      // When looking for an AllOnes constant, N is an sext, and the 'other'
7027      // value is 0.
7028      OtherOp = DAG.getConstant(0, VT);
7029    else if (N->getOpcode() == ISD::ZERO_EXTEND)
7030      // When looking for a 0 constant, N can be zext or sext.
7031      OtherOp = DAG.getConstant(1, VT);
7032    else
7033      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
7034    return true;
7035  }
7036  }
7037}
7038
7039// Combine a constant select operand into its use:
7040//
7041//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
7042//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
7043//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
7044//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
7045//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
7046//
7047// The transform is rejected if the select doesn't have a constant operand that
7048// is null, or all ones when AllOnes is set.
7049//
7050// Also recognize sext/zext from i1:
7051//
7052//   (add (zext cc), x) -> (select cc (add x, 1), x)
7053//   (add (sext cc), x) -> (select cc (add x, -1), x)
7054//
7055// These transformations eventually create predicated instructions.
7056//
7057// @param N       The node to transform.
7058// @param Slct    The N operand that is a select.
7059// @param OtherOp The other N operand (x above).
7060// @param DCI     Context.
7061// @param AllOnes Require the select constant to be all ones instead of null.
7062// @returns The new node, or SDValue() on failure.
7063static
7064SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
7065                            TargetLowering::DAGCombinerInfo &DCI,
7066                            bool AllOnes = false) {
7067  SelectionDAG &DAG = DCI.DAG;
7068  EVT VT = N->getValueType(0);
7069  SDValue NonConstantVal;
7070  SDValue CCOp;
7071  bool SwapSelectOps;
7072  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
7073                                  NonConstantVal, DAG))
7074    return SDValue();
7075
7076  // Slct is now known to be the desired identity constant when CC is true.
7077  SDValue TrueVal = OtherOp;
7078  SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
7079                                 OtherOp, NonConstantVal);
7080  // Unless SwapSelectOps says CC should be false.
7081  if (SwapSelectOps)
7082    std::swap(TrueVal, FalseVal);
7083
7084  return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
7085                     CCOp, TrueVal, FalseVal);
7086}
7087
7088// Attempt combineSelectAndUse on each operand of a commutative operator N.
7089static
7090SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
7091                                       TargetLowering::DAGCombinerInfo &DCI) {
7092  SDValue N0 = N->getOperand(0);
7093  SDValue N1 = N->getOperand(1);
7094  if (N0.getNode()->hasOneUse()) {
7095    SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
7096    if (Result.getNode())
7097      return Result;
7098  }
7099  if (N1.getNode()->hasOneUse()) {
7100    SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
7101    if (Result.getNode())
7102      return Result;
7103  }
7104  return SDValue();
7105}
7106
7107  // AddCombineToVPADDL - For a pair-wise add on NEON, use the vpaddl
7108  // instruction (only after legalization).
7109static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
7110                                 TargetLowering::DAGCombinerInfo &DCI,
7111                                 const ARMSubtarget *Subtarget) {
7112
7113  // Only perform the optimization after legalization and if NEON is
7114  // available. We also expect both operands to be BUILD_VECTORs.
7115  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
7116      || N0.getOpcode() != ISD::BUILD_VECTOR
7117      || N1.getOpcode() != ISD::BUILD_VECTOR)
7118    return SDValue();
7119
7120  // Check output type: VPADDL operand elements can only be 8, 16, or 32 bits.
7121  EVT VT = N->getValueType(0);
7122  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
7123    return SDValue();
7124
7125  // Check that the vector operands are of the right form.
7126  // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR_ELT operands,
7127  // where N is the number of elements in the formed vector.
7128  // Each EXTRACT_VECTOR_ELT should reference the same input vector, with N0
7129  // supplying the even indices and N1 the odd ones, forming a pair-wise add.
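  // For example (a sketch, assuming a <4 x i16> source vector v):
  //   N0 = BUILD_VECTOR(extract(v, 0), extract(v, 2))
  //   N1 = BUILD_VECTOR(extract(v, 1), extract(v, 3))
  //   add N0, N1  ==>  vpaddl v   (element i becomes v[2i] + v[2i+1])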
7130
7131  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
7132  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7133    return SDValue();
7134  SDValue Vec = N0->getOperand(0)->getOperand(0);
7135  SDNode *V = Vec.getNode();
7136  unsigned nextIndex = 0;
7137
7138  // For each operand of the ADD that is a BUILD_VECTOR,
7139  // check whether each of its operands is an EXTRACT_VECTOR_ELT from
7140  // the same vector with the appropriate index.
7141  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
7142    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
7143        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
7144
7145      SDValue ExtVec0 = N0->getOperand(i);
7146      SDValue ExtVec1 = N1->getOperand(i);
7147
7148      // The first operand is the vector; verify it is the same.
7149      if (V != ExtVec0->getOperand(0).getNode() ||
7150          V != ExtVec1->getOperand(0).getNode())
7151        return SDValue();
7152
7153      // The second operand is the constant index; verify it is correct.
7154      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
7155      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
7156
7157      // For the constants, N0 should supply the even indices and N1 the odd.
7158      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
7159          || C1->getZExtValue() != nextIndex+1)
7160        return SDValue();
7161
7162      // Increment index.
7163      nextIndex += 2;
7164    } else
7165      return SDValue();
7166  }
7167
7168  // Create VPADDL node.
7169  SelectionDAG &DAG = DCI.DAG;
7170  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7171
7172  // Build operand list.
7173  SmallVector<SDValue, 8> Ops;
7174  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls,
7175                                TLI.getPointerTy()));
7176
7177  // Input is the vector.
7178  Ops.push_back(Vec);
7179
7180  // Get the widened vector type.
7181  MVT widenType;
7182  unsigned numElem = VT.getVectorNumElements();
7183  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7184    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
7185    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
7186    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
7187    default:
7188      llvm_unreachable("Invalid vector element type for padd optimization.");
7189  }
7190
7191  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
7192                            widenType, &Ops[0], Ops.size());
7193  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
7194}
7195
7196/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
7197/// operands N0 and N1.  This is a helper for PerformADDCombine that is
7198/// called with the default operands, and if that fails, with commuted
7199/// operands.
7200static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
7201                                          TargetLowering::DAGCombinerInfo &DCI,
7202                                          const ARMSubtarget *Subtarget){
7203
7204  // Attempt to create vpaddl for this add.
7205  SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
7206  if (Result.getNode())
7207    return Result;
7208
7209  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
7210  if (N0.getNode()->hasOneUse()) {
7211    SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
7212    if (Result.getNode()) return Result;
7213  }
7214  return SDValue();
7215}
7216
7217/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
7218///
7219static SDValue PerformADDCombine(SDNode *N,
7220                                 TargetLowering::DAGCombinerInfo &DCI,
7221                                 const ARMSubtarget *Subtarget) {
7222  SDValue N0 = N->getOperand(0);
7223  SDValue N1 = N->getOperand(1);
7224
7225  // First try with the default operand order.
7226  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
7227  if (Result.getNode())
7228    return Result;
7229
7230  // If that didn't work, try again with the operands commuted.
7231  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
7232}
7233
7234/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
7235///
7236static SDValue PerformSUBCombine(SDNode *N,
7237                                 TargetLowering::DAGCombinerInfo &DCI) {
7238  SDValue N0 = N->getOperand(0);
7239  SDValue N1 = N->getOperand(1);
7240
7241  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
7242  if (N1.getNode()->hasOneUse()) {
7243    SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
7244    if (Result.getNode()) return Result;
7245  }
7246
7247  return SDValue();
7248}
7249
7250/// PerformVMULCombine
7251/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
7252/// special multiplier accumulator forwarding.
7253///   vmul d3, d0, d2
7254///   vmla d3, d1, d2
7255/// is faster than
7256///   vadd d3, d0, d1
7257///   vmul d3, d3, d2
7258static SDValue PerformVMULCombine(SDNode *N,
7259                                  TargetLowering::DAGCombinerInfo &DCI,
7260                                  const ARMSubtarget *Subtarget) {
7261  if (!Subtarget->hasVMLxForwarding())
7262    return SDValue();
7263
7264  SelectionDAG &DAG = DCI.DAG;
7265  SDValue N0 = N->getOperand(0);
7266  SDValue N1 = N->getOperand(1);
7267  unsigned Opcode = N0.getOpcode();
7268  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
7269      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
7270    Opcode = N1.getOpcode();
7271    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
7272        Opcode != ISD::FADD && Opcode != ISD::FSUB)
7273      return SDValue();
7274    std::swap(N0, N1);
7275  }
7276
7277  EVT VT = N->getValueType(0);
7278  DebugLoc DL = N->getDebugLoc();
7279  SDValue N00 = N0->getOperand(0);
7280  SDValue N01 = N0->getOperand(1);
7281  return DAG.getNode(Opcode, DL, VT,
7282                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
7283                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
7284}
7285
7286static SDValue PerformMULCombine(SDNode *N,
7287                                 TargetLowering::DAGCombinerInfo &DCI,
7288                                 const ARMSubtarget *Subtarget) {
7289  SelectionDAG &DAG = DCI.DAG;
7290
7291  if (Subtarget->isThumb1Only())
7292    return SDValue();
7293
7294  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
7295    return SDValue();
7296
7297  EVT VT = N->getValueType(0);
7298  if (VT.is64BitVector() || VT.is128BitVector())
7299    return PerformVMULCombine(N, DCI, Subtarget);
7300  if (VT != MVT::i32)
7301    return SDValue();
7302
7303  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
7304  if (!C)
7305    return SDValue();
7306
7307  int64_t MulAmt = C->getSExtValue();
7308  unsigned ShiftAmt = CountTrailingZeros_64(MulAmt);
7309
7310  ShiftAmt = ShiftAmt & (32 - 1);
7311  SDValue V = N->getOperand(0);
7312  DebugLoc DL = N->getDebugLoc();
7313
7314  SDValue Res;
7315  MulAmt >>= ShiftAmt;
7316
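  // For example, (mul x, 10) strips the trailing zero bit (ShiftAmt = 1),
  // lowers x*5 as (add (shl x, 2), x), and then shifts the result left by 1.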
7317  if (MulAmt >= 0) {
7318    if (isPowerOf2_32(MulAmt - 1)) {
7319      // (mul x, 2^N + 1) => (add (shl x, N), x)
7320      Res = DAG.getNode(ISD::ADD, DL, VT,
7321                        V,
7322                        DAG.getNode(ISD::SHL, DL, VT,
7323                                    V,
7324                                    DAG.getConstant(Log2_32(MulAmt - 1),
7325                                                    MVT::i32)));
7326    } else if (isPowerOf2_32(MulAmt + 1)) {
7327      // (mul x, 2^N - 1) => (sub (shl x, N), x)
7328      Res = DAG.getNode(ISD::SUB, DL, VT,
7329                        DAG.getNode(ISD::SHL, DL, VT,
7330                                    V,
7331                                    DAG.getConstant(Log2_32(MulAmt + 1),
7332                                                    MVT::i32)),
7333                        V);
7334    } else
7335      return SDValue();
7336  } else {
7337    uint64_t MulAmtAbs = -MulAmt;
7338    if (isPowerOf2_32(MulAmtAbs + 1)) {
7339      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
7340      Res = DAG.getNode(ISD::SUB, DL, VT,
7341                        V,
7342                        DAG.getNode(ISD::SHL, DL, VT,
7343                                    V,
7344                                    DAG.getConstant(Log2_32(MulAmtAbs + 1),
7345                                                    MVT::i32)));
7346    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
7347      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
7348      Res = DAG.getNode(ISD::ADD, DL, VT,
7349                        V,
7350                        DAG.getNode(ISD::SHL, DL, VT,
7351                                    V,
7352                                    DAG.getConstant(Log2_32(MulAmtAbs-1),
7353                                                    MVT::i32)));
7354      Res = DAG.getNode(ISD::SUB, DL, VT,
7355                        DAG.getConstant(0, MVT::i32),Res);
7356
7357    } else
7358      return SDValue();
7359  }
7360
7361  if (ShiftAmt != 0)
7362    Res = DAG.getNode(ISD::SHL, DL, VT,
7363                      Res, DAG.getConstant(ShiftAmt, MVT::i32));
7364
7365  // Do not add new nodes to DAG combiner worklist.
7366  DCI.CombineTo(N, Res, false);
7367  return SDValue();
7368}
7369
7370static SDValue PerformANDCombine(SDNode *N,
7371                                 TargetLowering::DAGCombinerInfo &DCI,
7372                                 const ARMSubtarget *Subtarget) {
7373
7374  // Attempt to use immediate-form VBIC
7375  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
7376  DebugLoc dl = N->getDebugLoc();
7377  EVT VT = N->getValueType(0);
7378  SelectionDAG &DAG = DCI.DAG;
7379
7380  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
7381    return SDValue();
7382
7383  APInt SplatBits, SplatUndef;
7384  unsigned SplatBitSize;
7385  bool HasAnyUndefs;
7386  if (BVN &&
7387      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7388    if (SplatBitSize <= 64) {
7389      EVT VbicVT;
7390      SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
7391                                      SplatUndef.getZExtValue(), SplatBitSize,
7392                                      DAG, VbicVT, VT.is128BitVector(),
7393                                      OtherModImm);
7394      if (Val.getNode()) {
7395        SDValue Input =
7396          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
7397        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
7398        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
7399      }
7400    }
7401  }
7402
7403  if (!Subtarget->isThumb1Only()) {
7404    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
7405    SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
7406    if (Result.getNode())
7407      return Result;
7408  }
7409
7410  return SDValue();
7411}
7412
7413/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
7414static SDValue PerformORCombine(SDNode *N,
7415                                TargetLowering::DAGCombinerInfo &DCI,
7416                                const ARMSubtarget *Subtarget) {
7417  // Attempt to use immediate-form VORR
7418  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
7419  DebugLoc dl = N->getDebugLoc();
7420  EVT VT = N->getValueType(0);
7421  SelectionDAG &DAG = DCI.DAG;
7422
7423  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
7424    return SDValue();
7425
7426  APInt SplatBits, SplatUndef;
7427  unsigned SplatBitSize;
7428  bool HasAnyUndefs;
7429  if (BVN && Subtarget->hasNEON() &&
7430      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7431    if (SplatBitSize <= 64) {
7432      EVT VorrVT;
7433      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
7434                                      SplatUndef.getZExtValue(), SplatBitSize,
7435                                      DAG, VorrVT, VT.is128BitVector(),
7436                                      OtherModImm);
7437      if (Val.getNode()) {
7438        SDValue Input =
7439          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
7440        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
7441        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
7442      }
7443    }
7444  }
7445
7446  if (!Subtarget->isThumb1Only()) {
7447    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
7448    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
7449    if (Result.getNode())
7450      return Result;
7451  }
7452
7453  // The code below optimizes (or (and X, Y), Z).
7454  // The AND operand needs to have a single user to make these optimizations
7455  // profitable.
7456  SDValue N0 = N->getOperand(0);
7457  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
7458    return SDValue();
7459  SDValue N1 = N->getOperand(1);
7460
7461  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
7462  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
7463      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
7464    APInt SplatUndef;
7465    unsigned SplatBitSize;
7466    bool HasAnyUndefs;
7467
7468    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
7469    APInt SplatBits0;
7470    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
7471                                  HasAnyUndefs) && !HasAnyUndefs) {
7472      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
7473      APInt SplatBits1;
7474      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
7475                                    HasAnyUndefs) && !HasAnyUndefs &&
7476          SplatBits0 == ~SplatBits1) {
7477        // Canonicalize the vector type to make instruction selection simpler.
7478        EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
7479        SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
7480                                     N0->getOperand(1), N0->getOperand(0),
7481                                     N1->getOperand(0));
7482        return DAG.getNode(ISD::BITCAST, dl, VT, Result);
7483      }
7484    }
7485  }
7486
7487  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
7488  // reasonable.
7489
7490  // BFI is only available on V6T2+
7491  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
7492    return SDValue();
7493
7494  DebugLoc DL = N->getDebugLoc();
7495  // 1) or (and A, mask), val => ARMbfi A, val, mask
7496  //      iff (val & mask) == val
7497  //
7498  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
7499  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
7500  //          && mask == ~mask2
7501  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
7502  //          && ~mask == mask2
7503  //  (i.e., copy a bitfield value into another bitfield of the same width)
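  // For example (case 2a, with hypothetical masks): given mask = 0xffff00ff
  // and mask2 = 0x0000ff00, bits [15:8] of B are shifted down and BFI'd into
  // bits [15:8] of A.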
7504
7505  if (VT != MVT::i32)
7506    return SDValue();
7507
7508  SDValue N00 = N0.getOperand(0);
7509
7510  // The value and the mask need to be constants so we can verify this is
7511  // actually a bitfield set. If the mask is 0xffff, we can do better
7512  // via a movt instruction, so don't use BFI in that case.
7513  SDValue MaskOp = N0.getOperand(1);
7514  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
7515  if (!MaskC)
7516    return SDValue();
7517  unsigned Mask = MaskC->getZExtValue();
7518  if (Mask == 0xffff)
7519    return SDValue();
7520  SDValue Res;
7521  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
7522  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
7523  if (N1C) {
7524    unsigned Val = N1C->getZExtValue();
7525    if ((Val & ~Mask) != Val)
7526      return SDValue();
7527
7528    if (ARM::isBitFieldInvertedMask(Mask)) {
7529      Val >>= CountTrailingZeros_32(~Mask);
7530
7531      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
7532                        DAG.getConstant(Val, MVT::i32),
7533                        DAG.getConstant(Mask, MVT::i32));
7534
7535      // Do not add new nodes to DAG combiner worklist.
7536      DCI.CombineTo(N, Res, false);
7537      return SDValue();
7538    }
7539  } else if (N1.getOpcode() == ISD::AND) {
7540    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
7541    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
7542    if (!N11C)
7543      return SDValue();
7544    unsigned Mask2 = N11C->getZExtValue();
7545
7546    // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI
7547    // pattern to match as-is.
7548    if (ARM::isBitFieldInvertedMask(Mask) &&
7549        (Mask == ~Mask2)) {
7550      // The pack halfword instruction works better for masks that fit it,
7551      // so use that when it's available.
7552      if (Subtarget->hasT2ExtractPack() &&
7553          (Mask == 0xffff || Mask == 0xffff0000))
7554        return SDValue();
7555      // 2a
7556      unsigned amt = CountTrailingZeros_32(Mask2);
7557      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
7558                        DAG.getConstant(amt, MVT::i32));
7559      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
7560                        DAG.getConstant(Mask, MVT::i32));
7561      // Do not add new nodes to DAG combiner worklist.
7562      DCI.CombineTo(N, Res, false);
7563      return SDValue();
7564    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
7565               (~Mask == Mask2)) {
7566      // The pack halfword instruction works better for masks that fit it,
7567      // so use that when it's available.
7568      if (Subtarget->hasT2ExtractPack() &&
7569          (Mask2 == 0xffff || Mask2 == 0xffff0000))
7570        return SDValue();
7571      // 2b
7572      unsigned lsb = CountTrailingZeros_32(Mask);
7573      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
7574                        DAG.getConstant(lsb, MVT::i32));
7575      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
7576                        DAG.getConstant(Mask2, MVT::i32));
7577      // Do not add new nodes to DAG combiner worklist.
7578      DCI.CombineTo(N, Res, false);
7579      return SDValue();
7580    }
7581  }
7582
7583  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
7584      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
7585      ARM::isBitFieldInvertedMask(~Mask)) {
7586    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
7587    // where lsb(mask) == #shamt and masked bits of B are known zero.
7588    SDValue ShAmt = N00.getOperand(1);
7589    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
7590    unsigned LSB = CountTrailingZeros_32(Mask);
7591    if (ShAmtC != LSB)
7592      return SDValue();
7593
7594    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
7595                      DAG.getConstant(~Mask, MVT::i32));
7596
7597    // Do not add new nodes to DAG combiner worklist.
7598    DCI.CombineTo(N, Res, false);
7599  }
7600
7601  return SDValue();
7602}
7603
7604static SDValue PerformXORCombine(SDNode *N,
7605                                 TargetLowering::DAGCombinerInfo &DCI,
7606                                 const ARMSubtarget *Subtarget) {
7607  EVT VT = N->getValueType(0);
7608  SelectionDAG &DAG = DCI.DAG;
7609
7610  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
7611    return SDValue();
7612
7613  if (!Subtarget->isThumb1Only()) {
7614    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
7615    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
7616    if (Result.getNode())
7617      return Result;
7618  }
7619
7620  return SDValue();
7621}
7622
7623/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
7624/// the bits being cleared by the AND are not demanded by the BFI.
7625static SDValue PerformBFICombine(SDNode *N,
7626                                 TargetLowering::DAGCombinerInfo &DCI) {
7627  SDValue N1 = N->getOperand(1);
7628  if (N1.getOpcode() == ISD::AND) {
7629    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
7630    if (!N11C)
7631      return SDValue();
7632    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
7633    unsigned LSB = CountTrailingZeros_32(~InvMask);
7634    unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB;
7635    unsigned Mask = (1 << Width)-1;
7636    unsigned Mask2 = N11C->getZExtValue();
7637    if ((Mask & (~Mask2)) == 0)
7638      return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0),
7639                             N->getOperand(0), N1.getOperand(0),
7640                             N->getOperand(2));
7641  }
7642  return SDValue();
7643}
7644
7645/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
7646/// ARMISD::VMOVRRD.
7647static SDValue PerformVMOVRRDCombine(SDNode *N,
7648                                     TargetLowering::DAGCombinerInfo &DCI) {
7649  // vmovrrd(vmovdrr x, y) -> x,y
7650  SDValue InDouble = N->getOperand(0);
7651  if (InDouble.getOpcode() == ARMISD::VMOVDRR)
7652    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
7653
7654  // vmovrrd(load f64) -> (load i32), (load i32)
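  // Loading the two halves directly into core registers avoids materializing
  // the f64 in a VFP register only to transfer it back out with VMOVRRD.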
7655  SDNode *InNode = InDouble.getNode();
7656  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
7657      InNode->getValueType(0) == MVT::f64 &&
7658      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
7659      !cast<LoadSDNode>(InNode)->isVolatile()) {
7660    // TODO: Should this be done for non-FrameIndex operands?
7661    LoadSDNode *LD = cast<LoadSDNode>(InNode);
7662
7663    SelectionDAG &DAG = DCI.DAG;
7664    DebugLoc DL = LD->getDebugLoc();
7665    SDValue BasePtr = LD->getBasePtr();
7666    SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
7667                                 LD->getPointerInfo(), LD->isVolatile(),
7668                                 LD->isNonTemporal(), LD->isInvariant(),
7669                                 LD->getAlignment());
7670
7671    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
7672                                    DAG.getConstant(4, MVT::i32));
7673    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
7674                                 LD->getPointerInfo(), LD->isVolatile(),
7675                                 LD->isNonTemporal(), LD->isInvariant(),
7676                                 std::min(4U, LD->getAlignment() / 2));
7677
7678    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
7679    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
7680    DCI.RemoveFromWorklist(LD);
7681    DAG.DeleteNode(LD);
7682    return Result;
7683  }
7684
7685  return SDValue();
7686}
7687
7688/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
7689/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
7690static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
7691  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
7692  SDValue Op0 = N->getOperand(0);
7693  SDValue Op1 = N->getOperand(1);
7694  if (Op0.getOpcode() == ISD::BITCAST)
7695    Op0 = Op0.getOperand(0);
7696  if (Op1.getOpcode() == ISD::BITCAST)
7697    Op1 = Op1.getOperand(0);
7698  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
7699      Op0.getNode() == Op1.getNode() &&
7700      Op0.getResNo() == 0 && Op1.getResNo() == 1)
7701    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
7702                       N->getValueType(0), Op0.getOperand(0));
7703  return SDValue();
7704}
7705
7706/// PerformSTORECombine - Target-specific dag combine xforms for
7707/// ISD::STORE.
7708static SDValue PerformSTORECombine(SDNode *N,
7709                                   TargetLowering::DAGCombinerInfo &DCI) {
7710  StoreSDNode *St = cast<StoreSDNode>(N);
7711  if (St->isVolatile())
7712    return SDValue();
7713
7714  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
7715  // pack all of the elements in one place.  Next, store to memory in fewer
7716  // chunks.
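  // For example (a sketch): a truncating store of <4 x i32> to <4 x i8> is
  // bitcast to <16 x i8>, shuffled so elements 0, 4, 8 and 12 land at the
  // front, and then written out as a single i32 store.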
7717  SDValue StVal = St->getValue();
7718  EVT VT = StVal.getValueType();
7719  if (St->isTruncatingStore() && VT.isVector()) {
7720    SelectionDAG &DAG = DCI.DAG;
7721    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7722    EVT StVT = St->getMemoryVT();
7723    unsigned NumElems = VT.getVectorNumElements();
7724    assert(StVT != VT && "Cannot truncate to the same type");
7725    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
7726    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
7727
7728    // From/To element sizes and the element count must be powers of two.
7729    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
7730
7731    // We are going to use the original vector elt for storing.
7732    // Accumulated smaller vector elements must be a multiple of the store size.
7733    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
7734
7735    unsigned SizeRatio  = FromEltSz / ToEltSz;
7736    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
7737
7738    // Create a type on which we perform the shuffle.
7739    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
7740                                     NumElems*SizeRatio);
7741    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
7742
7743    DebugLoc DL = St->getDebugLoc();
7744    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
7745    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
7746    for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
7747
7748    // Can't shuffle using an illegal type.
7749    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
7750
7751    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
7752                                DAG.getUNDEF(WideVec.getValueType()),
7753                                ShuffleVec.data());
7754    // At this point all of the data is stored at the bottom of the
7755    // register. We now need to save it to memory.
7756
7757    // Find the largest store unit
7758    MVT StoreType = MVT::i8;
7759    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
7760         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
7761      MVT Tp = (MVT::SimpleValueType)tp;
7762      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
7763        StoreType = Tp;
7764    }
7765    // Didn't find a legal store type.
7766    if (!TLI.isTypeLegal(StoreType))
7767      return SDValue();
7768
7769    // Bitcast the shuffled vector into a vector of store-size units
7770    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
7771            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
7772    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
7773    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
7774    SmallVector<SDValue, 8> Chains;
7775    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
7776                                        TLI.getPointerTy());
7777    SDValue BasePtr = St->getBasePtr();
7778
7779    // Perform one or more big stores into memory.
7780    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
7781    for (unsigned I = 0; I < E; I++) {
7782      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
7783                                   StoreType, ShuffWide,
7784                                   DAG.getIntPtrConstant(I));
7785      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
7786                                St->getPointerInfo(), St->isVolatile(),
7787                                St->isNonTemporal(), St->getAlignment());
7788      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
7789                            Increment);
7790      Chains.push_back(Ch);
7791    }
7792    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
7793                       Chains.size());
7794  }
7795
7796  if (!ISD::isNormalStore(St))
7797    return SDValue();
7798
7799  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
7800  // ARM stores of arguments in the same cache line.
7801  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
7802      StVal.getNode()->hasOneUse()) {
7803    SelectionDAG  &DAG = DCI.DAG;
7804    DebugLoc DL = St->getDebugLoc();
7805    SDValue BasePtr = St->getBasePtr();
7806    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
7807                                  StVal.getNode()->getOperand(0), BasePtr,
7808                                  St->getPointerInfo(), St->isVolatile(),
7809                                  St->isNonTemporal(), St->getAlignment());
7810
7811    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
7812                                    DAG.getConstant(4, MVT::i32));
7813    return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
7814                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
7815                        St->isNonTemporal(),
7816                        std::min(4U, St->getAlignment() / 2));
7817  }
7818
7819  if (StVal.getValueType() != MVT::i64 ||
7820      StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7821    return SDValue();
7822
7823  // Bitcast an i64 store extracted from a vector to f64.
7824  // Otherwise, the i64 value will be legalized to a pair of i32 values.
7825  SelectionDAG &DAG = DCI.DAG;
7826  DebugLoc dl = StVal.getDebugLoc();
7827  SDValue IntVec = StVal.getOperand(0);
7828  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
7829                                 IntVec.getValueType().getVectorNumElements());
7830  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
7831  SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
7832                               Vec, StVal.getOperand(1));
7833  dl = N->getDebugLoc();
7834  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
7835  // Make the DAGCombiner fold the bitcasts.
7836  DCI.AddToWorklist(Vec.getNode());
7837  DCI.AddToWorklist(ExtElt.getNode());
7838  DCI.AddToWorklist(V.getNode());
7839  return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
7840                      St->getPointerInfo(), St->isVolatile(),
7841                      St->isNonTemporal(), St->getAlignment(),
7842                      St->getTBAAInfo());
7843}
7844
7845/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
7846/// are normal, non-volatile loads.  If so, it is profitable to bitcast an
7847/// i64 vector to have f64 elements, since the value can then be loaded
7848/// directly into a VFP register.
7849static bool hasNormalLoadOperand(SDNode *N) {
7850  unsigned NumElts = N->getValueType(0).getVectorNumElements();
7851  for (unsigned i = 0; i < NumElts; ++i) {
7852    SDNode *Elt = N->getOperand(i).getNode();
7853    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
7854      return true;
7855  }
7856  return false;
7857}
7858
7859/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
7860/// ISD::BUILD_VECTOR.
7861static SDValue PerformBUILD_VECTORCombine(SDNode *N,
7862                                          TargetLowering::DAGCombinerInfo &DCI){
7863  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
7864  // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
7865  // into a pair of GPRs, which is fine when the value is used as a scalar,
7866  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
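  // For example, a v2i32 build_vector of the two i32 halves produced by
  // VMOVRRD(X) is roughly equivalent to X bitcast to v2i32, which avoids the
  // round trip through the integer registers.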
7867  SelectionDAG &DAG = DCI.DAG;
7868  if (N->getNumOperands() == 2) {
7869    SDValue RV = PerformVMOVDRRCombine(N, DAG);
7870    if (RV.getNode())
7871      return RV;
7872  }
7873
7874  // Load i64 elements as f64 values so that type legalization does not split
7875  // them up into i32 values.
7876  EVT VT = N->getValueType(0);
7877  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
7878    return SDValue();
7879  DebugLoc dl = N->getDebugLoc();
7880  SmallVector<SDValue, 8> Ops;
7881  unsigned NumElts = VT.getVectorNumElements();
7882  for (unsigned i = 0; i < NumElts; ++i) {
7883    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
7884    Ops.push_back(V);
7885    // Make the DAGCombiner fold the bitcast.
7886    DCI.AddToWorklist(V.getNode());
7887  }
7888  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
7889  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts);
7890  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
7891}
7892
7893/// PerformInsertEltCombine - Target-specific dag combine xforms for
7894/// ISD::INSERT_VECTOR_ELT.
7895static SDValue PerformInsertEltCombine(SDNode *N,
7896                                       TargetLowering::DAGCombinerInfo &DCI) {
7897  // Bitcast an i64 load inserted into a vector to f64.
7898  // Otherwise, the i64 value will be legalized to a pair of i32 values.
7899  EVT VT = N->getValueType(0);
7900  SDNode *Elt = N->getOperand(1).getNode();
7901  if (VT.getVectorElementType() != MVT::i64 ||
7902      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
7903    return SDValue();
7904
7905  SelectionDAG &DAG = DCI.DAG;
7906  DebugLoc dl = N->getDebugLoc();
7907  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
7908                                 VT.getVectorNumElements());
7909  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
7910  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
7911  // Make the DAGCombiner fold the bitcasts.
7912  DCI.AddToWorklist(Vec.getNode());
7913  DCI.AddToWorklist(V.getNode());
7914  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
7915                               Vec, V, N->getOperand(2));
7916  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
7917}
7918
7919/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
7920/// ISD::VECTOR_SHUFFLE.
7921static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
7922  // The LLVM shufflevector instruction does not require the shuffle mask
7923  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
7924  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
7925  // operands do not match the mask length, they are extended by concatenating
7926  // them with undef vectors.  That is probably the right thing for other
7927  // targets, but for NEON it is better to concatenate two double-register
7928  // size vector operands into a single quad-register size vector.  Do that
7929  // transformation here:
7930  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
7931  //   shuffle(concat(v1, v2), undef)
7932  SDValue Op0 = N->getOperand(0);
7933  SDValue Op1 = N->getOperand(1);
7934  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
7935      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
7936      Op0.getNumOperands() != 2 ||
7937      Op1.getNumOperands() != 2)
7938    return SDValue();
7939  SDValue Concat0Op1 = Op0.getOperand(1);
7940  SDValue Concat1Op1 = Op1.getOperand(1);
7941  if (Concat0Op1.getOpcode() != ISD::UNDEF ||
7942      Concat1Op1.getOpcode() != ISD::UNDEF)
7943    return SDValue();
7944  // Skip the transformation if any of the types are illegal.
7945  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7946  EVT VT = N->getValueType(0);
7947  if (!TLI.isTypeLegal(VT) ||
7948      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
7949      !TLI.isTypeLegal(Concat1Op1.getValueType()))
7950    return SDValue();
7951
7952  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
7953                                  Op0.getOperand(0), Op1.getOperand(0));
7954  // Translate the shuffle mask.
7955  SmallVector<int, 16> NewMask;
7956  unsigned NumElts = VT.getVectorNumElements();
7957  unsigned HalfElts = NumElts/2;
7958  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
7959  for (unsigned n = 0; n < NumElts; ++n) {
7960    int MaskElt = SVN->getMaskElt(n);
7961    int NewElt = -1;
7962    if (MaskElt < (int)HalfElts)
7963      NewElt = MaskElt;
7964    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
7965      NewElt = HalfElts + MaskElt - NumElts;
7966    NewMask.push_back(NewElt);
7967  }
7968  return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat,
7969                              DAG.getUNDEF(VT), NewMask.data());
7970}
7971
7972/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
7973/// NEON load/store intrinsics to merge base address updates.
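/// For example (illustrative operands only), a vld1 of a v4i32 followed by an
/// add of 16 to the same pointer can be selected as a single post-incremented
/// "vld1.32 {d16, d17}, [r0]!" once the add is folded into the load here.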
7974static SDValue CombineBaseUpdate(SDNode *N,
7975                                 TargetLowering::DAGCombinerInfo &DCI) {
7976  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
7977    return SDValue();
7978
7979  SelectionDAG &DAG = DCI.DAG;
7980  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
7981                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
7982  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
7983  SDValue Addr = N->getOperand(AddrOpIdx);
7984
7985  // Search for a use of the address operand that is an increment.
7986  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
7987         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
7988    SDNode *User = *UI;
7989    if (User->getOpcode() != ISD::ADD ||
7990        UI.getUse().getResNo() != Addr.getResNo())
7991      continue;
7992
7993    // Check that the add is independent of the load/store.  Otherwise, folding
7994    // it would create a cycle.
7995    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
7996      continue;
7997
7998    // Find the new opcode for the updating load/store.
7999    bool isLoad = true;
8000    bool isLaneOp = false;
8001    unsigned NewOpc = 0;
8002    unsigned NumVecs = 0;
8003    if (isIntrinsic) {
8004      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
8005      switch (IntNo) {
8006      default: llvm_unreachable("unexpected intrinsic for Neon base update");
8007      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
8008        NumVecs = 1; break;
8009      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
8010        NumVecs = 2; break;
8011      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
8012        NumVecs = 3; break;
8013      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
8014        NumVecs = 4; break;
8015      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
8016        NumVecs = 2; isLaneOp = true; break;
8017      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
8018        NumVecs = 3; isLaneOp = true; break;
8019      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
8020        NumVecs = 4; isLaneOp = true; break;
8021      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
8022        NumVecs = 1; isLoad = false; break;
8023      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
8024        NumVecs = 2; isLoad = false; break;
8025      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
8026        NumVecs = 3; isLoad = false; break;
8027      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
8028        NumVecs = 4; isLoad = false; break;
8029      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
8030        NumVecs = 2; isLoad = false; isLaneOp = true; break;
8031      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
8032        NumVecs = 3; isLoad = false; isLaneOp = true; break;
8033      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
8034        NumVecs = 4; isLoad = false; isLaneOp = true; break;
8035      }
8036    } else {
8037      isLaneOp = true;
8038      switch (N->getOpcode()) {
8039      default: llvm_unreachable("unexpected opcode for Neon base update");
8040      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
8041      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
8042      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
8043      }
8044    }
8045
8046    // Find the size of memory referenced by the load/store.
8047    EVT VecTy;
8048    if (isLoad)
8049      VecTy = N->getValueType(0);
8050    else
8051      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
8052    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
8053    if (isLaneOp)
8054      NumBytes /= VecTy.getVectorNumElements();
8055
8056    // If the increment is a constant, it must match the memory ref size.
8057    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
8058    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
8059      uint64_t IncVal = CInc->getZExtValue();
8060      if (IncVal != NumBytes)
8061        continue;
8062    } else if (NumBytes >= 3 * 16) {
8063      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
8064      // separate instructions that make it harder to use a non-constant update.
8065      continue;
8066    }
8067
8068    // Create the new updating load/store node.
8069    EVT Tys[6];
8070    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
8071    unsigned n;
8072    for (n = 0; n < NumResultVecs; ++n)
8073      Tys[n] = VecTy;
8074    Tys[n++] = MVT::i32;
8075    Tys[n] = MVT::Other;
8076    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
8077    SmallVector<SDValue, 8> Ops;
8078    Ops.push_back(N->getOperand(0)); // incoming chain
8079    Ops.push_back(N->getOperand(AddrOpIdx));
8080    Ops.push_back(Inc);
8081    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
8082      Ops.push_back(N->getOperand(i));
8083    }
8084    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
8085    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
8086                                           Ops.data(), Ops.size(),
8087                                           MemInt->getMemoryVT(),
8088                                           MemInt->getMemOperand());
8089
8090    // Update the uses.
8091    std::vector<SDValue> NewResults;
8092    for (unsigned i = 0; i < NumResultVecs; ++i) {
8093      NewResults.push_back(SDValue(UpdN.getNode(), i));
8094    }
8095    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
8096    DCI.CombineTo(N, NewResults);
8097    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
8098
8099    break;
8100  }
8101  return SDValue();
8102}
8103
8104/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
8105/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
8106/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
8107/// return true.
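/// For example, a vld2lane whose two vector results are only used by VDUPLANEs
/// of the loaded lane can instead be emitted as the all-lanes form
/// "vld2.32 {d16[], d17[]}, [r0]" (illustrative operands).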
8108static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
8109  SelectionDAG &DAG = DCI.DAG;
8110  EVT VT = N->getValueType(0);
8111  // vldN-dup instructions only support 64-bit vectors for N > 1.
8112  if (!VT.is64BitVector())
8113    return false;
8114
8115  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
8116  SDNode *VLD = N->getOperand(0).getNode();
8117  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
8118    return false;
8119  unsigned NumVecs = 0;
8120  unsigned NewOpc = 0;
8121  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
8122  if (IntNo == Intrinsic::arm_neon_vld2lane) {
8123    NumVecs = 2;
8124    NewOpc = ARMISD::VLD2DUP;
8125  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
8126    NumVecs = 3;
8127    NewOpc = ARMISD::VLD3DUP;
8128  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
8129    NumVecs = 4;
8130    NewOpc = ARMISD::VLD4DUP;
8131  } else {
8132    return false;
8133  }
8134
8135  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
8136  // numbers match the load.
8137  unsigned VLDLaneNo =
8138    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
8139  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
8140       UI != UE; ++UI) {
8141    // Ignore uses of the chain result.
8142    if (UI.getUse().getResNo() == NumVecs)
8143      continue;
8144    SDNode *User = *UI;
8145    if (User->getOpcode() != ARMISD::VDUPLANE ||
8146        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
8147      return false;
8148  }
8149
8150  // Create the vldN-dup node.
8151  EVT Tys[5];
8152  unsigned n;
8153  for (n = 0; n < NumVecs; ++n)
8154    Tys[n] = VT;
8155  Tys[n] = MVT::Other;
8156  SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
8157  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
8158  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
8159  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
8160                                           Ops, 2, VLDMemInt->getMemoryVT(),
8161                                           VLDMemInt->getMemOperand());
8162
8163  // Update the uses.
8164  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
8165       UI != UE; ++UI) {
8166    unsigned ResNo = UI.getUse().getResNo();
8167    // Ignore uses of the chain result.
8168    if (ResNo == NumVecs)
8169      continue;
8170    SDNode *User = *UI;
8171    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
8172  }
8173
8174  // Now the vldN-lane intrinsic is dead except for its chain result.
8175  // Update uses of the chain.
8176  std::vector<SDValue> VLDDupResults;
8177  for (unsigned n = 0; n < NumVecs; ++n)
8178    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
8179  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
8180  DCI.CombineTo(VLD, VLDDupResults);
8181
8182  return true;
8183}
8184
8185/// PerformVDUPLANECombine - Target-specific dag combine xforms for
8186/// ARMISD::VDUPLANE.
8187static SDValue PerformVDUPLANECombine(SDNode *N,
8188                                      TargetLowering::DAGCombinerInfo &DCI) {
8189  SDValue Op = N->getOperand(0);
8190
8191  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
8192  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
8193  if (CombineVLDDUP(N, DCI))
8194    return SDValue(N, 0);
8195
8196  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
8197  // redundant.  Ignore bit_converts for now; element sizes are checked below.
8198  while (Op.getOpcode() == ISD::BITCAST)
8199    Op = Op.getOperand(0);
8200  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
8201    return SDValue();
8202
8203  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
8204  unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
8205  // The canonical VMOV for a zero vector uses a 32-bit element size.
8206  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
8207  unsigned EltBits;
8208  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
8209    EltSize = 8;
8210  EVT VT = N->getValueType(0);
8211  if (EltSize > VT.getVectorElementType().getSizeInBits())
8212    return SDValue();
8213
8214  return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
8215}
8216
8217// isConstVecPow2 - Return true if each vector element is a power of 2, all
8218// elements are the same constant, C, and Log2(C) ranges from 1 to 32.
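// For example, the splat <float 8.0, float 8.0> qualifies with C == 8
// (Log2(C) == 3); this is exactly the case the VCVT/VDIV combines below use.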
8219static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
8220{
8221  integerPart cN;
8222  integerPart c0 = 0;
8223  for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements();
8224       I != E; I++) {
8225    ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I));
8226    if (!C)
8227      return false;
8228
8229    bool isExact;
8230    APFloat APF = C->getValueAPF();
8231    if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact)
8232        != APFloat::opOK || !isExact)
8233      return false;
8234
8235    c0 = (I == 0) ? cN : c0;
8236    if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32)
8237      return false;
8238  }
8239  C = c0;
8240  return true;
8241}
8242
8243/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
8244/// can replace combinations of VMUL and VCVT (floating-point to integer)
8245/// when the VMUL has a constant operand that is a power of 2.
8246///
8247/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
8248///  vmul.f32        d16, d17, d16
8249///  vcvt.s32.f32    d16, d16
8250/// becomes:
8251///  vcvt.s32.f32    d16, d16, #3
8252static SDValue PerformVCVTCombine(SDNode *N,
8253                                  TargetLowering::DAGCombinerInfo &DCI,
8254                                  const ARMSubtarget *Subtarget) {
8255  SelectionDAG &DAG = DCI.DAG;
8256  SDValue Op = N->getOperand(0);
8257
8258  if (!Subtarget->hasNEON() || !Op.getValueType().isVector() ||
8259      Op.getOpcode() != ISD::FMUL)
8260    return SDValue();
8261
8262  uint64_t C;
8263  SDValue N0 = Op->getOperand(0);
8264  SDValue ConstVec = Op->getOperand(1);
8265  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
8266
8267  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
8268      !isConstVecPow2(ConstVec, isSigned, C))
8269    return SDValue();
8270
8271  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
8272    Intrinsic::arm_neon_vcvtfp2fxu;
8273  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
8274                     N->getValueType(0),
8275                     DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
8276                     DAG.getConstant(Log2_64(C), MVT::i32));
8277}
8278
8279/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
8280/// can replace combinations of VCVT (integer to floating-point) and VDIV
8281/// when the VDIV has a constant operand that is a power of 2.
8282///
8283/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
8284///  vcvt.f32.s32    d16, d16
8285///  vdiv.f32        d16, d17, d16
8286/// becomes:
8287///  vcvt.f32.s32    d16, d16, #3
8288static SDValue PerformVDIVCombine(SDNode *N,
8289                                  TargetLowering::DAGCombinerInfo &DCI,
8290                                  const ARMSubtarget *Subtarget) {
8291  SelectionDAG &DAG = DCI.DAG;
8292  SDValue Op = N->getOperand(0);
8293  unsigned OpOpcode = Op.getNode()->getOpcode();
8294
8295  if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() ||
8296      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
8297    return SDValue();
8298
8299  uint64_t C;
8300  SDValue ConstVec = N->getOperand(1);
8301  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
8302
8303  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
8304      !isConstVecPow2(ConstVec, isSigned, C))
8305    return SDValue();
8306
8307  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
8308    Intrinsic::arm_neon_vcvtfxu2fp;
8309  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
8310                     Op.getValueType(),
8311                     DAG.getConstant(IntrinsicOpcode, MVT::i32),
8312                     Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32));
8313}
8314
8315/// getVShiftImm - Check if this is a valid build_vector for the immediate
8316/// operand of a vector shift operation, where all the elements of the
8317/// build_vector must have the same constant integer value.
8318static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
8319  // Ignore bit_converts.
8320  while (Op.getOpcode() == ISD::BITCAST)
8321    Op = Op.getOperand(0);
8322  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
8323  APInt SplatBits, SplatUndef;
8324  unsigned SplatBitSize;
8325  bool HasAnyUndefs;
8326  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
8327                                      HasAnyUndefs, ElementBits) ||
8328      SplatBitSize > ElementBits)
8329    return false;
8330  Cnt = SplatBits.getSExtValue();
8331  return true;
8332}
8333
8334/// isVShiftLImm - Check if this is a valid build_vector for the immediate
8335/// operand of a vector shift left operation.  That value must be in the range:
8336///   0 <= Value < ElementBits for a left shift; or
8337///   0 <= Value <= ElementBits for a long left shift.
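/// For example, with v8i16 operands the legal immediates are 0..15 for a
/// regular left shift and 0..16 for a long (vshll) shift.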
8338static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
8339  assert(VT.isVector() && "vector shift count is not a vector type");
8340  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
8341  if (! getVShiftImm(Op, ElementBits, Cnt))
8342    return false;
8343  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
8344}
8345
8346/// isVShiftRImm - Check if this is a valid build_vector for the immediate
8347/// operand of a vector shift right operation.  For a shift opcode, the value
8348/// is positive, but for an intrinsic the shift count must be negative. The
8349/// absolute value must be in the range:
8350///   1 <= |Value| <= ElementBits for a right shift; or
8351///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
8352static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
8353                         int64_t &Cnt) {
8354  assert(VT.isVector() && "vector shift count is not a vector type");
8355  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
8356  if (! getVShiftImm(Op, ElementBits, Cnt))
8357    return false;
8358  if (isIntrinsic)
8359    Cnt = -Cnt;
8360  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
8361}
8362
8363/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
8364static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
8365  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
8366  switch (IntNo) {
8367  default:
8368    // Don't do anything for most intrinsics.
8369    break;
8370
8371  // Vector shifts: check for immediate versions and lower them.
8372  // Note: This is done during DAG combining instead of DAG legalizing because
8373  // the build_vectors for 64-bit vector element shift counts are generally
8374  // not legal, and it is hard to see their values after they get legalized to
8375  // loads from a constant pool.
8376  case Intrinsic::arm_neon_vshifts:
8377  case Intrinsic::arm_neon_vshiftu:
8378  case Intrinsic::arm_neon_vshiftls:
8379  case Intrinsic::arm_neon_vshiftlu:
8380  case Intrinsic::arm_neon_vshiftn:
8381  case Intrinsic::arm_neon_vrshifts:
8382  case Intrinsic::arm_neon_vrshiftu:
8383  case Intrinsic::arm_neon_vrshiftn:
8384  case Intrinsic::arm_neon_vqshifts:
8385  case Intrinsic::arm_neon_vqshiftu:
8386  case Intrinsic::arm_neon_vqshiftsu:
8387  case Intrinsic::arm_neon_vqshiftns:
8388  case Intrinsic::arm_neon_vqshiftnu:
8389  case Intrinsic::arm_neon_vqshiftnsu:
8390  case Intrinsic::arm_neon_vqrshiftns:
8391  case Intrinsic::arm_neon_vqrshiftnu:
8392  case Intrinsic::arm_neon_vqrshiftnsu: {
8393    EVT VT = N->getOperand(1).getValueType();
8394    int64_t Cnt;
8395    unsigned VShiftOpc = 0;
8396
8397    switch (IntNo) {
8398    case Intrinsic::arm_neon_vshifts:
8399    case Intrinsic::arm_neon_vshiftu:
8400      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
8401        VShiftOpc = ARMISD::VSHL;
8402        break;
8403      }
8404      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
8405        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
8406                     ARMISD::VSHRs : ARMISD::VSHRu);
8407        break;
8408      }
8409      return SDValue();
8410
8411    case Intrinsic::arm_neon_vshiftls:
8412    case Intrinsic::arm_neon_vshiftlu:
8413      if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
8414        break;
8415      llvm_unreachable("invalid shift count for vshll intrinsic");
8416
8417    case Intrinsic::arm_neon_vrshifts:
8418    case Intrinsic::arm_neon_vrshiftu:
8419      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
8420        break;
8421      return SDValue();
8422
8423    case Intrinsic::arm_neon_vqshifts:
8424    case Intrinsic::arm_neon_vqshiftu:
8425      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
8426        break;
8427      return SDValue();
8428
8429    case Intrinsic::arm_neon_vqshiftsu:
8430      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
8431        break;
8432      llvm_unreachable("invalid shift count for vqshlu intrinsic");
8433
8434    case Intrinsic::arm_neon_vshiftn:
8435    case Intrinsic::arm_neon_vrshiftn:
8436    case Intrinsic::arm_neon_vqshiftns:
8437    case Intrinsic::arm_neon_vqshiftnu:
8438    case Intrinsic::arm_neon_vqshiftnsu:
8439    case Intrinsic::arm_neon_vqrshiftns:
8440    case Intrinsic::arm_neon_vqrshiftnu:
8441    case Intrinsic::arm_neon_vqrshiftnsu:
8442      // Narrowing shifts require an immediate right shift.
8443      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
8444        break;
8445      llvm_unreachable("invalid shift count for narrowing vector shift "
8446                       "intrinsic");
8447
8448    default:
8449      llvm_unreachable("unhandled vector shift");
8450    }
8451
8452    switch (IntNo) {
8453    case Intrinsic::arm_neon_vshifts:
8454    case Intrinsic::arm_neon_vshiftu:
8455      // Opcode already set above.
8456      break;
8457    case Intrinsic::arm_neon_vshiftls:
8458    case Intrinsic::arm_neon_vshiftlu:
8459      if (Cnt == VT.getVectorElementType().getSizeInBits())
8460        VShiftOpc = ARMISD::VSHLLi;
8461      else
8462        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
8463                     ARMISD::VSHLLs : ARMISD::VSHLLu);
8464      break;
8465    case Intrinsic::arm_neon_vshiftn:
8466      VShiftOpc = ARMISD::VSHRN; break;
8467    case Intrinsic::arm_neon_vrshifts:
8468      VShiftOpc = ARMISD::VRSHRs; break;
8469    case Intrinsic::arm_neon_vrshiftu:
8470      VShiftOpc = ARMISD::VRSHRu; break;
8471    case Intrinsic::arm_neon_vrshiftn:
8472      VShiftOpc = ARMISD::VRSHRN; break;
8473    case Intrinsic::arm_neon_vqshifts:
8474      VShiftOpc = ARMISD::VQSHLs; break;
8475    case Intrinsic::arm_neon_vqshiftu:
8476      VShiftOpc = ARMISD::VQSHLu; break;
8477    case Intrinsic::arm_neon_vqshiftsu:
8478      VShiftOpc = ARMISD::VQSHLsu; break;
8479    case Intrinsic::arm_neon_vqshiftns:
8480      VShiftOpc = ARMISD::VQSHRNs; break;
8481    case Intrinsic::arm_neon_vqshiftnu:
8482      VShiftOpc = ARMISD::VQSHRNu; break;
8483    case Intrinsic::arm_neon_vqshiftnsu:
8484      VShiftOpc = ARMISD::VQSHRNsu; break;
8485    case Intrinsic::arm_neon_vqrshiftns:
8486      VShiftOpc = ARMISD::VQRSHRNs; break;
8487    case Intrinsic::arm_neon_vqrshiftnu:
8488      VShiftOpc = ARMISD::VQRSHRNu; break;
8489    case Intrinsic::arm_neon_vqrshiftnsu:
8490      VShiftOpc = ARMISD::VQRSHRNsu; break;
8491    }
8492
8493    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
8494                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
8495  }
8496
8497  case Intrinsic::arm_neon_vshiftins: {
8498    EVT VT = N->getOperand(1).getValueType();
8499    int64_t Cnt;
8500    unsigned VShiftOpc = 0;
8501
8502    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
8503      VShiftOpc = ARMISD::VSLI;
8504    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
8505      VShiftOpc = ARMISD::VSRI;
8506    else {
8507      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
8508    }
8509
8510    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
8511                       N->getOperand(1), N->getOperand(2),
8512                       DAG.getConstant(Cnt, MVT::i32));
8513  }
8514
8515  case Intrinsic::arm_neon_vqrshifts:
8516  case Intrinsic::arm_neon_vqrshiftu:
8517    // No immediate versions of these to check for.
8518    break;
8519  }
8520
8521  return SDValue();
8522}
8523
8524/// PerformShiftCombine - Checks for immediate versions of vector shifts and
8525/// lowers them.  As with the vector shift intrinsics, this is done during DAG
8526/// combining instead of DAG legalizing because the build_vectors for 64-bit
8527/// vector element shift counts are generally not legal, and it is hard to see
8528/// their values after they get legalized to loads from a constant pool.
8529static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
8530                                   const ARMSubtarget *ST) {
8531  EVT VT = N->getValueType(0);
8532  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
8533    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
8534    // 16 bits of x are zero. This optimizes rev + lsr #16 to rev16.
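    // For example:
    //   rev   r0, r0
    //   lsr   r0, r0, #16
    // becomes
    //   rev16 r0, r0
    // when the upper 16 bits of the value being byte-reversed are known zero.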
8535    SDValue N1 = N->getOperand(1);
8536    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
8537      SDValue N0 = N->getOperand(0);
8538      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
8539          DAG.MaskedValueIsZero(N0.getOperand(0),
8540                                APInt::getHighBitsSet(32, 16)))
8541        return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, N0, N1);
8542    }
8543  }
8544
8545  // Nothing to be done for scalar shifts.
8546  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8547  if (!VT.isVector() || !TLI.isTypeLegal(VT))
8548    return SDValue();
8549
8550  assert(ST->hasNEON() && "unexpected vector shift");
8551  int64_t Cnt;
8552
8553  switch (N->getOpcode()) {
8554  default: llvm_unreachable("unexpected shift opcode");
8555
8556  case ISD::SHL:
8557    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
8558      return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
8559                         DAG.getConstant(Cnt, MVT::i32));
8560    break;
8561
8562  case ISD::SRA:
8563  case ISD::SRL:
8564    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
8565      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
8566                            ARMISD::VSHRs : ARMISD::VSHRu);
8567      return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
8568                         DAG.getConstant(Cnt, MVT::i32));
8569    }
8570  }
8571  return SDValue();
8572}
8573
8574/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
8575/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
8576static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
8577                                    const ARMSubtarget *ST) {
8578  SDValue N0 = N->getOperand(0);
8579
8580  // Check for sign- and zero-extensions of vector extract operations of 8-
8581  // and 16-bit vector elements.  NEON supports these directly.  They are
8582  // handled during DAG combining because type legalization will promote them
8583  // to 32-bit types and it is messy to recognize the operations after that.
8584  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
8585    SDValue Vec = N0.getOperand(0);
8586    SDValue Lane = N0.getOperand(1);
8587    EVT VT = N->getValueType(0);
8588    EVT EltVT = N0.getValueType();
8589    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8590
8591    if (VT == MVT::i32 &&
8592        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
8593        TLI.isTypeLegal(Vec.getValueType()) &&
8594        isa<ConstantSDNode>(Lane)) {
8595
8596      unsigned Opc = 0;
8597      switch (N->getOpcode()) {
8598      default: llvm_unreachable("unexpected opcode");
8599      case ISD::SIGN_EXTEND:
8600        Opc = ARMISD::VGETLANEs;
8601        break;
8602      case ISD::ZERO_EXTEND:
8603      case ISD::ANY_EXTEND:
8604        Opc = ARMISD::VGETLANEu;
8605        break;
8606      }
8607      return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
8608    }
8609  }
8610
8611  return SDValue();
8612}
8613
8614/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
8615/// to match f32 max/min patterns to use NEON vmax/vmin instructions.
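/// For example, "a < b ? a : b" can be selected to a single "vmin.f32 d0, d0, d1"
/// (illustrative registers) instead of a compare-and-select sequence, provided the
/// operands are known not to be NaN and, for <= comparisons, unsafe FP math is
/// enabled or one operand is known nonzero (because of signed zeros).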
8616static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
8617                                       const ARMSubtarget *ST) {
8618  // If the target supports NEON, try to use vmax/vmin instructions for f32
8619  // selects like "x < y ? x : y".  Unless the NoNaNsFPMath option is set,
8620  // be careful about NaNs:  NEON's vmax/vmin return NaN if either operand is
8621  // a NaN; only do the transformation when it matches that behavior.
8622
8623  // For now only do this when using NEON for FP operations; if using VFP, it
8624  // is not obvious that the benefit outweighs the cost of switching to the
8625  // NEON pipeline.
8626  if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
8627      N->getValueType(0) != MVT::f32)
8628    return SDValue();
8629
8630  SDValue CondLHS = N->getOperand(0);
8631  SDValue CondRHS = N->getOperand(1);
8632  SDValue LHS = N->getOperand(2);
8633  SDValue RHS = N->getOperand(3);
8634  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
8635
8636  unsigned Opcode = 0;
8637  bool IsReversed;
8638  if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) {
8639    IsReversed = false; // x CC y ? x : y
8640  } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) {
8641    IsReversed = true ; // x CC y ? y : x
8642  } else {
8643    return SDValue();
8644  }
8645
8646  bool IsUnordered;
8647  switch (CC) {
8648  default: break;
8649  case ISD::SETOLT:
8650  case ISD::SETOLE:
8651  case ISD::SETLT:
8652  case ISD::SETLE:
8653  case ISD::SETULT:
8654  case ISD::SETULE:
8655    // If LHS is NaN, an ordered comparison will be false and the result will
8656    // be the RHS, but vmin(NaN, RHS) = NaN.  Avoid this by checking that LHS
8657    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
8658    IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE);
8659    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
8660      break;
8661    // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin
8662    // will return -0, so vmin can only be used for unsafe math or if one of
8663    // the operands is known to be nonzero.
8664    if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) &&
8665        !DAG.getTarget().Options.UnsafeFPMath &&
8666        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
8667      break;
8668    Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
8669    break;
8670
8671  case ISD::SETOGT:
8672  case ISD::SETOGE:
8673  case ISD::SETGT:
8674  case ISD::SETGE:
8675  case ISD::SETUGT:
8676  case ISD::SETUGE:
8677    // If LHS is NaN, an ordered comparison will be false and the result will
8678    // be the RHS, but vmax(NaN, RHS) = NaN.  Avoid this by checking that LHS
8679    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
8680    IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE);
8681    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
8682      break;
8683    // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax
8684    // will return +0, so vmax can only be used for unsafe math or if one of
8685    // the operands is known to be nonzero.
8686    if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) &&
8687        !DAG.getTarget().Options.UnsafeFPMath &&
8688        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
8689      break;
8690    Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
8691    break;
8692  }
8693
8694  if (!Opcode)
8695    return SDValue();
8696  return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS);
8697}
8698
8699/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
8700SDValue
8701ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
8702  SDValue Cmp = N->getOperand(4);
8703  if (Cmp.getOpcode() != ARMISD::CMPZ)
8704    // Only looking at EQ and NE cases.
8705    return SDValue();
8706
8707  EVT VT = N->getValueType(0);
8708  DebugLoc dl = N->getDebugLoc();
8709  SDValue LHS = Cmp.getOperand(0);
8710  SDValue RHS = Cmp.getOperand(1);
8711  SDValue FalseVal = N->getOperand(0);
8712  SDValue TrueVal = N->getOperand(1);
8713  SDValue ARMcc = N->getOperand(2);
8714  ARMCC::CondCodes CC =
8715    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
8716
8717  // Simplify
8718  //   mov     r1, r0
8719  //   cmp     r1, x
8720  //   mov     r0, y
8721  //   moveq   r0, x
8722  // to
8723  //   cmp     r0, x
8724  //   movne   r0, y
8725  //
8726  //   mov     r1, r0
8727  //   cmp     r1, x
8728  //   mov     r0, x
8729  //   movne   r0, y
8730  // to
8731  //   cmp     r0, x
8732  //   movne   r0, y
8733  /// FIXME: Turn this into a target neutral optimization?
8734  SDValue Res;
8735  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
8736    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
8737                      N->getOperand(3), Cmp);
8738  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
8739    SDValue ARMcc;
8740    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
8741    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
8742                      N->getOperand(3), NewCmp);
8743  }
8744
8745  if (Res.getNode()) {
8746    APInt KnownZero, KnownOne;
8747    DAG.ComputeMaskedBits(SDValue(N,0), KnownZero, KnownOne);
8748    // Capture demanded bits information that would be otherwise lost.
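    // For example, KnownZero == 0xfffffffe means only bit 0 of the result can
    // ever be set, so the CMOV result behaves like a zero-extended i1 and we
    // record that fact with an AssertZext.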
8749    if (KnownZero == 0xfffffffe)
8750      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
8751                        DAG.getValueType(MVT::i1));
8752    else if (KnownZero == 0xffffff00)
8753      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
8754                        DAG.getValueType(MVT::i8));
8755    else if (KnownZero == 0xffff0000)
8756      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
8757                        DAG.getValueType(MVT::i16));
8758  }
8759
8760  return Res;
8761}
8762
8763SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
8764                                             DAGCombinerInfo &DCI) const {
8765  switch (N->getOpcode()) {
8766  default: break;
8767  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
8768  case ISD::SUB:        return PerformSUBCombine(N, DCI);
8769  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
8770  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
8771  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
8772  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
8773  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
8774  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
8775  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
8776  case ISD::STORE:      return PerformSTORECombine(N, DCI);
8777  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI);
8778  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
8779  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
8780  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
8781  case ISD::FP_TO_SINT:
8782  case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget);
8783  case ISD::FDIV:       return PerformVDIVCombine(N, DCI, Subtarget);
8784  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
8785  case ISD::SHL:
8786  case ISD::SRA:
8787  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
8788  case ISD::SIGN_EXTEND:
8789  case ISD::ZERO_EXTEND:
8790  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
8791  case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
8792  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
8793  case ARMISD::VLD2DUP:
8794  case ARMISD::VLD3DUP:
8795  case ARMISD::VLD4DUP:
8796    return CombineBaseUpdate(N, DCI);
8797  case ISD::INTRINSIC_VOID:
8798  case ISD::INTRINSIC_W_CHAIN:
8799    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
8800    case Intrinsic::arm_neon_vld1:
8801    case Intrinsic::arm_neon_vld2:
8802    case Intrinsic::arm_neon_vld3:
8803    case Intrinsic::arm_neon_vld4:
8804    case Intrinsic::arm_neon_vld2lane:
8805    case Intrinsic::arm_neon_vld3lane:
8806    case Intrinsic::arm_neon_vld4lane:
8807    case Intrinsic::arm_neon_vst1:
8808    case Intrinsic::arm_neon_vst2:
8809    case Intrinsic::arm_neon_vst3:
8810    case Intrinsic::arm_neon_vst4:
8811    case Intrinsic::arm_neon_vst2lane:
8812    case Intrinsic::arm_neon_vst3lane:
8813    case Intrinsic::arm_neon_vst4lane:
8814      return CombineBaseUpdate(N, DCI);
8815    default: break;
8816    }
8817    break;
8818  }
8819  return SDValue();
8820}
8821
8822bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
8823                                                          EVT VT) const {
8824  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
8825}
8826
8827bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
8828  if (!Subtarget->allowsUnalignedMem())
8829    return false;
8830
8831  switch (VT.getSimpleVT().SimpleTy) {
8832  default:
8833    return false;
8834  case MVT::i8:
8835  case MVT::i16:
8836  case MVT::i32:
8837    return true;
8838  case MVT::f64:
8839    return Subtarget->hasNEON();
8840  // FIXME: VLD1 etc with standard alignment is legal.
8841  }
8842}
8843
8844static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
8845                       unsigned AlignCheck) {
8846  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
8847          (DstAlign == 0 || DstAlign % AlignCheck == 0));
8848}
8849
8850EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
8851                                           unsigned DstAlign, unsigned SrcAlign,
8852                                           bool IsZeroVal,
8853                                           bool MemcpyStrSrc,
8854                                           MachineFunction &MF) const {
8855  const Function *F = MF.getFunction();
8856
8857  // See if we can use NEON instructions for this...
8858  if (IsZeroVal &&
8859      !F->hasFnAttr(Attribute::NoImplicitFloat) &&
8860      Subtarget->hasNEON()) {
8861    if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) {
8862      return MVT::v4i32;
8863    } else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) {
8864      return MVT::v2i32;
8865    }
8866  }
8867
8868  // Lowering to i32/i16 if the size permits.
8869  if (Size >= 4) {
8870    return MVT::i32;
8871  } else if (Size >= 2) {
8872    return MVT::i16;
8873  }
8874
8875  // Let the target-independent logic figure it out.
8876  return MVT::Other;
8877}
8878
8879static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
8880  if (V < 0)
8881    return false;
8882
8883  unsigned Scale = 1;
8884  switch (VT.getSimpleVT().SimpleTy) {
8885  default: return false;
8886  case MVT::i1:
8887  case MVT::i8:
8888    // Scale == 1;
8889    break;
8890  case MVT::i16:
8891    // Scale == 2;
8892    Scale = 2;
8893    break;
8894  case MVT::i32:
8895    // Scale == 4;
8896    Scale = 4;
8897    break;
8898  }
8899
8900  if ((V & (Scale - 1)) != 0)
8901    return false;
8902  V /= Scale;
8903  return V == (V & ((1LL << 5) - 1));
8904}
8905
8906static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
8907                                      const ARMSubtarget *Subtarget) {
8908  bool isNeg = false;
8909  if (V < 0) {
8910    isNeg = true;
8911    V = - V;
8912  }
8913
8914  switch (VT.getSimpleVT().SimpleTy) {
8915  default: return false;
8916  case MVT::i1:
8917  case MVT::i8:
8918  case MVT::i16:
8919  case MVT::i32:
8920    // + imm12 or - imm8
8921    if (isNeg)
8922      return V == (V & ((1LL << 8) - 1));
8923    return V == (V & ((1LL << 12) - 1));
8924  case MVT::f32:
8925  case MVT::f64:
8926    // Same as ARM mode. FIXME: NEON?
8927    if (!Subtarget->hasVFP2())
8928      return false;
8929    if ((V & 3) != 0)
8930      return false;
8931    V >>= 2;
8932    return V == (V & ((1LL << 8) - 1));
8933  }
8934}
8935
8936/// isLegalAddressImmediate - Return true if the integer value can be used
8937/// as the offset of the target addressing mode for load / store of the
8938/// given type.
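/// For example, in ARM mode an i32 load/store can fold an offset in the range
/// [-4095, 4095] (addressing mode 2), while an i16 access is limited to
/// [-255, 255] (addressing mode 3), as the switch below encodes.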
8939static bool isLegalAddressImmediate(int64_t V, EVT VT,
8940                                    const ARMSubtarget *Subtarget) {
8941  if (V == 0)
8942    return true;
8943
8944  if (!VT.isSimple())
8945    return false;
8946
8947  if (Subtarget->isThumb1Only())
8948    return isLegalT1AddressImmediate(V, VT);
8949  else if (Subtarget->isThumb2())
8950    return isLegalT2AddressImmediate(V, VT, Subtarget);
8951
8952  // ARM mode.
8953  if (V < 0)
8954    V = - V;
8955  switch (VT.getSimpleVT().SimpleTy) {
8956  default: return false;
8957  case MVT::i1:
8958  case MVT::i8:
8959  case MVT::i32:
8960    // +- imm12
8961    return V == (V & ((1LL << 12) - 1));
8962  case MVT::i16:
8963    // +- imm8
8964    return V == (V & ((1LL << 8) - 1));
8965  case MVT::f32:
8966  case MVT::f64:
8967    if (!Subtarget->hasVFP2()) // FIXME: NEON?
8968      return false;
8969    if ((V & 3) != 0)
8970      return false;
8971    V >>= 2;
8972    return V == (V & ((1LL << 8) - 1));
8973  }
8974}
8975
8976bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
8977                                                      EVT VT) const {
8978  int Scale = AM.Scale;
8979  if (Scale < 0)
8980    return false;
8981
8982  switch (VT.getSimpleVT().SimpleTy) {
8983  default: return false;
8984  case MVT::i1:
8985  case MVT::i8:
8986  case MVT::i16:
8987  case MVT::i32:
8988    if (Scale == 1)
8989      return true;
8990    // r + r << imm
8991    Scale = Scale & ~1;
8992    return Scale == 2 || Scale == 4 || Scale == 8;
8993  case MVT::i64:
8994    // r + r
8995    if (((unsigned)AM.HasBaseReg + Scale) <= 2)
8996      return true;
8997    return false;
8998  case MVT::isVoid:
8999    // Note, we allow "void" uses (basically, uses that aren't loads or
9000    // stores), because ARM allows folding a scale into many arithmetic
9001    // operations.  This should be made more precise and revisited later.
9002
9003    // Allow r << imm, but the imm has to be a multiple of two.
9004    if (Scale & 1) return false;
9005    return isPowerOf2_32(Scale);
9006  }
9007}
9008
9009/// isLegalAddressingMode - Return true if the addressing mode represented
9010/// by AM is legal for this target, for a load/store of the specified type.
9011bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
9012                                              Type *Ty) const {
9013  EVT VT = getValueType(Ty, true);
9014  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
9015    return false;
9016
9017  // Can never fold addr of global into load/store.
9018  if (AM.BaseGV)
9019    return false;
9020
9021  switch (AM.Scale) {
9022  case 0:  // no scale reg, must be "r+i" or "r", or "i".
9023    break;
9024  case 1:
9025    if (Subtarget->isThumb1Only())
9026      return false;
9027    // FALL THROUGH.
9028  default:
9029    // ARM doesn't support any R+R*scale+imm addr modes.
9030    if (AM.BaseOffs)
9031      return false;
9032
9033    if (!VT.isSimple())
9034      return false;
9035
9036    if (Subtarget->isThumb2())
9037      return isLegalT2ScaledAddressingMode(AM, VT);
9038
9039    int Scale = AM.Scale;
9040    switch (VT.getSimpleVT().SimpleTy) {
9041    default: return false;
9042    case MVT::i1:
9043    case MVT::i8:
9044    case MVT::i32:
9045      if (Scale < 0) Scale = -Scale;
9046      if (Scale == 1)
9047        return true;
9048      // r + r << imm
9049      return isPowerOf2_32(Scale & ~1);
9050    case MVT::i16:
9051    case MVT::i64:
9052      // r + r
9053      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
9054        return true;
9055      return false;
9056
9057    case MVT::isVoid:
9058      // Note, we allow "void" uses (basically, uses that aren't loads or
9059      // stores), because ARM allows folding a scale into many arithmetic
9060      // operations.  This should be made more precise and revisited later.
9061
9062      // Allow r << imm, but the imm has to be a multiple of two.
9063      if (Scale & 1) return false;
9064      return isPowerOf2_32(Scale);
9065    }
9066  }
9067  return true;
9068}
9069
9070/// isLegalICmpImmediate - Return true if the specified immediate is a legal
9071/// icmp immediate, that is, the target has icmp instructions which can compare
9072/// a register against the immediate without having to materialize the
9073/// immediate into a register.
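/// For example, "cmp r0, #-10" is not encodable, but ARM and Thumb2 can use
/// "cmn r0, #10" instead, so -10 is a legal icmp immediate there; Thumb1 is
/// limited to immediates in [0, 255].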
9074bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
9075  // Thumb2 and ARM modes can use cmn for negative immediates.
9076  if (!Subtarget->isThumb())
9077    return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1;
9078  if (Subtarget->isThumb2())
9079    return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1;
9080  // Thumb1 doesn't have cmn and supports only 8-bit immediates.
9081  return Imm >= 0 && Imm <= 255;
9082}
9083
9084/// isLegalAddImmediate - Return true if the specified immediate is a legal add
9085/// *or sub* immediate, that is, the target has add or sub instructions which can
9086/// apply the immediate to a register without having to materialize the
9087/// immediate in a register.
9088bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
9089  // Same encoding for add/sub, just flip the sign.
9090  int64_t AbsImm = llvm::abs64(Imm);
9091  if (!Subtarget->isThumb())
9092    return ARM_AM::getSOImmVal(AbsImm) != -1;
9093  if (Subtarget->isThumb2())
9094    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
9095  // Thumb1 only has 8-bit unsigned immediate.
9096  return AbsImm >= 0 && AbsImm <= 255;
9097}
9098
9099static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
9100                                      bool isSEXTLoad, SDValue &Base,
9101                                      SDValue &Offset, bool &isInc,
9102                                      SelectionDAG &DAG) {
9103  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
9104    return false;
9105
9106  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
9107    // AddressingMode 3
9108    Base = Ptr->getOperand(0);
9109    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
9110      int RHSC = (int)RHS->getZExtValue();
9111      if (RHSC < 0 && RHSC > -256) {
9112        assert(Ptr->getOpcode() == ISD::ADD);
9113        isInc = false;
9114        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
9115        return true;
9116      }
9117    }
9118    isInc = (Ptr->getOpcode() == ISD::ADD);
9119    Offset = Ptr->getOperand(1);
9120    return true;
9121  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
9122    // AddressingMode 2
9123    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
9124      int RHSC = (int)RHS->getZExtValue();
9125      if (RHSC < 0 && RHSC > -0x1000) {
9126        assert(Ptr->getOpcode() == ISD::ADD);
9127        isInc = false;
9128        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
9129        Base = Ptr->getOperand(0);
9130        return true;
9131      }
9132    }
9133
9134    if (Ptr->getOpcode() == ISD::ADD) {
9135      isInc = true;
9136      ARM_AM::ShiftOpc ShOpcVal=
9137        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
9138      if (ShOpcVal != ARM_AM::no_shift) {
9139        Base = Ptr->getOperand(1);
9140        Offset = Ptr->getOperand(0);
9141      } else {
9142        Base = Ptr->getOperand(0);
9143        Offset = Ptr->getOperand(1);
9144      }
9145      return true;
9146    }
9147
9148    isInc = (Ptr->getOpcode() == ISD::ADD);
9149    Base = Ptr->getOperand(0);
9150    Offset = Ptr->getOperand(1);
9151    return true;
9152  }
9153
9154  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
9155  return false;
9156}
9157
9158static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
9159                                     bool isSEXTLoad, SDValue &Base,
9160                                     SDValue &Offset, bool &isInc,
9161                                     SelectionDAG &DAG) {
9162  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
9163    return false;
9164
9165  Base = Ptr->getOperand(0);
9166  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
9167    int RHSC = (int)RHS->getZExtValue();
9168    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
9169      assert(Ptr->getOpcode() == ISD::ADD);
9170      isInc = false;
9171      Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
9172      return true;
9173    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
9174      isInc = Ptr->getOpcode() == ISD::ADD;
9175      Offset = DAG.getConstant(RHSC, RHS->getValueType(0));
9176      return true;
9177    }
9178  }
9179
9180  return false;
9181}
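// Thumb2 indexed loads and stores only take an 8-bit immediate offset (for
// example "ldr r0, [r1, #-8]!" or "ldr r0, [r1], #4"), and the register-offset
// form has no writeback, which is why only non-zero constants with
// |RHSC| < 0x100 are accepted here.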
9182
9183/// getPreIndexedAddressParts - Returns true by value; returns the base
9184/// pointer, the offset, and the addressing mode by reference if the node's
9185/// address can be legally represented as a pre-indexed load / store address.
9186bool
9187ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
9188                                             SDValue &Offset,
9189                                             ISD::MemIndexedMode &AM,
9190                                             SelectionDAG &DAG) const {
9191  if (Subtarget->isThumb1Only())
9192    return false;
9193
9194  EVT VT;
9195  SDValue Ptr;
9196  bool isSEXTLoad = false;
9197  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9198    Ptr = LD->getBasePtr();
9199    VT  = LD->getMemoryVT();
9200    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
9201  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
9202    Ptr = ST->getBasePtr();
9203    VT  = ST->getMemoryVT();
9204  } else
9205    return false;
9206
9207  bool isInc;
9208  bool isLegal = false;
9209  if (Subtarget->isThumb2())
9210    isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
9211                                       Offset, isInc, DAG);
9212  else
9213    isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
9214                                        Offset, isInc, DAG);
9215  if (!isLegal)
9216    return false;
9217
9218  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
9219  return true;
9220}
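// A source-level illustration (a sketch only; the name and the exact codegen
// are not taken from this file and depend on how the DAG is formed): walking a
// pointer with pre-increment gives the combiner a chance, via the hook above,
// to fold the address update into the access as "ldr r3, [r1, #4]!" rather
// than emitting a separate add.
static int sumWithPreIncrementSketch(const int *P, int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += *++P; // load from P+1 while keeping the updated pointer live
  return Sum;
}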
9221
9222/// getPostIndexedAddressParts - Returns true by value; returns the base
9223/// pointer, the offset, and the addressing mode by reference if this node can
9224/// be combined with a load / store to form a post-indexed load / store.
9225bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
9226                                                   SDValue &Base,
9227                                                   SDValue &Offset,
9228                                                   ISD::MemIndexedMode &AM,
9229                                                   SelectionDAG &DAG) const {
9230  if (Subtarget->isThumb1Only())
9231    return false;
9232
9233  EVT VT;
9234  SDValue Ptr;
9235  bool isSEXTLoad = false;
9236  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9237    VT  = LD->getMemoryVT();
9238    Ptr = LD->getBasePtr();
9239    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
9240  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
9241    VT  = ST->getMemoryVT();
9242    Ptr = ST->getBasePtr();
9243  } else
9244    return false;
9245
9246  bool isInc;
9247  bool isLegal = false;
9248  if (Subtarget->isThumb2())
9249    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
9250                                       isInc, DAG);
9251  else
9252    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
9253                                        isInc, DAG);
9254  if (!isLegal)
9255    return false;
9256
9257  if (Ptr != Base) {
9258    // Swap the base ptr and offset to catch more post-indexed loads / stores
9259    // when it's legal. In Thumb2 mode, the offset must be an immediate.
9260    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
9261        !Subtarget->isThumb2())
9262      std::swap(Base, Offset);
9263
9264    // Post-indexed loads / stores update the base pointer.
9265    if (Ptr != Base)
9266      return false;
9267  }
9268
9269  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
9270  return true;
9271}
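// The post-indexed counterpart at the source level: a load followed by a
// pointer bump is the natural candidate for "ldr r3, [r1], #4". Again this is
// only an illustrative sketch; whether the fold fires depends on the DAG.
static int sumWithPostIncrementSketch(const int *P, int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += *P++; // load from P, then advance P by one element (4 bytes)
  return Sum;
}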
9272
9273void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
9274                                                       APInt &KnownZero,
9275                                                       APInt &KnownOne,
9276                                                       const SelectionDAG &DAG,
9277                                                       unsigned Depth) const {
9278  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0);
9279  switch (Op.getOpcode()) {
9280  default: break;
9281  case ARMISD::CMOV: {
9282    // Bits are known zero/one if known on the LHS and RHS.
9283    DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
9284    if (KnownZero == 0 && KnownOne == 0) return;
9285
9286    APInt KnownZeroRHS, KnownOneRHS;
9287    DAG.ComputeMaskedBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
9288    KnownZero &= KnownZeroRHS;
9289    KnownOne  &= KnownOneRHS;
9290    return;
9291  }
9292  }
9293}
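// The CMOV case above is a per-bit intersection: the result is one of two
// values, so a bit is known only if both operands agree on it. A 32-bit
// sketch of the same rule using plain masks instead of APInt (the names here
// are illustrative):
struct KnownBitsSketch {
  uint32_t Zero; // bits known to be 0
  uint32_t One;  // bits known to be 1
};
static KnownBitsSketch knownBitsForSelectSketch(KnownBitsSketch A,
                                                KnownBitsSketch B) {
  KnownBitsSketch R;
  R.Zero = A.Zero & B.Zero;
  R.One  = A.One & B.One;
  return R;
}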
9294
9295//===----------------------------------------------------------------------===//
9296//                           ARM Inline Assembly Support
9297//===----------------------------------------------------------------------===//
9298
9299bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
9300  // Looking for "rev", which is only available on V6 and above.
9301  if (!Subtarget->hasV6Ops())
9302    return false;
9303
9304  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
9305  std::string AsmStr = IA->getAsmString();
9306  SmallVector<StringRef, 4> AsmPieces;
9307  SplitString(AsmStr, AsmPieces, ";\n");
9308
9309  switch (AsmPieces.size()) {
9310  default: return false;
9311  case 1:
9312    AsmStr = AsmPieces[0];
9313    AsmPieces.clear();
9314    SplitString(AsmStr, AsmPieces, " \t,");
9315
9316    // rev $0, $1
9317    if (AsmPieces.size() == 3 &&
9318        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
9319        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
9320      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
9321      if (Ty && Ty->getBitWidth() == 32)
9322        return IntrinsicLowering::LowerToByteSwap(CI);
9323    }
9324    break;
9325  }
9326
9327  return false;
9328}
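// What this recognizes at the source level is a single-instruction "rev" asm
// on an i32 with low-register ("l") constraints, i.e. a hand-written byte
// swap. A sketch of such user code (kept in a comment since it only compiles
// for ARM targets; the function name is illustrative):
//
//   static unsigned bswapViaRevSketch(unsigned X) {
//     unsigned R;
//     asm("rev %0, %1" : "=l"(R) : "l"(X)); // constraint string "=l,l"
//     return R;
//   }
//
// ExpandInlineAsm rewrites that asm into a call to llvm.bswap.i32, which can
// then be selected as a plain REV without the inline-asm barrier.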
9329
9330/// getConstraintType - Given a constraint letter, return the type of
9331/// constraint it is for this target.
9332ARMTargetLowering::ConstraintType
9333ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
9334  if (Constraint.size() == 1) {
9335    switch (Constraint[0]) {
9336    default:  break;
9337    case 'l': return C_RegisterClass;
9338    case 'w': return C_RegisterClass;
9339    case 'h': return C_RegisterClass;
9340    case 'x': return C_RegisterClass;
9341    case 't': return C_RegisterClass;
9342    case 'j': return C_Other; // Constant for movw.
9343      // An address with a single base register. Due to the way we
9344      // currently handle addresses it is the same as an 'r' memory constraint.
9345    case 'Q': return C_Memory;
9346    }
9347  } else if (Constraint.size() == 2) {
9348    switch (Constraint[0]) {
9349    default: break;
9350    // All 'U+' constraints are addresses.
9351    case 'U': return C_Memory;
9352    }
9353  }
9354  return TargetLowering::getConstraintType(Constraint);
9355}
9356
9357/// Examine constraint type and operand type and determine a weight value.
9358/// This object must already have been set up with the operand type
9359/// and the current alternative constraint selected.
9360TargetLowering::ConstraintWeight
9361ARMTargetLowering::getSingleConstraintMatchWeight(
9362    AsmOperandInfo &info, const char *constraint) const {
9363  ConstraintWeight weight = CW_Invalid;
9364  Value *CallOperandVal = info.CallOperandVal;
9365  // If we don't have a value, we can't do a match,
9366  // but allow it at the lowest weight.
9367  if (CallOperandVal == NULL)
9368    return CW_Default;
9369  Type *type = CallOperandVal->getType();
9370  // Look at the constraint type.
9371  switch (*constraint) {
9372  default:
9373    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
9374    break;
9375  case 'l':
9376    if (type->isIntegerTy()) {
9377      if (Subtarget->isThumb())
9378        weight = CW_SpecificReg;
9379      else
9380        weight = CW_Register;
9381    }
9382    break;
9383  case 'w':
9384    if (type->isFloatingPointTy())
9385      weight = CW_Register;
9386    break;
9387  }
9388  return weight;
9389}
9390
9391typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
9392RCPair
9393ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
9394                                                EVT VT) const {
9395  if (Constraint.size() == 1) {
9396    // GCC ARM Constraint Letters
9397    switch (Constraint[0]) {
9398    case 'l': // Low regs or general regs.
9399      if (Subtarget->isThumb())
9400        return RCPair(0U, &ARM::tGPRRegClass);
9401      return RCPair(0U, &ARM::GPRRegClass);
9402    case 'h': // High regs or no regs.
9403      if (Subtarget->isThumb())
9404        return RCPair(0U, &ARM::hGPRRegClass);
9405      break;
9406    case 'r':
9407      return RCPair(0U, &ARM::GPRRegClass);
9408    case 'w':
9409      if (VT == MVT::f32)
9410        return RCPair(0U, &ARM::SPRRegClass);
9411      if (VT.getSizeInBits() == 64)
9412        return RCPair(0U, &ARM::DPRRegClass);
9413      if (VT.getSizeInBits() == 128)
9414        return RCPair(0U, &ARM::QPRRegClass);
9415      break;
9416    case 'x':
9417      if (VT == MVT::f32)
9418        return RCPair(0U, &ARM::SPR_8RegClass);
9419      if (VT.getSizeInBits() == 64)
9420        return RCPair(0U, &ARM::DPR_8RegClass);
9421      if (VT.getSizeInBits() == 128)
9422        return RCPair(0U, &ARM::QPR_8RegClass);
9423      break;
9424    case 't':
9425      if (VT == MVT::f32)
9426        return RCPair(0U, &ARM::SPRRegClass);
9427      break;
9428    }
9429  }
9430  if (StringRef("{cc}").equals_lower(Constraint))
9431    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
9432
9433  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
9434}
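// Example of how these mappings are used (a sketch of user-level inline asm,
// not code from this file): with f32 operands the 'w' constraint selects the
// SPR class, so the operands below are printed as s-registers; 't' behaves
// the same for f32, while 'x' restricts the allocation to the low VFP/NEON
// registers.
//
//   float R;
//   asm("vmul.f32 %0, %1, %2" : "=w"(R) : "w"(A), "w"(B));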
9435
9436/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
9437/// vector.  If it is invalid, don't add anything to Ops.
9438void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
9439                                                     std::string &Constraint,
9440                                                     std::vector<SDValue>&Ops,
9441                                                     SelectionDAG &DAG) const {
9442  SDValue Result(0, 0);
9443
9444  // Currently only support length 1 constraints.
9445  if (Constraint.length() != 1) return;
9446
9447  char ConstraintLetter = Constraint[0];
9448  switch (ConstraintLetter) {
9449  default: break;
9450  case 'j':
9451  case 'I': case 'J': case 'K': case 'L':
9452  case 'M': case 'N': case 'O':
9453    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
9454    if (!C)
9455      return;
9456
9457    int64_t CVal64 = C->getSExtValue();
9458    int CVal = (int) CVal64;
9459    // None of these constraints allow values larger than 32 bits.  Check
9460    // that the value fits in an int.
9461    if (CVal != CVal64)
9462      return;
9463
9464    switch (ConstraintLetter) {
9465      case 'j':
9466        // Constant suitable for movw, must be between 0 and
9467        // 65535.
9468        if (Subtarget->hasV6T2Ops())
9469          if (CVal >= 0 && CVal <= 65535)
9470            break;
9471        return;
9472      case 'I':
9473        if (Subtarget->isThumb1Only()) {
9474          // This must be a constant between 0 and 255, for ADD
9475          // immediates.
9476          if (CVal >= 0 && CVal <= 255)
9477            break;
9478        } else if (Subtarget->isThumb2()) {
9479          // A constant that can be used as an immediate value in a
9480          // data-processing instruction.
9481          if (ARM_AM::getT2SOImmVal(CVal) != -1)
9482            break;
9483        } else {
9484          // A constant that can be used as an immediate value in a
9485          // data-processing instruction.
9486          if (ARM_AM::getSOImmVal(CVal) != -1)
9487            break;
9488        }
9489        return;
9490
9491      case 'J':
9492        if (Subtarget->isThumb()) {  // FIXME thumb2
9493          // This must be a constant between -255 and -1, for negated ADD
9494          // immediates. This can be used in GCC with an "n" modifier that
9495          // prints the negated value, for use with SUB instructions. It is
9496          // not useful otherwise but is implemented for compatibility.
9497          if (CVal >= -255 && CVal <= -1)
9498            break;
9499        } else {
9500          // This must be a constant between -4095 and 4095. It is not clear
9501          // what this constraint is intended for. Implemented for
9502          // compatibility with GCC.
9503          if (CVal >= -4095 && CVal <= 4095)
9504            break;
9505        }
9506        return;
9507
9508      case 'K':
9509        if (Subtarget->isThumb1Only()) {
9510          // A 32-bit value where only one byte has a nonzero value. Exclude
9511          // zero to match GCC. This constraint is used by GCC internally for
9512          // constants that can be loaded with a move/shift combination.
9513          // It is not useful otherwise but is implemented for compatibility.
9514          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
9515            break;
9516        } else if (Subtarget->isThumb2()) {
9517          // A constant whose bitwise inverse can be used as an immediate
9518          // value in a data-processing instruction. This can be used in GCC
9519          // with a "B" modifier that prints the inverted value, for use with
9520          // BIC and MVN instructions. It is not useful otherwise but is
9521          // implemented for compatibility.
9522          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
9523            break;
9524        } else {
9525          // A constant whose bitwise inverse can be used as an immediate
9526          // value in a data-processing instruction. This can be used in GCC
9527          // with a "B" modifier that prints the inverted value, for use with
9528          // BIC and MVN instructions. It is not useful otherwise but is
9529          // implemented for compatibility.
9530          if (ARM_AM::getSOImmVal(~CVal) != -1)
9531            break;
9532        }
9533        return;
9534
9535      case 'L':
9536        if (Subtarget->isThumb1Only()) {
9537          // This must be a constant between -7 and 7,
9538          // for 3-operand ADD/SUB immediate instructions.
9539          if (CVal >= -7 && CVal <= 7)
9540            break;
9541        } else if (Subtarget->isThumb2()) {
9542          // A constant whose negation can be used as an immediate value in a
9543          // data-processing instruction. This can be used in GCC with an "n"
9544          // modifier that prints the negated value, for use with SUB
9545          // instructions. It is not useful otherwise but is implemented for
9546          // compatibility.
9547          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
9548            break;
9549        } else {
9550          // A constant whose negation can be used as an immediate value in a
9551          // data-processing instruction. This can be used in GCC with an "n"
9552          // modifier that prints the negated value, for use with SUB
9553          // instructions. It is not useful otherwise but is implemented for
9554          // compatibility.
9555          if (ARM_AM::getSOImmVal(-CVal) != -1)
9556            break;
9557        }
9558        return;
9559
9560      case 'M':
9561        if (Subtarget->isThumb()) { // FIXME thumb2
9562          // This must be a multiple of 4 between 0 and 1020, for
9563          // ADD sp + immediate.
9564          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
9565            break;
9566        } else {
9567          // A power of two or a constant between 0 and 32.  This is used in
9568          // GCC for the shift amount on shifted register operands, but it is
9569          // useful in general for any shift amounts.
9570          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
9571            break;
9572        }
9573        return;
9574
9575      case 'N':
9576        if (Subtarget->isThumb()) {  // FIXME thumb2
9577          // This must be a constant between 0 and 31, for shift amounts.
9578          if (CVal >= 0 && CVal <= 31)
9579            break;
9580        }
9581        return;
9582
9583      case 'O':
9584        if (Subtarget->isThumb()) {  // FIXME thumb2
9585          // This must be a multiple of 4 between -508 and 508, for
9586          // ADD/SUB sp = sp + immediate.
9587          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
9588            break;
9589        }
9590        return;
9591    }
9592    Result = DAG.getTargetConstant(CVal, Op.getValueType());
9593    break;
9594  }
9595
9596  if (Result.getNode()) {
9597    Ops.push_back(Result);
9598    return;
9599  }
9600  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
9601}
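// A sketch of how the immediate constraints above surface in user code (the
// asm is illustrative and assumes an ARM or Thumb2 target; the ARM asm
// printer emits the constant with its '#' prefix):
//
//   asm("add %0, %1, %2" : "=r"(R) : "r"(X), "I"(255)); // 'I': data-processing imm
//
// If the constant does not satisfy its constraint (say "I"(257) in ARM mode,
// since 257 is not a modified immediate), nothing is pushed onto Ops above
// and an "invalid operand for inline asm constraint" error is reported
// instead of silently accepting the operand.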
9602
9603bool
9604ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
9605  // The ARM target isn't yet aware of offsets.
9606  return false;
9607}
9608
9609bool ARM::isBitFieldInvertedMask(unsigned v) {
9610  if (v == 0xffffffff)
9611    return false;
9612  // There can be 1's on either or both "outsides"; all the "inside"
9613  // bits must be 0's.
9614  unsigned int lsb = 0, msb = 31;
9615  while (v & (1u << msb)) --msb;
9616  while (v & (1u << lsb)) ++lsb;
9617  for (unsigned int i = lsb; i <= msb; ++i) {
9618    if (v & (1u << i))
9619      return false;
9620  }
9621  return true;
9622}
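// Equivalently, v is a bit-field inverted mask when ~v is a single contiguous
// run of ones, which is exactly the field shape BFC/BFI can clear or insert.
// A compact standalone sketch of (what should be) the same predicate, with an
// illustrative name not used elsewhere in this file:
static bool isBitFieldInvertedMaskSketch(uint32_t V) {
  uint32_t M = ~V; // the bits that would form the cleared / inserted field
  return M != 0 && ((((M | (M - 1)) + 1) & M) == 0);
}
// For example 0xFF0000FF qualifies (bits 8-23 form the field), while
// 0xFF00FF00 does not (its zero bits form two separate runs).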
9623
9624/// isFPImmLegal - Returns true if the target can instruction select the
9625/// specified FP immediate natively. If false, the legalizer will
9626/// materialize the FP immediate as a load from a constant pool.
9627bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
9628  if (!Subtarget->hasVFP3())
9629    return false;
9630  if (VT == MVT::f32)
9631    return ARM_AM::getFP32Imm(Imm) != -1;
9632  if (VT == MVT::f64)
9633    return ARM_AM::getFP64Imm(Imm) != -1;
9634  return false;
9635}
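// VFPv3 vmov (immediate) can only encode values of the form +/- n * 2^-r with
// 16 <= n <= 31 and 0 <= r <= 7, i.e. magnitudes from 0.125 to 31.0 on a
// coarse grid. A brute-force standalone sketch of that value set (illustrative
// only; the real encoders are ARM_AM::getFP32Imm / getFP64Imm used above):
static bool isVFP3ImmValueSketch(double D) {
  double A = D < 0 ? -D : D;
  for (int R = 0; R <= 7; ++R)
    for (int N = 16; N <= 31; ++N)
      if (A == double(N) / double(1 << R))
        return true;
  return false;
}
// So 1.0, 0.5, 3.0 and 31.0 can be materialized with a single vmov.f32/.f64,
// while 0.1 or 33.0 fall back to a constant-pool load.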
9636
9637/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
9638/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
9639/// specified in the intrinsic calls.
9640bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
9641                                           const CallInst &I,
9642                                           unsigned Intrinsic) const {
9643  switch (Intrinsic) {
9644  case Intrinsic::arm_neon_vld1:
9645  case Intrinsic::arm_neon_vld2:
9646  case Intrinsic::arm_neon_vld3:
9647  case Intrinsic::arm_neon_vld4:
9648  case Intrinsic::arm_neon_vld2lane:
9649  case Intrinsic::arm_neon_vld3lane:
9650  case Intrinsic::arm_neon_vld4lane: {
9651    Info.opc = ISD::INTRINSIC_W_CHAIN;
9652    // Conservatively set memVT to the entire set of vectors loaded.
9653    uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8;
9654    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
9655    Info.ptrVal = I.getArgOperand(0);
9656    Info.offset = 0;
9657    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
9658    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
9659    Info.vol = false; // volatile loads with NEON intrinsics not supported
9660    Info.readMem = true;
9661    Info.writeMem = false;
9662    return true;
9663  }
9664  case Intrinsic::arm_neon_vst1:
9665  case Intrinsic::arm_neon_vst2:
9666  case Intrinsic::arm_neon_vst3:
9667  case Intrinsic::arm_neon_vst4:
9668  case Intrinsic::arm_neon_vst2lane:
9669  case Intrinsic::arm_neon_vst3lane:
9670  case Intrinsic::arm_neon_vst4lane: {
9671    Info.opc = ISD::INTRINSIC_VOID;
9672    // Conservatively set memVT to the entire set of vectors stored.
9673    unsigned NumElts = 0;
9674    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
9675      Type *ArgTy = I.getArgOperand(ArgI)->getType();
9676      if (!ArgTy->isVectorTy())
9677        break;
9678      NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8;
9679    }
9680    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
9681    Info.ptrVal = I.getArgOperand(0);
9682    Info.offset = 0;
9683    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
9684    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
9685    Info.vol = false; // volatile stores with NEON intrinsics not supported
9686    Info.readMem = false;
9687    Info.writeMem = true;
9688    return true;
9689  }
9690  case Intrinsic::arm_strexd: {
9691    Info.opc = ISD::INTRINSIC_W_CHAIN;
9692    Info.memVT = MVT::i64;
9693    Info.ptrVal = I.getArgOperand(2);
9694    Info.offset = 0;
9695    Info.align = 8;
9696    Info.vol = true;
9697    Info.readMem = false;
9698    Info.writeMem = true;
9699    return true;
9700  }
9701  case Intrinsic::arm_ldrexd: {
9702    Info.opc = ISD::INTRINSIC_W_CHAIN;
9703    Info.memVT = MVT::i64;
9704    Info.ptrVal = I.getArgOperand(0);
9705    Info.offset = 0;
9706    Info.align = 8;
9707    Info.vol = true;
9708    Info.readMem = true;
9709    Info.writeMem = false;
9710    return true;
9711  }
9712  default:
9713    break;
9714  }
9715
9716  return false;
9717}
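// Worked example for the conservative memVT computation above (the IR below is
// a sketch of a typical NEON load intrinsic of this era, not taken from a
// test): for
//
//   %vld2 = call { <4 x i32>, <4 x i32> }
//             @llvm.arm.neon.vld2.v4i32(i8* %p, i32 16)
//
// the result type occupies 32 bytes, so NumElts = 32 / 8 = 4 and memVT becomes
// v4i64, ptrVal is %p, and align is taken from the trailing argument (16).
// Only the total size and alignment matter here; the element type of the
// vectors being loaded is deliberately ignored.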
9718