ARMISelLowering.cpp revision 5af60ce2a8d4dc820664c9dc5fbbcff428402c15
1//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that ARM uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "arm-isel"
16#include "ARM.h"
17#include "ARMAddressingModes.h"
18#include "ARMCallingConv.h"
19#include "ARMConstantPoolValue.h"
20#include "ARMISelLowering.h"
21#include "ARMMachineFunctionInfo.h"
22#include "ARMPerfectShuffle.h"
23#include "ARMRegisterInfo.h"
24#include "ARMSubtarget.h"
25#include "ARMTargetMachine.h"
26#include "ARMTargetObjectFile.h"
27#include "llvm/CallingConv.h"
28#include "llvm/Constants.h"
29#include "llvm/Function.h"
30#include "llvm/GlobalValue.h"
31#include "llvm/Instruction.h"
32#include "llvm/Instructions.h"
33#include "llvm/Intrinsics.h"
34#include "llvm/Type.h"
35#include "llvm/CodeGen/CallingConvLower.h"
36#include "llvm/CodeGen/IntrinsicLowering.h"
37#include "llvm/CodeGen/MachineBasicBlock.h"
38#include "llvm/CodeGen/MachineFrameInfo.h"
39#include "llvm/CodeGen/MachineFunction.h"
40#include "llvm/CodeGen/MachineInstrBuilder.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/PseudoSourceValue.h"
43#include "llvm/CodeGen/SelectionDAG.h"
44#include "llvm/MC/MCSectionMachO.h"
45#include "llvm/Target/TargetOptions.h"
46#include "llvm/ADT/VectorExtras.h"
47#include "llvm/ADT/StringExtras.h"
48#include "llvm/ADT/Statistic.h"
49#include "llvm/Support/CommandLine.h"
50#include "llvm/Support/ErrorHandling.h"
51#include "llvm/Support/MathExtras.h"
52#include "llvm/Support/raw_ostream.h"
53#include <sstream>
54using namespace llvm;
55
56STATISTIC(NumTailCalls, "Number of tail calls");
57STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
58
59// This option should go away when tail calls fully work.
60static cl::opt<bool>
61EnableARMTailCalls("arm-tail-calls", cl::Hidden,
62  cl::desc("Generate tail calls (TEMPORARY OPTION)."),
63  cl::init(false));
64
65cl::opt<bool>
66EnableARMLongCalls("arm-long-calls", cl::Hidden,
67  cl::desc("Generate calls via indirect call instructions"),
68  cl::init(false));
69
70static cl::opt<bool>
71ARMInterworking("arm-interworking", cl::Hidden,
72  cl::desc("Enable / disable ARM interworking (for debugging only)"),
73  cl::init(true));
74
75void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
76                                       EVT PromotedBitwiseVT) {
77  if (VT != PromotedLdStVT) {
78    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
79    AddPromotedToType (ISD::LOAD, VT.getSimpleVT(),
80                       PromotedLdStVT.getSimpleVT());
81
82    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
83    AddPromotedToType (ISD::STORE, VT.getSimpleVT(),
84                       PromotedLdStVT.getSimpleVT());
85  }
86
87  EVT ElemTy = VT.getVectorElementType();
88  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
89    setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom);
90  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
91  if (ElemTy != MVT::i32) {
92    setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
93    setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand);
94    setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Expand);
95    setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Expand);
96  }
97  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
98  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
99  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
100  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Legal);
101  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
102  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
103  if (VT.isInteger()) {
104    setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
105    setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
106    setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
107    setLoadExtAction(ISD::SEXTLOAD, VT.getSimpleVT(), Expand);
108    setLoadExtAction(ISD::ZEXTLOAD, VT.getSimpleVT(), Expand);
109    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
110         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
111      setTruncStoreAction(VT.getSimpleVT(),
112                          (MVT::SimpleValueType)InnerVT, Expand);
113  }
114  setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
115
116  // Promote all bit-wise operations.
117  if (VT.isInteger() && VT != PromotedBitwiseVT) {
118    setOperationAction(ISD::AND, VT.getSimpleVT(), Promote);
119    AddPromotedToType (ISD::AND, VT.getSimpleVT(),
120                       PromotedBitwiseVT.getSimpleVT());
121    setOperationAction(ISD::OR,  VT.getSimpleVT(), Promote);
122    AddPromotedToType (ISD::OR,  VT.getSimpleVT(),
123                       PromotedBitwiseVT.getSimpleVT());
124    setOperationAction(ISD::XOR, VT.getSimpleVT(), Promote);
125    AddPromotedToType (ISD::XOR, VT.getSimpleVT(),
126                       PromotedBitwiseVT.getSimpleVT());
127  }
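  // Illustrative sketch (not code in this file): a promoted bitwise op is
  // bitcast to PromotedBitwiseVT, performed there, and bitcast back, e.g.
  // for v8i8 with PromotedBitwiseVT = v2i32:
  //   and v8i8  ==>  bitcast (and (bitcast x to v2i32), (bitcast y to v2i32))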
128
129  // Neon does not support vector divide/remainder operations.
130  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
131  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
132  setOperationAction(ISD::FDIV, VT.getSimpleVT(), Expand);
133  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
134  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
135  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
136}
137
138void ARMTargetLowering::addDRTypeForNEON(EVT VT) {
139  addRegisterClass(VT, ARM::DPRRegisterClass);
140  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
141}
142
143void ARMTargetLowering::addQRTypeForNEON(EVT VT) {
144  addRegisterClass(VT, ARM::QPRRegisterClass);
145  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
146}
147
148static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
149  if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
150    return new TargetLoweringObjectFileMachO();
151
152  return new ARMElfTargetObjectFile();
153}
154
155ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
156    : TargetLowering(TM, createTLOF(TM)) {
157  Subtarget = &TM.getSubtarget<ARMSubtarget>();
158  RegInfo = TM.getRegisterInfo();
159  Itins = TM.getInstrItineraryData();
160
161  if (Subtarget->isTargetDarwin()) {
162    // Use VFP variants of the float libfuncs for Thumb when VFP is available.
163    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
164      // Single-precision floating-point arithmetic.
165      setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
166      setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
167      setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
168      setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
169
170      // Double-precision floating-point arithmetic.
171      setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
172      setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
173      setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
174      setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
175
176      // Single-precision comparisons.
177      setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
178      setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
179      setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
180      setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
181      setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
182      setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
183      setLibcallName(RTLIB::UO_F32,  "__unordsf2vfp");
184      setLibcallName(RTLIB::O_F32,   "__unordsf2vfp");
185
186      setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
187      setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
188      setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
189      setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
190      setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
191      setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
192      setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
193      setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
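      // The *vfp comparison helpers return a nonzero i32 when the tested
      // relation holds, so each result is compared against zero using the
      // condition set here (SETNE means "helper returned true"); O_F32
      // reuses __unordsf2vfp with the sense inverted via SETEQ.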
194
195      // Double-precision comparisons.
196      setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
197      setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
198      setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
199      setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
200      setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
201      setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
202      setLibcallName(RTLIB::UO_F64,  "__unorddf2vfp");
203      setLibcallName(RTLIB::O_F64,   "__unorddf2vfp");
204
205      setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
206      setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
207      setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
208      setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
209      setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
210      setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
211      setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
212      setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
213
214      // Floating-point to integer conversions.
215      // i64 conversions are done via library routines even when generating VFP
216      // instructions, so use the same ones.
217      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
218      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
219      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
220      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
221
222      // Conversions between floating types.
223      setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
224      setLibcallName(RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp");
225
226      // Integer to floating-point conversions.
227      // i64 conversions are done via library routines even when generating VFP
228      // instructions, so use the same ones.
229      // FIXME: There appears to be some naming inconsistency in ARM libgcc:
230      // e.g., __floatunsidf vs. __floatunssidfvfp.
231      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
232      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
233      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
234      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
235    }
236  }
237
238  // These libcalls are not available on 32-bit targets.
239  setLibcallName(RTLIB::SHL_I128, 0);
240  setLibcallName(RTLIB::SRL_I128, 0);
241  setLibcallName(RTLIB::SRA_I128, 0);
242
243  if (Subtarget->isAAPCS_ABI()) {
244    // Double-precision floating-point arithmetic helper functions
245    // RTABI chapter 4.1.2, Table 2
246    setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
247    setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv");
248    setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul");
249    setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub");
250    setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS);
251    setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS);
252    setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS);
253    setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS);
254
255    // Double-precision floating-point comparison helper functions
256    // RTABI chapter 4.1.2, Table 3
257    setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq");
258    setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
259    setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq");
260    setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ);
261    setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt");
262    setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
263    setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple");
264    setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
265    setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge");
266    setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
267    setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt");
268    setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
269    setLibcallName(RTLIB::UO_F64,  "__aeabi_dcmpun");
270    setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
271    setLibcallName(RTLIB::O_F64,   "__aeabi_dcmpun");
272    setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
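    // As above, the __aeabi_dcmp* helpers return nonzero iff the relation
    // holds. UNE and O have no dedicated helpers, so they reuse
    // __aeabi_dcmpeq / __aeabi_dcmpun with the sense inverted via SETEQ.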
273    setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS);
274    setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS);
275    setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS);
276    setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS);
277    setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS);
278    setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS);
279    setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS);
280    setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS);
281
282    // Single-precision floating-point arithmetic helper functions
283    // RTABI chapter 4.1.2, Table 4
284    setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd");
285    setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv");
286    setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul");
287    setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub");
288    setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS);
289    setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS);
290    setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS);
291    setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS);
292
293    // Single-precision floating-point comparison helper functions
294    // RTABI chapter 4.1.2, Table 5
295    setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq");
296    setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
297    setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq");
298    setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ);
299    setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt");
300    setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
301    setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple");
302    setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
303    setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge");
304    setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
305    setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt");
306    setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
307    setLibcallName(RTLIB::UO_F32,  "__aeabi_fcmpun");
308    setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
309    setLibcallName(RTLIB::O_F32,   "__aeabi_fcmpun");
310    setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
311    setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS);
312    setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS);
313    setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS);
314    setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS);
315    setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS);
316    setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS);
317    setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS);
318    setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS);
319
320    // Floating-point to integer conversions.
321    // RTABI chapter 4.1.2, Table 6
322    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz");
323    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz");
324    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz");
325    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz");
326    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz");
327    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz");
328    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz");
329    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz");
330    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS);
331    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS);
332    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS);
333    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS);
334    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS);
335    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS);
336    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS);
337    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS);
338
339    // Conversions between floating types.
340    // RTABI chapter 4.1.2, Table 7
341    setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f");
342    setLibcallName(RTLIB::FPEXT_F32_F64,   "__aeabi_f2d");
343    setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS);
344    setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS);
345
346    // Integer to floating-point conversions.
347    // RTABI chapter 4.1.2, Table 8
348    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d");
349    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d");
350    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d");
351    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d");
352    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f");
353    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f");
354    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f");
355    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f");
356    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
357    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
358    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
359    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
360    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
361    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
362    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
363    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
364
365    // Long long helper functions
366    // RTABI chapter 4.2, Table 9
367    setLibcallName(RTLIB::MUL_I64,  "__aeabi_lmul");
368    setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod");
369    setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod");
370    setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl");
371    setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr");
372    setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr");
373    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS);
374    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
375    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
376    setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS);
377    setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS);
378    setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS);
379
380    // Integer division functions
381    // RTABI chapter 4.3.1
382    setLibcallName(RTLIB::SDIV_I8,  "__aeabi_idiv");
383    setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv");
384    setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv");
385    setLibcallName(RTLIB::UDIV_I8,  "__aeabi_uidiv");
386    setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv");
387    setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv");
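    // The RTABI only provides 32-bit division helpers; i8/i16 divides map
    // onto them because AAPCS extends small integer arguments to 32 bits.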
388    setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS);
389    setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS);
390    setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS);
391    setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
392    setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
393    setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
394  }
395
396  if (HasDivModLibcall) {
397    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
398    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
399  }
400
401  if (Subtarget->isThumb1Only())
402    addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
403  else
404    addRegisterClass(MVT::i32, ARM::GPRRegisterClass);
405  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
406    addRegisterClass(MVT::f32, ARM::SPRRegisterClass);
407    if (!Subtarget->isFPOnlySP())
408      addRegisterClass(MVT::f64, ARM::DPRRegisterClass);
409
410    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
411  }
412
413  if (Subtarget->hasNEON()) {
414    addDRTypeForNEON(MVT::v2f32);
415    addDRTypeForNEON(MVT::v8i8);
416    addDRTypeForNEON(MVT::v4i16);
417    addDRTypeForNEON(MVT::v2i32);
418    addDRTypeForNEON(MVT::v1i64);
419
420    addQRTypeForNEON(MVT::v4f32);
421    addQRTypeForNEON(MVT::v2f64);
422    addQRTypeForNEON(MVT::v16i8);
423    addQRTypeForNEON(MVT::v8i16);
424    addQRTypeForNEON(MVT::v4i32);
425    addQRTypeForNEON(MVT::v2i64);
426
427    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
428    // neither Neon nor VFP support any arithmetic operations on it.
429    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
430    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
431    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
432    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
433    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
434    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
435    setOperationAction(ISD::VSETCC, MVT::v2f64, Expand);
436    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
437    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
438    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
439    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
440    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
441    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
442    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
443    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
444    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
445    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
446    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
447    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
448    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
449    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
450    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
451    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
452    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
453
454    setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
455
456    // Neon does not support some operations on v1i64 and v2i64 types.
457    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
458    // Custom handling for some quad-vector types to detect VMULL.
459    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
460    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
461    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
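    // A sketch of the pattern the custom lowering looks for:
    //   mul (sext v8i8),  (sext v8i8)  -> v8i16  ==>  VMULLs (VMULL.S8)
    //   mul (zext v4i16), (zext v4i16) -> v4i32  ==>  VMULLu (VMULL.U16)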
462    // Custom handling for some vector types to avoid expensive expansions
463    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
464    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
465    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
466    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
467    setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
468    setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
469    // Neon does not have a single instruction for SINT_TO_FP or UINT_TO_FP with
470    // a destination type that is wider than the source.
471    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
472    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
473
474    setTargetDAGCombine(ISD::INTRINSIC_VOID);
475    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
476    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
477    setTargetDAGCombine(ISD::SHL);
478    setTargetDAGCombine(ISD::SRL);
479    setTargetDAGCombine(ISD::SRA);
480    setTargetDAGCombine(ISD::SIGN_EXTEND);
481    setTargetDAGCombine(ISD::ZERO_EXTEND);
482    setTargetDAGCombine(ISD::ANY_EXTEND);
483    setTargetDAGCombine(ISD::SELECT_CC);
484    setTargetDAGCombine(ISD::BUILD_VECTOR);
485    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
486    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
487    setTargetDAGCombine(ISD::STORE);
488  }
489
490  computeRegisterProperties();
491
492  // ARM does not have f32 extending load.
493  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
494
495  // ARM does not have i1 sign extending load.
496  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
497
498  // ARM supports all 4 flavors of integer indexed load / store.
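  // (pre/post-increment and pre/post-decrement), e.g. a pre-indexed load is
  // "ldr r0, [r1, #4]!" and a post-indexed load is "ldr r0, [r1], #4".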
499  if (!Subtarget->isThumb1Only()) {
500    for (unsigned im = (unsigned)ISD::PRE_INC;
501         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
502      setIndexedLoadAction(im,  MVT::i1,  Legal);
503      setIndexedLoadAction(im,  MVT::i8,  Legal);
504      setIndexedLoadAction(im,  MVT::i16, Legal);
505      setIndexedLoadAction(im,  MVT::i32, Legal);
506      setIndexedStoreAction(im, MVT::i1,  Legal);
507      setIndexedStoreAction(im, MVT::i8,  Legal);
508      setIndexedStoreAction(im, MVT::i16, Legal);
509      setIndexedStoreAction(im, MVT::i32, Legal);
510    }
511  }
512
513  // i64 operation support.
514  if (Subtarget->isThumb1Only()) {
515    setOperationAction(ISD::MUL,     MVT::i64, Expand);
516    setOperationAction(ISD::MULHU,   MVT::i32, Expand);
517    setOperationAction(ISD::MULHS,   MVT::i32, Expand);
518    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
519    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
520  } else {
521    setOperationAction(ISD::MUL,     MVT::i64, Expand);
522    setOperationAction(ISD::MULHU,   MVT::i32, Expand);
523    if (!Subtarget->hasV6Ops())
524      setOperationAction(ISD::MULHS, MVT::i32, Expand);
525  }
526  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
527  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
528  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
529  setOperationAction(ISD::SRL,       MVT::i64, Custom);
530  setOperationAction(ISD::SRA,       MVT::i64, Custom);
531
532  // ARM does not have ROTL.
533  setOperationAction(ISD::ROTL,  MVT::i32, Expand);
534  setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
535  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
536  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
537    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
538
539  // Only ARMv6 has BSWAP.
540  if (!Subtarget->hasV6Ops())
541    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
542
543  // These are expanded into libcalls.
544  if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) {
545    // v7M has a hardware divider
546    setOperationAction(ISD::SDIV,  MVT::i32, Expand);
547    setOperationAction(ISD::UDIV,  MVT::i32, Expand);
548  }
549  setOperationAction(ISD::SREM,  MVT::i32, Expand);
550  setOperationAction(ISD::UREM,  MVT::i32, Expand);
551  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
552  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
553
554  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
555  setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
556  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
557  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
558  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
559
560  setOperationAction(ISD::TRAP, MVT::Other, Legal);
561
562  // Use the default implementation.
563  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
564  setOperationAction(ISD::VAARG,              MVT::Other, Expand);
565  setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
566  setOperationAction(ISD::VAEND,              MVT::Other, Expand);
567  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
568  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
569  setOperationAction(ISD::EHSELECTION,        MVT::i32,   Expand);
570  setOperationAction(ISD::EXCEPTIONADDR,      MVT::i32,   Expand);
571  setExceptionPointerRegister(ARM::R0);
572  setExceptionSelectorRegister(ARM::R1);
573
574  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
575  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
576  // the default expansion.
577  if (Subtarget->hasDataBarrier() ||
578      (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
579    // membarrier needs custom lowering; the rest are legal and handled
580    // normally.
581    setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
582  } else {
583    // Set them all for expansion, which will force libcalls.
584    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
585    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i8,  Expand);
586    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i16, Expand);
587    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
588    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i8,  Expand);
589    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i16, Expand);
590    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
591    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i8,  Expand);
592    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i16, Expand);
593    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
594    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i8,  Expand);
595    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i16, Expand);
596    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
597    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i8,  Expand);
598    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i16, Expand);
599    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
600    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i8,  Expand);
601    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i16, Expand);
602    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
603    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i8,  Expand);
604    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i16, Expand);
605    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
606    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8,  Expand);
607    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand);
608    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
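    // Expansion turns these into the GCC-style __sync_* libcalls
    // (e.g. __sync_fetch_and_add_4).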
609    // Since the libcalls include locking, fold in the fences
610    setShouldFoldAtomicFences(true);
611  }
612  // 64-bit versions are always libcalls (for now)
613  setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Expand);
614  setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Expand);
615  setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Expand);
616  setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Expand);
617  setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Expand);
618  setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Expand);
619  setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Expand);
620  setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand);
621
622  setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
623
624  // SIGN_EXTEND_INREG needs SXTB/SXTH, available on v6 and up in both ARM and
624  // Thumb modes.
625  if (!Subtarget->hasV6Ops()) {
626    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
627    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
628  }
629  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
630
631  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
632    // Turn f64 -> i64 into VMOVRRD and i64 -> f64 into VMOVDRR,
633    // iff the target supports VFP2.
634    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
635    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
636  }
637
638  // We want to custom lower some of our intrinsics.
639  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
640  if (Subtarget->isTargetDarwin()) {
641    setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
642    setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
643    setOperationAction(ISD::EH_SJLJ_DISPATCHSETUP, MVT::Other, Custom);
644  }
645
646  setOperationAction(ISD::SETCC,     MVT::i32, Expand);
647  setOperationAction(ISD::SETCC,     MVT::f32, Expand);
648  setOperationAction(ISD::SETCC,     MVT::f64, Expand);
649  setOperationAction(ISD::SELECT,    MVT::i32, Custom);
650  setOperationAction(ISD::SELECT,    MVT::f32, Custom);
651  setOperationAction(ISD::SELECT,    MVT::f64, Custom);
652  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
653  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
654  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
655
656  setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
657  setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
658  setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
659  setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
660  setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
661
662  // We don't support sin/cos/fmod/copysign/pow
663  setOperationAction(ISD::FSIN,      MVT::f64, Expand);
664  setOperationAction(ISD::FSIN,      MVT::f32, Expand);
665  setOperationAction(ISD::FCOS,      MVT::f32, Expand);
666  setOperationAction(ISD::FCOS,      MVT::f64, Expand);
667  setOperationAction(ISD::FREM,      MVT::f64, Expand);
668  setOperationAction(ISD::FREM,      MVT::f32, Expand);
669  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
670    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
671    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
672  }
673  setOperationAction(ISD::FPOW,      MVT::f64, Expand);
674  setOperationAction(ISD::FPOW,      MVT::f32, Expand);
675
676  // Various VFP goodness
677  if (!UseSoftFloat && !Subtarget->isThumb1Only()) {
678    // int <-> fp are custom expanded into bit_convert + ARMISD ops.
679    if (Subtarget->hasVFP2()) {
680      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
681      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
682      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
683      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
684    }
685    // Special handling for half-precision FP.
686    if (!Subtarget->hasFP16()) {
687      setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
688      setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
689    }
690  }
691
692  // We have target-specific dag combine patterns for the following nodes:
693  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
694  setTargetDAGCombine(ISD::ADD);
695  setTargetDAGCombine(ISD::SUB);
696  setTargetDAGCombine(ISD::MUL);
697
698  if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON())
699    setTargetDAGCombine(ISD::OR);
700  if (Subtarget->hasNEON())
701    setTargetDAGCombine(ISD::AND);
702
703  setStackPointerRegisterToSaveRestore(ARM::SP);
704
705  if (UseSoftFloat || Subtarget->isThumb1Only() || !Subtarget->hasVFP2())
706    setSchedulingPreference(Sched::RegPressure);
707  else
708    setSchedulingPreference(Sched::Hybrid);
709
710  // Temporary - rewrite this interface to use a type.
711  maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
712
713  // On ARM, arguments smaller than 4 bytes are extended, so all arguments
714  // are at least 4 bytes aligned.
715  setMinStackArgumentAlignment(4);
716
717  benefitFromCodePlacementOpt = true;
718}
719
720// FIXME: It might make sense to define the representative register class as the
721// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
722// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
723// SPR's representative would be DPR_VFP2. This should work well if register
724// pressure tracking were modified such that a register use would increment the
725// pressure of the register class's representative and all of its super
726// classes' representatives transitively. We have not implemented this because
727// of the difficulty prior to coalescing of modeling operand register classes
728// due to the common occurrence of cross-class copies and subregister insertions
729// and extractions.
730std::pair<const TargetRegisterClass*, uint8_t>
731ARMTargetLowering::findRepresentativeClass(EVT VT) const{
732  const TargetRegisterClass *RRC = 0;
733  uint8_t Cost = 1;
734  switch (VT.getSimpleVT().SimpleTy) {
735  default:
736    return TargetLowering::findRepresentativeClass(VT);
737  // Use DPR as representative register class for all floating point
738  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
739  // the cost is 1 for both f32 and f64.
740  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
741  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
742    RRC = ARM::DPRRegisterClass;
743    // When NEON is used for SP, only half of the register file is available
744    // because operations that define both SP and DP results will be constrained
745    // to the VFP2 class (D0-D15). We currently model this constraint prior to
746    // coalescing by double-counting the SP regs. See the FIXME above.
747    if (Subtarget->useNEONForSinglePrecisionFP())
748      Cost = 2;
749    break;
750  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
751  case MVT::v4f32: case MVT::v2f64:
752    RRC = ARM::DPRRegisterClass;
753    Cost = 2;
754    break;
755  case MVT::v4i64:
756    RRC = ARM::DPRRegisterClass;
757    Cost = 4;
758    break;
759  case MVT::v8i64:
760    RRC = ARM::DPRRegisterClass;
761    Cost = 8;
762    break;
763  }
764  return std::make_pair(RRC, Cost);
765}
766
767const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
768  switch (Opcode) {
769  default: return 0;
770  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
771  case ARMISD::WrapperDYN:    return "ARMISD::WrapperDYN";
772  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
773  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
774  case ARMISD::CALL:          return "ARMISD::CALL";
775  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
776  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
777  case ARMISD::tCALL:         return "ARMISD::tCALL";
778  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
779  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
780  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
781  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
782  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
783  case ARMISD::CMP:           return "ARMISD::CMP";
784  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
785  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
786  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
787  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
788  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
789  case ARMISD::CMOV:          return "ARMISD::CMOV";
790
791  case ARMISD::RBIT:          return "ARMISD::RBIT";
792
793  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
794  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
795  case ARMISD::SITOF:         return "ARMISD::SITOF";
796  case ARMISD::UITOF:         return "ARMISD::UITOF";
797
798  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
799  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
800  case ARMISD::RRX:           return "ARMISD::RRX";
801
802  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
803  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
804
805  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
806  case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
807  case ARMISD::EH_SJLJ_DISPATCHSETUP:return "ARMISD::EH_SJLJ_DISPATCHSETUP";
808
809  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
810
811  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
812
813  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
814
815  case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
816  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
817
818  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
819
820  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
821  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
822  case ARMISD::VCGE:          return "ARMISD::VCGE";
823  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
824  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
825  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
826  case ARMISD::VCGT:          return "ARMISD::VCGT";
827  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
828  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
829  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
830  case ARMISD::VTST:          return "ARMISD::VTST";
831
832  case ARMISD::VSHL:          return "ARMISD::VSHL";
833  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
834  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
835  case ARMISD::VSHLLs:        return "ARMISD::VSHLLs";
836  case ARMISD::VSHLLu:        return "ARMISD::VSHLLu";
837  case ARMISD::VSHLLi:        return "ARMISD::VSHLLi";
838  case ARMISD::VSHRN:         return "ARMISD::VSHRN";
839  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
840  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
841  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
842  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
843  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
844  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
845  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
846  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
847  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
848  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
849  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
850  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
851  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
852  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
853  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
854  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
855  case ARMISD::VDUP:          return "ARMISD::VDUP";
856  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
857  case ARMISD::VEXT:          return "ARMISD::VEXT";
858  case ARMISD::VREV64:        return "ARMISD::VREV64";
859  case ARMISD::VREV32:        return "ARMISD::VREV32";
860  case ARMISD::VREV16:        return "ARMISD::VREV16";
861  case ARMISD::VZIP:          return "ARMISD::VZIP";
862  case ARMISD::VUZP:          return "ARMISD::VUZP";
863  case ARMISD::VTRN:          return "ARMISD::VTRN";
864  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
865  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
866  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
867  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
868  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
869  case ARMISD::FMAX:          return "ARMISD::FMAX";
870  case ARMISD::FMIN:          return "ARMISD::FMIN";
871  case ARMISD::BFI:           return "ARMISD::BFI";
872  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
873  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
874  case ARMISD::VBSL:          return "ARMISD::VBSL";
875  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
876  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
877  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
878  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
879  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
880  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
881  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
882  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
883  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
884  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
885  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
886  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
887  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
888  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
889  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
890  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
891  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
892  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
893  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
894  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
895  }
896}
897
898/// getRegClassFor - Return the register class that should be used for the
899/// specified value type.
900TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
901  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
902  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
903  // load / store 4 to 8 consecutive D registers.
904  if (Subtarget->hasNEON()) {
905    if (VT == MVT::v4i64)
906      return ARM::QQPRRegisterClass;
907    else if (VT == MVT::v8i64)
908      return ARM::QQQQPRRegisterClass;
909  }
910  return TargetLowering::getRegClassFor(VT);
911}
912
913// Create a fast isel object.
914FastISel *
915ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
916  return ARM::createFastISel(funcInfo);
917}
918
919/// getFunctionAlignment - Return the Log2 alignment of this function.
920unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
921  return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2;
922}
923
924/// getMaximalGlobalOffset - Returns the maximal possible offset which can
925/// be used for loads / stores from the global.
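/// Thumb1 load/store immediates are far more limited than the 12-bit
/// (0-4095) offset range available in ARM mode, hence the smaller bound.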
926unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
927  return (Subtarget->isThumb1Only() ? 127 : 4095);
928}
929
930Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
931  unsigned NumVals = N->getNumValues();
932  if (!NumVals)
933    return Sched::RegPressure;
934
935  for (unsigned i = 0; i != NumVals; ++i) {
936    EVT VT = N->getValueType(i);
937    if (VT == MVT::Glue || VT == MVT::Other)
938      continue;
939    if (VT.isFloatingPoint() || VT.isVector())
940      return Sched::Latency;
941  }
942
943  if (!N->isMachineOpcode())
944    return Sched::RegPressure;
945
946  // Loads are scheduled for latency even if the instruction itinerary
947  // is not available.
948  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
949  const TargetInstrDesc &TID = TII->get(N->getMachineOpcode());
950
951  if (TID.getNumDefs() == 0)
952    return Sched::RegPressure;
953  if (!Itins->isEmpty() &&
954      Itins->getOperandCycle(TID.getSchedClass(), 0) > 2)
955    return Sched::Latency;
956
957  return Sched::RegPressure;
958}
959
960//===----------------------------------------------------------------------===//
961// Lowering Code
962//===----------------------------------------------------------------------===//
963
964/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
965static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
966  switch (CC) {
967  default: llvm_unreachable("Unknown condition code!");
968  case ISD::SETNE:  return ARMCC::NE;
969  case ISD::SETEQ:  return ARMCC::EQ;
970  case ISD::SETGT:  return ARMCC::GT;
971  case ISD::SETGE:  return ARMCC::GE;
972  case ISD::SETLT:  return ARMCC::LT;
973  case ISD::SETLE:  return ARMCC::LE;
974  case ISD::SETUGT: return ARMCC::HI;
975  case ISD::SETUGE: return ARMCC::HS;
976  case ISD::SETULT: return ARMCC::LO;
977  case ISD::SETULE: return ARMCC::LS;
978  }
979}
980
981/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
982static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
983                        ARMCC::CondCodes &CondCode2) {
984  CondCode2 = ARMCC::AL;
985  switch (CC) {
986  default: llvm_unreachable("Unknown FP condition!");
987  case ISD::SETEQ:
988  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
989  case ISD::SETGT:
990  case ISD::SETOGT: CondCode = ARMCC::GT; break;
991  case ISD::SETGE:
992  case ISD::SETOGE: CondCode = ARMCC::GE; break;
993  case ISD::SETOLT: CondCode = ARMCC::MI; break;
994  case ISD::SETOLE: CondCode = ARMCC::LS; break;
995  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
996  case ISD::SETO:   CondCode = ARMCC::VC; break;
997  case ISD::SETUO:  CondCode = ARMCC::VS; break;
998  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
999  case ISD::SETUGT: CondCode = ARMCC::HI; break;
1000  case ISD::SETUGE: CondCode = ARMCC::PL; break;
1001  case ISD::SETLT:
1002  case ISD::SETULT: CondCode = ARMCC::LT; break;
1003  case ISD::SETLE:
1004  case ISD::SETULE: CondCode = ARMCC::LE; break;
1005  case ISD::SETNE:
1006  case ISD::SETUNE: CondCode = ARMCC::NE; break;
1007  }
1008}
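// When CondCode2 != ARMCC::AL the predicate needs two ARM condition codes:
// lowering emits one FP compare and tests both, e.g. SETONE ("ordered and
// not equal") is MI (less than) or GT (greater than), and SETUEQ is EQ or
// VS (unordered).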
1009
1010//===----------------------------------------------------------------------===//
1011//                      Calling Convention Implementation
1012//===----------------------------------------------------------------------===//
1013
1014#include "ARMGenCallingConv.inc"
1015
1016/// CCAssignFnForNode - Selects the correct CCAssignFn for the
1017/// given CallingConvention value.
1018CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1019                                                 bool Return,
1020                                                 bool isVarArg) const {
1021  switch (CC) {
1022  default:
1023    llvm_unreachable("Unsupported calling convention");
1024  case CallingConv::Fast:
1025    if (Subtarget->hasVFP2() && !isVarArg) {
1026      if (!Subtarget->isAAPCS_ABI())
1027        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1028      // For AAPCS ABI targets, just use VFP variant of the calling convention.
1029      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1030    }
1031    // Fallthrough
1032  case CallingConv::C: {
1033    // Use target triple & subtarget features to do actual dispatch.
1034    if (!Subtarget->isAAPCS_ABI())
1035      return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1036    else if (Subtarget->hasVFP2() &&
1037             FloatABIType == FloatABI::Hard && !isVarArg)
1038      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1039    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1040  }
1041  case CallingConv::ARM_AAPCS_VFP:
1042    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1043  case CallingConv::ARM_AAPCS:
1044    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1045  case CallingConv::ARM_APCS:
1046    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1047  }
1048}
1049
1050/// LowerCallResult - Lower the result values of a call into the
1051/// appropriate copies out of appropriate physical registers.
1052SDValue
1053ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1054                                   CallingConv::ID CallConv, bool isVarArg,
1055                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1056                                   DebugLoc dl, SelectionDAG &DAG,
1057                                   SmallVectorImpl<SDValue> &InVals) const {
1058
1059  // Assign locations to each value returned by this call.
1060  SmallVector<CCValAssign, 16> RVLocs;
1061  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1062                 RVLocs, *DAG.getContext());
1063  CCInfo.AnalyzeCallResult(Ins,
1064                           CCAssignFnForNode(CallConv, /* Return*/ true,
1065                                             isVarArg));
1066
1067  // Copy all of the result registers out of their specified physreg.
1068  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1069    CCValAssign VA = RVLocs[i];
1070
1071    SDValue Val;
1072    if (VA.needsCustom()) {
1073      // Handle f64 or half of a v2f64.
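      // Under the soft-float calling convention an f64 result is returned in
      // a pair of i32 registers (e.g. R0/R1); copy both halves out and
      // reassemble the double with VMOVDRR. A v2f64 result is rebuilt one
      // f64 lane at a time below.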
1074      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1075                                      InFlag);
1076      Chain = Lo.getValue(1);
1077      InFlag = Lo.getValue(2);
1078      VA = RVLocs[++i]; // skip ahead to next loc
1079      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1080                                      InFlag);
1081      Chain = Hi.getValue(1);
1082      InFlag = Hi.getValue(2);
1083      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1084
1085      if (VA.getLocVT() == MVT::v2f64) {
1086        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1087        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1088                          DAG.getConstant(0, MVT::i32));
1089
1090        VA = RVLocs[++i]; // skip ahead to next loc
1091        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1092        Chain = Lo.getValue(1);
1093        InFlag = Lo.getValue(2);
1094        VA = RVLocs[++i]; // skip ahead to next loc
1095        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1096        Chain = Hi.getValue(1);
1097        InFlag = Hi.getValue(2);
1098        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1099        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1100                          DAG.getConstant(1, MVT::i32));
1101      }
1102    } else {
1103      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1104                               InFlag);
1105      Chain = Val.getValue(1);
1106      InFlag = Val.getValue(2);
1107    }
1108
1109    switch (VA.getLocInfo()) {
1110    default: llvm_unreachable("Unknown loc info!");
1111    case CCValAssign::Full: break;
1112    case CCValAssign::BCvt:
1113      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1114      break;
1115    }
1116
1117    InVals.push_back(Val);
1118  }
1119
1120  return Chain;
1121}
1122
1123/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1124/// by "Src" to address "Dst" of size "Size".  Alignment information is
1125/// specified by the specific parameter attribute.  The copy will be passed as
1126/// a byval function parameter.
1127/// Sometimes what we are copying is the end of a larger object, the part that
1128/// does not fit in registers.
1129static SDValue
1130CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1131                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1132                          DebugLoc dl) {
1133  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1134  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1135                       /*isVolatile=*/false, /*AlwaysInline=*/false,
1136                       MachinePointerInfo(0), MachinePointerInfo(0));
1137}
1138
1139/// LowerMemOpCallTo - Store the argument to the stack.
1140SDValue
1141ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
1142                                    SDValue StackPtr, SDValue Arg,
1143                                    DebugLoc dl, SelectionDAG &DAG,
1144                                    const CCValAssign &VA,
1145                                    ISD::ArgFlagsTy Flags) const {
1146  unsigned LocMemOffset = VA.getLocMemOffset();
1147  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1148  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1149  if (Flags.isByVal())
1150    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1151
1152  return DAG.getStore(Chain, dl, Arg, PtrOff,
1153                      MachinePointerInfo::getStack(LocMemOffset),
1154                      false, false, 0);
1155}
1156
1157void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
1158                                         SDValue Chain, SDValue &Arg,
1159                                         RegsToPassVector &RegsToPass,
1160                                         CCValAssign &VA, CCValAssign &NextVA,
1161                                         SDValue &StackPtr,
1162                                         SmallVector<SDValue, 8> &MemOpChains,
1163                                         ISD::ArgFlagsTy Flags) const {
1164
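  // Split the f64 into two i32 halves with VMOVRRD. The low half always goes
  // in a register; the high half goes in the next register if one was
  // assigned, otherwise it is stored to its stack slot via LowerMemOpCallTo.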
1165  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1166                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
1167  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
1168
1169  if (NextVA.isRegLoc())
1170    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
1171  else {
1172    assert(NextVA.isMemLoc());
1173    if (StackPtr.getNode() == 0)
1174      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
1175
1176    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1),
1177                                           dl, DAG, NextVA,
1178                                           Flags));
1179  }
1180}
1181
1182/// LowerCall - Lowering a call into a callseq_start <-
1183/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
1184/// nodes.
1185SDValue
1186ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1187                             CallingConv::ID CallConv, bool isVarArg,
1188                             bool &isTailCall,
1189                             const SmallVectorImpl<ISD::OutputArg> &Outs,
1190                             const SmallVectorImpl<SDValue> &OutVals,
1191                             const SmallVectorImpl<ISD::InputArg> &Ins,
1192                             DebugLoc dl, SelectionDAG &DAG,
1193                             SmallVectorImpl<SDValue> &InVals) const {
1194  MachineFunction &MF = DAG.getMachineFunction();
1195  bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
1196  bool IsSibCall = false;
1197  // Temporarily disable tail calls so things don't break.
1198  if (!EnableARMTailCalls)
1199    isTailCall = false;
1200  if (isTailCall) {
1201    // Check if it's really possible to do a tail call.
1202    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1203                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1204                                                   Outs, OutVals, Ins, DAG);
1205    // We don't support GuaranteedTailCallOpt for ARM, only automatically
1206    // detected sibcalls.
1207    if (isTailCall) {
1208      ++NumTailCalls;
1209      IsSibCall = true;
1210    }
1211  }
1212
  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs,
                             CCAssignFnForNode(CallConv, /*Return*/ false,
                                               isVarArg));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // For tail calls, memory operands are available in our caller's stack.
  if (IsSibCall)
    NumBytes = 0;

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());

  RegsToPassVector RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;

  // Walk the register/memloc assignments, inserting copies/loads.  In the case
  // of tail call optimization, arguments are handled later.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
       i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];
    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(0, MVT::i32));
        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                  DAG.getConstant(1, MVT::i32));

        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);

        VA = ArgLocs[++i]; // skip ahead to next loc
        if (VA.isRegLoc()) {
          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
        } else {
          assert(VA.isMemLoc());

          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
                                                 dl, DAG, VA, Flags));
        }
      } else {
        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
                         StackPtr, MemOpChains, Flags);
      }
    } else if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else if (!IsSibCall || isByVal) {
      assert(VA.isMemLoc());

      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers, so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.

    // Do not glue the preceding copy-to-reg nodes together with the nodes
    // that follow.
    InFlag = SDValue();
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();
  }

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  bool isDirect = false;
  bool isARMFunc = false;
  bool isLocalARMFunc = false;
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  if (EnableARMLongCalls) {
    assert(getTargetMachine().getRelocationModel() == Reloc::Static &&
           "long-calls with non-static relocation model!");
    // Handle a global address or an external symbol. If it's not one of
    // those, the target's already in a register, so we don't need to do
    // anything extra.
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
      const GlobalValue *GV = G->getGlobal();
      // Create a constant pool entry for the callee address
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
                                                           ARMPCLabelIndex,
                                                           ARMCP::CPValue, 0);
      // Get the address of the callee into a register
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(getPointerTy(), dl,
                           DAG.getEntryNode(), CPAddr,
                           MachinePointerInfo::getConstantPool(),
                           false, false, 0);
    } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
      const char *Sym = S->getSymbol();

      // Create a constant pool entry for the callee address
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
                                                       Sym, ARMPCLabelIndex, 0);
      // Get the address of the callee into a register
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(getPointerTy(), dl,
                           DAG.getEntryNode(), CPAddr,
                           MachinePointerInfo::getConstantPool(),
                           false, false, 0);
    }
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    isDirect = true;
    bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
    bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
                   getTargetMachine().getRelocationModel() != Reloc::Static;
    isARMFunc = !Subtarget->isThumb() || isStub;
    // ARM call to a local ARM function is predicable.
    isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
    // tBX takes a register source operand.
    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
                                                           ARMPCLabelIndex,
                                                           ARMCP::CPValue, 4);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(getPointerTy(), dl,
                           DAG.getEntryNode(), CPAddr,
                           MachinePointerInfo::getConstantPool(),
                           false, false, 0);
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
                           getPointerTy(), Callee, PICLabel);
    } else {
      // On ELF targets for PIC code, direct calls should go through the PLT
      unsigned OpFlags = 0;
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_)
        OpFlags = ARMII::MO_PLT;
      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    isDirect = true;
    bool isStub = Subtarget->isTargetDarwin() &&
                  getTargetMachine().getRelocationModel() != Reloc::Static;
    isARMFunc = !Subtarget->isThumb() || isStub;
    // tBX takes a register source operand.
    const char *Sym = S->getSymbol();
    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
                                                       Sym, ARMPCLabelIndex, 4);
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      Callee = DAG.getLoad(getPointerTy(), dl,
                           DAG.getEntryNode(), CPAddr,
                           MachinePointerInfo::getConstantPool(),
                           false, false, 0);
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
                           getPointerTy(), Callee, PICLabel);
    } else {
      unsigned OpFlags = 0;
      // On ELF targets for PIC code, direct calls should go through the PLT
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_)
        OpFlags = ARMII::MO_PLT;
      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags);
    }
  }

  // FIXME: handle tail calls differently.
  unsigned CallOpc;
  if (Subtarget->isThumb()) {
    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
      CallOpc = ARMISD::CALL_NOLINK;
    else
      CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
  } else {
    CallOpc = (isDirect || Subtarget->hasV5TOps())
      ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL)
      : ARMISD::CALL_NOLINK;
  }

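  // Rough sketch of where these opcodes go during selection (the
  // authoritative mapping lives in the .td patterns): ARMISD::CALL and
  // CALL_PRED become BL-style direct calls, ARMISD::tCALL is the Thumb BL,
  // and ARMISD::CALL_NOLINK covers targets without a link-updating branch
  // (e.g. pre-v5T), where the return address must be set up separately.
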
  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  if (isTailCall)
    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(0, true), InFlag);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins,
                         dl, DAG, InVals);
}

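// End-to-end sketch of the two shapes produced above: a sibcall such as
//   %r = tail call i32 @callee(i32 %x)
// collapses into a single ARMISD::TC_RETURN node, while an ordinary call is
// wrapped in the callseq_start / ARMISD::CALL / callseq_end chain and its
// results are copied back out by LowerCallResult.
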
/// HandleByVal - Every parameter *after* a byval parameter is passed
/// on the stack.  Confiscate all the parameter registers to ensure
/// this.
void
llvm::ARMTargetLowering::HandleByVal(CCState *State) const {
  static const unsigned RegList1[] = {
    ARM::R0, ARM::R1, ARM::R2, ARM::R3
  };
  // Claim r0-r3 so no further arguments are assigned to registers.
  do {} while (State->AllocateReg(RegList1, 4));
}

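// Example of the effect (hypothetical C signature): given
//   void f(struct S s /* byval */, int x)
// the loop above drains r0-r3, so x lands on the stack even though a
// register might otherwise have been free for it.
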
/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const ARMInstrInfo *TII) {
  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define void @foo(%struct.X* %A) {
      //   tail call void @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI->isFixedObjectIndex(FI))
    return false;
  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  const Function *CallerF = DAG.getMachineFunction().getFunction();
  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Do not sibcall optimize vararg calls unless the call site is not passing
  // any arguments.
  if (isVarArg && !Outs.empty())
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
  // emitEpilogue is not ready for them.
  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
  // LR.  This means if we need to reload LR, it takes an extra instruction,
  // which outweighs the value of the tail call; but here we don't know yet
  // whether LR is going to be used.  Probably the right approach is to
  // generate the tail call here and turn it back into CALL/RET in
  // emitEpilogue if LR is used.

  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
  // but we need to make sure there are enough registers; the only valid
  // registers are the 4 used for parameters.  We don't currently do this
  // case.
  if (Subtarget->isThumb1Only())
    return false;

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, getTargetMachine(),
                    RVLocs1, *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));

    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, getTargetMachine(),
                    RVLocs2, *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));

    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
                   ArgLocs, *DAG.getContext());
    CCInfo.AnalyzeCallOperands(Outs,
                               CCAssignFnForNode(CalleeCC, false, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      MachineFunction &MF = DAG.getMachineFunction();

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const ARMInstrInfo *TII =
        ((ARMTargetMachine&)getTargetMachine()).getInstrInfo();
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom()) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations.  The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }
  }

  return true;
}

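// A call that typically survives all of the checks above (illustrative IR):
//   define i32 @caller(i32 %a) {
//     %r = tail call i32 @callee(i32 %a)
//     ret i32 %r
//   }
// Same calling convention, no sret, no stack-passed arguments, and not
// Thumb1, so it can be emitted as a sibcall.
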
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {

  // CCValAssign - represents the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /*Return*/ true,
                                               isVarArg));

  // If this is the first return lowered for this function, add
  // the regs to the liveout set for the function.
  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      if (RVLocs[i].isRegLoc())
        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
  }

  SDValue Flag;

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, MVT::i32));
        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                       DAG.getVTList(MVT::i32, MVT::i32), Half);

        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
        Flag = Chain.getValue(1);
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(1), Flag);
        Flag = Chain.getValue(1);
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                          DAG.getConstant(1, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
      // available.
      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
      Flag = Chain.getValue(1);
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
                               Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

    // Guarantee that all emitted copies are glued together with flag
    // operands so nothing can be scheduled between them and the return.
    Flag = Chain.getValue(1);
  }

  SDValue Result;
  if (Flag.getNode())
    Result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
  else // Return Void
    Result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain);

  return Result;
}

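// For illustration: with the soft-float ABI, "ret double %d" takes the
// VA.needsCustom() path above and is returned as two i32 halves in a GPR
// pair via ARMISD::VMOVRRD; a v2f64 return does the same splitting twice.
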
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  unsigned NumCopies = 0;
  SDNode *Copies[2];
  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() == ISD::CopyToReg) {
    Copies[NumCopies++] = Use;
  } else if (Use->getOpcode() == ARMISD::VMOVRRD) {
    // f64 returned in a pair of GPRs.
    for (SDNode::use_iterator UI = Use->use_begin(), UE = Use->use_end();
         UI != UE; ++UI) {
      if (UI->getOpcode() != ISD::CopyToReg)
        return false;
      Copies[UI.getUse().getResNo()] = *UI;
      ++NumCopies;
    }
  } else if (Use->getOpcode() == ISD::BITCAST) {
    // f32 returned in a single GPR.
    if (!Use->hasNUsesOfValue(1, 0))
      return false;
    Use = *Use->use_begin();
    if (Use->getOpcode() != ISD::CopyToReg || !Use->hasNUsesOfValue(1, 0))
      return false;
    Copies[NumCopies++] = Use;
  } else {
    return false;
  }

  if (NumCopies != 1 && NumCopies != 2)
    return false;

  bool HasRet = false;
  for (unsigned i = 0; i < NumCopies; ++i) {
    SDNode *Copy = Copies[i];
    for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
         UI != UE; ++UI) {
      if (UI->getOpcode() == ISD::CopyToReg) {
        SDNode *U = *UI;
        if (U == Copies[0] || U == Copies[1])
          continue;
        return false;
      }
      if (UI->getOpcode() != ARMISD::RET_FLAG)
        return false;
      HasRet = true;
    }
  }

  return HasRet;
}

bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
  if (!EnableARMTailCalls)
    return false;

  if (!CI->isTailCall())
    return false;

  return !Subtarget->isThumb1Only();
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterparts wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOVi.
static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
  EVT PtrVT = Op.getValueType();
  // FIXME: there is no actual debug info here.
  DebugLoc dl = Op.getDebugLoc();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;
  if (CP->isMachineConstantPoolEntry())
    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
                                    CP->getAlignment());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
                                    CP->getAlignment());
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}

unsigned ARMTargetLowering::getJumpTableEncoding() const {
  return MachineJumpTableInfo::EK_Inline;
}

SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = 0;
  DebugLoc DL = Op.getDebugLoc();
  EVT PtrVT = getPointerTy();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  SDValue CPAddr;
  if (RelocM == Reloc::Static) {
    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
  } else {
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(BA, ARMPCLabelIndex,
                                                         ARMCP::CPBlockAddress,
                                                         PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
                               MachinePointerInfo::getConstantPool(),
                               false, false, 0);
  if (RelocM == Reloc::Static)
    return Result;
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  DebugLoc dl = GA->getDebugLoc();
  EVT PtrVT = getPointerTy();
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  ARMConstantPoolValue *CPV =
    new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
                             ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
                         MachinePointerInfo::getConstantPool(),
                         false, false, 0);
  SDValue Chain = Argument.getValue(1);

  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  ArgListTy Args;
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
  Args.push_back(Entry);
  // FIXME: is there useful debug info available here?
  std::pair<SDValue, SDValue> CallResult =
    LowerCallTo(Chain, (const Type *) Type::getInt32Ty(*DAG.getContext()),
                false, false, false, false,
                0, CallingConv::C, false, /*isReturnValueUsed=*/true,
                DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
  return CallResult.first;
}

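// Sketch of what the sequence above produces for, e.g.,
//   @x = thread_local global i32 0
// accessed under PIC: the constant pool entry holds a TLSGD reference to x,
// PIC_ADD forms its pc-relative address, and the __tls_get_addr call returns
// the final address of x.
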
// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG) const {
  const GlobalValue *GV = GA->getGlobal();
  DebugLoc dl = GA->getDebugLoc();
  SDValue Offset;
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy();
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (GV->isDeclaration()) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMConstantPoolValue *CPV =
      new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
                               ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
                         MachinePointerInfo::getConstantPool(),
                         false, false, 0);
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
                         MachinePointerInfo::getConstantPool(),
                         false, false, 0);
  } else {
    // local exec model
    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMCP::TPOFF);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
                         MachinePointerInfo::getConstantPool(),
                         false, false, 0);
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  // TODO: implement the "local dynamic" model
  assert(Subtarget->isTargetELF() &&
         "TLS not implemented for non-ELF targets");
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  // If the relocation model is PIC, use the "general dynamic" TLS model;
  // otherwise fall back to the "initial exec" or "local exec" model.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
    return LowerToTLSGeneralDynamicModel(GA, DAG);
  else
    return LowerToTLSExecModels(GA, DAG);
}

SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  if (RelocM == Reloc::PIC_) {
    bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
    ARMConstantPoolValue *CPV =
      new ARMConstantPoolValue(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                                 CPAddr,
                                 MachinePointerInfo::getConstantPool(),
                                 false, false, 0);
    SDValue Chain = Result.getValue(1);
    SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
    if (!UseGOTOFF)
      Result = DAG.getLoad(PtrVT, dl, Chain, Result,
                           MachinePointerInfo::getGOT(), false, false, 0);
    return Result;
  }

  // If we have T2 ops, we can materialize the address directly via a
  // movt/movw pair. This is always cheaper.
  if (Subtarget->useMovt()) {
    ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  } else {
    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                       MachinePointerInfo::getConstantPool(),
                       false, false, 0);
  }
}

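// The movt/movw path above materializes the address inline, roughly
// (illustrative assembly):
//   movw r0, :lower16:g
//   movt r0, :upper16:g
// while the constant-pool path loads the address with a pc-relative ldr.
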
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  if (Subtarget->useMovt()) {
    ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    if (RelocM == Reloc::Static)
      return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                         DAG.getTargetGlobalAddress(GV, dl, PtrVT));

    unsigned Wrapper = (RelocM == Reloc::PIC_)
      ? ARMISD::WrapperPIC : ARMISD::WrapperDYN;
    SDValue Result = DAG.getNode(Wrapper, dl, PtrVT,
                                 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
    if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
      Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                           MachinePointerInfo::getGOT(), false, false, 0);
    return Result;
  }

  unsigned ARMPCLabelIndex = 0;
  SDValue CPAddr;
  if (RelocM == Reloc::Static) {
    CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
  } else {
    ARMPCLabelIndex = AFI->createPICLabelUId();
    unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
    ARMConstantPoolValue *CPV =
      new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);

  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                               MachinePointerInfo::getConstantPool(),
                               false, false, 0);
  SDValue Chain = Result.getValue(1);

  if (RelocM == Reloc::PIC_) {
    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
    Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
  }

  if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
    Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(),
                         false, false, 0);

  return Result;
}

SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() &&
         "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  EVT PtrVT = getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
  ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
                                                       "_GLOBAL_OFFSET_TABLE_",
                                                       ARMPCLabelIndex, PCAdj);
  SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                               MachinePointerInfo::getConstantPool(),
                               false, false, 0);
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
}

SDValue
ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG)
  const {
  DebugLoc dl = Op.getDebugLoc();
  return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other,
                     Op.getOperand(0));
}

SDValue
ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  SDValue Val = DAG.getConstant(0, MVT::i32);
  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0),
                     Op.getOperand(1), Val);
}

SDValue
ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(0, MVT::i32));
}

SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                          const ARMSubtarget *Subtarget) const {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  DebugLoc dl = Op.getDebugLoc();
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::arm_thread_pointer: {
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy();
    DebugLoc dl = Op.getDebugLoc();
    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
    SDValue CPAddr;
    unsigned PCAdj = (RelocM != Reloc::PIC_)
      ? 0 : (Subtarget->isThumb() ? 4 : 8);
    ARMConstantPoolValue *CPV =
      new ARMConstantPoolValue(MF.getFunction(), ARMPCLabelIndex,
                               ARMCP::CPLSDA, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                  MachinePointerInfo::getConstantPool(),
                  false, false, 0);

    if (RelocM == Reloc::PIC_) {
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
    }
    return Result;
  }
  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
      ? ARMISD::VMULLs : ARMISD::VMULLu;
    return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  }
}

static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *Subtarget) {
  DebugLoc dl = Op.getDebugLoc();
  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 CPUs can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                       DAG.getConstant(0, MVT::i32));
  }

  SDValue Op5 = Op.getOperand(5);
  bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0;
  unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0);

  ARM_MB::MemBOpt DMBOpt;
  if (isDeviceBarrier)
    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY;
  else
    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH;
  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(DMBOpt, MVT::i32));
}

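// Presumed mapping of the options chosen above to barrier instructions (a
// sketch; the authoritative encodings are in the .td files):
//   ARM_MB::SY  -> "dmb sy"      ARM_MB::ST    -> "dmb st"
//   ARM_MB::ISH -> "dmb ish"     ARM_MB::ISHST -> "dmb ishst"
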
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *Subtarget) {
  // ARM pre-v5TE and Thumb1 do not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  DebugLoc dl = Op.getDebugLoc();
  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // Only ARMv7 with the MP extension has PLDW; just preserve the chain.
    return Op.getOperand(0);

  if (Subtarget->isThumb())
    // Invert the bits.
    isRead = ~isRead & 1;
  unsigned isData = Subtarget->isThumb() ? 0 : 1;

  // Currently there is no intrinsic that matches pli.
  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
                     DAG.getConstant(isData, MVT::i32));
}

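// Illustrative outcome (a sketch): a data-read prefetch selects to
// "pld [rN]", and a write prefetch needs the v7 MP extension to become
// "pldw [rN]"; the isRead/isData fix-ups above appear to adapt the node's
// operands to the Thumb encodings (see the pli note above).
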
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  DebugLoc dl = Op.getDebugLoc();
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV), false, false, 0);
}

SDValue
ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                        SDValue &Root, SelectionDAG &DAG,
                                        DebugLoc dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = ARM::tGPRRegisterClass;
  else
    RC = ARM::GPRRegisterClass;

  // Transform the arguments stored in physical registers into virtual ones.
  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    MachineFrameInfo *MFI = MF.getFrameInfo();
    int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
    ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
                            MachinePointerInfo::getFixedStack(FI),
                            false, false, 0);
  } else {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }

  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}

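// For illustration: an f64 formal argument assigned to a GPR pair is rebuilt
// by the VMOVDRR above, which selects to something like "vmov d0, r0, r1";
// in the register-plus-stack case the high half comes from the fixed-object
// load instead.
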
SDValue
ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv, bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg>
                                          &Ins,
                                        DebugLoc dl, SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals)
                                          const {

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();

  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins,
                                CCAssignFnForNode(CallConv, /*Return*/ false,
                                                  isVarArg));

  SmallVector<SDValue, 16> ArgValues;
  int lastInsIndex = -1;

  SDValue ArgValue;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();

      if (VA.needsCustom()) {
        // f64 and vector types are split up into multiple registers or
        // combinations of registers and stack slots.
        if (VA.getLocVT() == MVT::v2f64) {
          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
                                                   Chain, DAG, dl);
          VA = ArgLocs[++i]; // skip ahead to next loc
          SDValue ArgValue2;
          if (VA.isMemLoc()) {
            int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
                                    MachinePointerInfo::getFixedStack(FI),
                                    false, false, 0);
          } else {
            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
                                             Chain, DAG, dl);
          }
          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
        } else
          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);

      } else {
        TargetRegisterClass *RC;

        if (RegVT == MVT::f32)
          RC = ARM::SPRRegisterClass;
        else if (RegVT == MVT::f64)
          RC = ARM::DPRRegisterClass;
        else if (RegVT == MVT::v2f64)
          RC = ARM::QPRRegisterClass;
        else if (RegVT == MVT::i32)
          RC = (AFI->isThumb1OnlyFunction() ?
                ARM::tGPRRegisterClass : ARM::GPRRegisterClass);
        else
          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

        // Transform the arguments in physical registers into virtual ones.
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted
      // to 32 bits.  Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default: llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full: break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::SExt:
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::ZExt:
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
        break;
      }

      InVals.push_back(ArgValue);

    } else { // !VA.isRegLoc()

      // Sanity check.
      assert(VA.isMemLoc());
      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");

      int index = ArgLocs[i].getValNo();

      // Some Ins[] entries become multiple ArgLoc[] entries.
      // Process them only once.
      if (index != lastInsIndex) {
        ISD::ArgFlagsTy Flags = Ins[index].Flags;
        // FIXME: For now, all byval parameter objects are marked mutable.
        // This can be changed with more analysis.  In case of tail call
        // optimization, mark all arguments mutable, since they could be
        // overwritten by the lowering of the arguments of a tail call.
        if (Flags.isByVal()) {
          unsigned Bytes = Flags.getByValSize();
          if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
          int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), false);
          InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
        } else {
          int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                          VA.getLocMemOffset(), true);

          // Create load nodes to retrieve arguments from the stack.
          SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
          InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                       MachinePointerInfo::getFixedStack(FI),
                                       false, false, 0));
        }
        lastInsIndex = index;
      }
    }
  }

2418  // varargs
2419  if (isVarArg) {
2420    static const unsigned GPRArgRegs[] = {
2421      ARM::R0, ARM::R1, ARM::R2, ARM::R3
2422    };
2423
2424    unsigned NumGPRs = CCInfo.getFirstUnallocated
2425      (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
2426
2427    unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
2428    unsigned VARegSize = (4 - NumGPRs) * 4;
2429    unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
2430    unsigned ArgOffset = CCInfo.getNextStackOffset();
2431    if (VARegSaveSize) {
2432      // If this function is vararg, store any remaining integer argument regs
2433      // to their spots on the stack so that they may be loaded by dereferencing
2434      // the result of va_next.
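      // For example, in f(int a, ...) only r0 carries a named argument, so
      // r1-r3 (VARegSize == 12 bytes) get stored to this register-save area
      // and the frame index below points at the saved r1.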
2435      AFI->setVarArgsRegSaveSize(VARegSaveSize);
2436      AFI->setVarArgsFrameIndex(
2437        MFI->CreateFixedObject(VARegSaveSize,
2438                               ArgOffset + VARegSaveSize - VARegSize,
2439                               false));
2440      SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(),
2441                                      getPointerTy());
2442
2443      SmallVector<SDValue, 4> MemOps;
2444      for (; NumGPRs < 4; ++NumGPRs) {
2445        TargetRegisterClass *RC;
2446        if (AFI->isThumb1OnlyFunction())
2447          RC = ARM::tGPRRegisterClass;
2448        else
2449          RC = ARM::GPRRegisterClass;
2450
2451        unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
2452        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
2453        SDValue Store =
2454          DAG.getStore(Val.getValue(1), dl, Val, FIN,
2455               MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()),
2456                       false, false, 0);
2457        MemOps.push_back(Store);
2458        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
2459                          DAG.getConstant(4, getPointerTy()));
2460      }
2461      if (!MemOps.empty())
2462        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2463                            &MemOps[0], MemOps.size());
2464    } else
2465      // This will point to the next argument passed via the stack.
2466      AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
2467  }
2468
2469  return Chain;
2470}
2471
2472/// isFloatingPointZero - Return true if this is +0.0.
2473static bool isFloatingPointZero(SDValue Op) {
2474  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
2475    return CFP->getValueAPF().isPosZero();
2476  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
2477    // Maybe this has already been legalized into the constant pool?
2478    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
2479      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
2480      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
2481        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
2482          return CFP->getValueAPF().isPosZero();
2483    }
2484  }
2485  return false;
2486}
2487
2488/// Returns the appropriate ARM CMP (cmp) and the corresponding condition code
2489/// for the given operands.
2490SDValue
2491ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2492                             SDValue &ARMcc, SelectionDAG &DAG,
2493                             DebugLoc dl) const {
2494  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2495    unsigned C = RHSC->getZExtValue();
2496    if (!isLegalICmpImmediate(C)) {
2497      // Constant does not fit, try adjusting it by one?
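      // For example, on ARM "x < 257" cannot use 257 (0x101) as a compare
      // immediate, but 256 can be encoded, so the comparison is rewritten
      // as "x <= 256" (SETLT -> SETLE with the constant decremented).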
2498      switch (CC) {
2499      default: break;
2500      case ISD::SETLT:
2501      case ISD::SETGE:
2502        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
2503          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2504          RHS = DAG.getConstant(C-1, MVT::i32);
2505        }
2506        break;
2507      case ISD::SETULT:
2508      case ISD::SETUGE:
2509        if (C != 0 && isLegalICmpImmediate(C-1)) {
2510          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2511          RHS = DAG.getConstant(C-1, MVT::i32);
2512        }
2513        break;
2514      case ISD::SETLE:
2515      case ISD::SETGT:
2516        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
2517          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2518          RHS = DAG.getConstant(C+1, MVT::i32);
2519        }
2520        break;
2521      case ISD::SETULE:
2522      case ISD::SETUGT:
2523        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
2524          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2525          RHS = DAG.getConstant(C+1, MVT::i32);
2526        }
2527        break;
2528      }
2529    }
2530  }
2531
2532  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
2533  ARMISD::NodeType CompareType;
2534  switch (CondCode) {
2535  default:
2536    CompareType = ARMISD::CMP;
2537    break;
2538  case ARMCC::EQ:
2539  case ARMCC::NE:
2540    // Uses only the Z flag.
2541    CompareType = ARMISD::CMPZ;
2542    break;
2543  }
2544  ARMcc = DAG.getConstant(CondCode, MVT::i32);
2545  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
2546}
2547
2548/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
2549SDValue
2550ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
2551                             DebugLoc dl) const {
2552  SDValue Cmp;
2553  if (!isFloatingPointZero(RHS))
2554    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
2555  else
2556    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
2557  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
2558}
2559
2560/// duplicateCmp - Glue values can have only one use, so this function
2561/// duplicates a comparison node.
2562SDValue
2563ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
2564  unsigned Opc = Cmp.getOpcode();
2565  DebugLoc DL = Cmp.getDebugLoc();
2566  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
2567    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
2568
2569  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
2570  Cmp = Cmp.getOperand(0);
2571  Opc = Cmp.getOpcode();
2572  if (Opc == ARMISD::CMPFP)
2573    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
2574  else {
2575    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
2576    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
2577  }
2578  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
2579}
2580
2581SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
2582  SDValue Cond = Op.getOperand(0);
2583  SDValue SelectTrue = Op.getOperand(1);
2584  SDValue SelectFalse = Op.getOperand(2);
2585  DebugLoc dl = Op.getDebugLoc();
2586
2587  // Convert:
2588  //
2589  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
2590  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
2591  //
2592  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
2593    const ConstantSDNode *CMOVTrue =
2594      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
2595    const ConstantSDNode *CMOVFalse =
2596      dyn_cast<ConstantSDNode>(Cond.getOperand(1));
2597
2598    if (CMOVTrue && CMOVFalse) {
2599      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
2600      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
2601
2602      SDValue True;
2603      SDValue False;
2604      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
2605        True = SelectTrue;
2606        False = SelectFalse;
2607      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
2608        True = SelectFalse;
2609        False = SelectTrue;
2610      }
2611
2612      if (True.getNode() && False.getNode()) {
2613        EVT VT = Cond.getValueType();
2614        SDValue ARMcc = Cond.getOperand(2);
2615        SDValue CCR = Cond.getOperand(3);
2616        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
2617        return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
2618      }
2619    }
2620  }
2621
2622  return DAG.getSelectCC(dl, Cond,
2623                         DAG.getConstant(0, Cond.getValueType()),
2624                         SelectTrue, SelectFalse, ISD::SETNE);
2625}
2626
2627SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
2628  EVT VT = Op.getValueType();
2629  SDValue LHS = Op.getOperand(0);
2630  SDValue RHS = Op.getOperand(1);
2631  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
2632  SDValue TrueVal = Op.getOperand(2);
2633  SDValue FalseVal = Op.getOperand(3);
2634  DebugLoc dl = Op.getDebugLoc();
2635
2636  if (LHS.getValueType() == MVT::i32) {
2637    SDValue ARMcc;
2638    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
2639    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
2640    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
2641  }
2642
2643  ARMCC::CondCodes CondCode, CondCode2;
2644  FPCCToARMCC(CC, CondCode, CondCode2);
2645
2646  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
2647  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
2648  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
2649  SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
2650                               ARMcc, CCR, Cmp);
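  // Some FP condition codes (e.g. one-ordered or unordered-equal) cannot be
  // tested with a single ARM condition, so FPCCToARMCC returns a second
  // condition code for those; refine the result with a second CMOV below.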
2651  if (CondCode2 != ARMCC::AL) {
2652    SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
2653    // FIXME: Needs another CMP because flag can have but one use.
2654    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
2655    Result = DAG.getNode(ARMISD::CMOV, dl, VT,
2656                         Result, TrueVal, ARMcc2, CCR, Cmp2);
2657  }
2658  return Result;
2659}
2660
2661/// canChangeToInt - Given the fp compare operand, return true if it is suitable
2662/// to morph to an integer compare sequence.
2663static bool canChangeToInt(SDValue Op, bool &SeenZero,
2664                           const ARMSubtarget *Subtarget) {
2665  SDNode *N = Op.getNode();
2666  if (!N->hasOneUse())
2667    // Otherwise it requires moving the value from fp to integer registers.
2668    return false;
2669  if (!N->getNumValues())
2670    return false;
2671  EVT VT = Op.getValueType();
2672  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
2673    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
2674    // vmrs are very slow, e.g. cortex-a8.
2675    return false;
2676
2677  if (isFloatingPointZero(Op)) {
2678    SeenZero = true;
2679    return true;
2680  }
2681  return ISD::isNormalLoad(N);
2682}
2683
2684static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
2685  if (isFloatingPointZero(Op))
2686    return DAG.getConstant(0, MVT::i32);
2687
2688  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
2689    return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
2690                       Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
2691                       Ld->isVolatile(), Ld->isNonTemporal(),
2692                       Ld->getAlignment());
2693
2694  llvm_unreachable("Unknown VFP cmp argument!");
2695}
2696
2697static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
2698                           SDValue &RetVal1, SDValue &RetVal2) {
2699  if (isFloatingPointZero(Op)) {
2700    RetVal1 = DAG.getConstant(0, MVT::i32);
2701    RetVal2 = DAG.getConstant(0, MVT::i32);
2702    return;
2703  }
2704
2705  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
2706    SDValue Ptr = Ld->getBasePtr();
2707    RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
2708                          Ld->getChain(), Ptr,
2709                          Ld->getPointerInfo(),
2710                          Ld->isVolatile(), Ld->isNonTemporal(),
2711                          Ld->getAlignment());
2712
2713    EVT PtrType = Ptr.getValueType();
2714    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
2715    SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(),
2716                                 PtrType, Ptr, DAG.getConstant(4, PtrType));
2717    RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
2718                          Ld->getChain(), NewPtr,
2719                          Ld->getPointerInfo().getWithOffset(4),
2720                          Ld->isVolatile(), Ld->isNonTemporal(),
2721                          NewAlign);
2722    return;
2723  }
2724
2725  llvm_unreachable("Unknown VFP cmp argument!");
2726}
2727
2728/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
2729/// f32 and even f64 comparisons to integer ones.
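/// For example, an f32 equality test against a loaded value can be done on
/// the raw i32 bits instead.  Bit equality and IEEE equality disagree for
/// +/-0.0 and for NaNs, which is why this is restricted to EQ/NE comparisons
/// under unsafe-fp-math (or when the operands are known not to be NaN).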
2730SDValue
2731ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
2732  SDValue Chain = Op.getOperand(0);
2733  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
2734  SDValue LHS = Op.getOperand(2);
2735  SDValue RHS = Op.getOperand(3);
2736  SDValue Dest = Op.getOperand(4);
2737  DebugLoc dl = Op.getDebugLoc();
2738
2739  bool SeenZero = false;
2740  if (canChangeToInt(LHS, SeenZero, Subtarget) &&
2741      canChangeToInt(RHS, SeenZero, Subtarget) &&
2742      // If one of the operands is zero, it's safe to ignore the NaN case since
2743      // we only care about equality comparisons.
2744      (SeenZero || (DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS)))) {
2745    // If unsafe fp math optimization is enabled and there are no other uses of
2746    // the CMP operands, and the condition code is EQ or NE, we can optimize it
2747    // to an integer comparison.
2748    if (CC == ISD::SETOEQ)
2749      CC = ISD::SETEQ;
2750    else if (CC == ISD::SETUNE)
2751      CC = ISD::SETNE;
2752
2753    SDValue ARMcc;
2754    if (LHS.getValueType() == MVT::f32) {
2755      LHS = bitcastf32Toi32(LHS, DAG);
2756      RHS = bitcastf32Toi32(RHS, DAG);
2757      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
2758      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
2759      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
2760                         Chain, Dest, ARMcc, CCR, Cmp);
2761    }
2762
2763    SDValue LHS1, LHS2;
2764    SDValue RHS1, RHS2;
2765    expandf64Toi32(LHS, DAG, LHS1, LHS2);
2766    expandf64Toi32(RHS, DAG, RHS1, RHS2);
2767    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
2768    ARMcc = DAG.getConstant(CondCode, MVT::i32);
2769    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
2770    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
2771    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7);
2772  }
2773
2774  return SDValue();
2775}
2776
2777SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
2778  SDValue Chain = Op.getOperand(0);
2779  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
2780  SDValue LHS = Op.getOperand(2);
2781  SDValue RHS = Op.getOperand(3);
2782  SDValue Dest = Op.getOperand(4);
2783  DebugLoc dl = Op.getDebugLoc();
2784
2785  if (LHS.getValueType() == MVT::i32) {
2786    SDValue ARMcc;
2787    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
2788    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
2789    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
2790                       Chain, Dest, ARMcc, CCR, Cmp);
2791  }
2792
2793  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
2794
2795  if (UnsafeFPMath &&
2796      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
2797       CC == ISD::SETNE || CC == ISD::SETUNE)) {
2798    SDValue Result = OptimizeVFPBrcond(Op, DAG);
2799    if (Result.getNode())
2800      return Result;
2801  }
2802
2803  ARMCC::CondCodes CondCode, CondCode2;
2804  FPCCToARMCC(CC, CondCode, CondCode2);
2805
2806  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
2807  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
2808  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
2809  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
2810  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
2811  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
2812  if (CondCode2 != ARMCC::AL) {
2813    ARMcc = DAG.getConstant(CondCode2, MVT::i32);
2814    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
2815    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
2816  }
2817  return Res;
2818}
2819
2820SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
2821  SDValue Chain = Op.getOperand(0);
2822  SDValue Table = Op.getOperand(1);
2823  SDValue Index = Op.getOperand(2);
2824  DebugLoc dl = Op.getDebugLoc();
2825
2826  EVT PTy = getPointerTy();
2827  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
2828  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2829  SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
2830  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
2831  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
2832  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
2833  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
2834  if (Subtarget->isThumb2()) {
2835    // Thumb2 uses a two-level jump. That is, it jumps into the jump table
2836    // which does another jump to the destination. This also makes it easier
2837    // to translate it to TBB / TBH later.
2838    // FIXME: This might not work if the function is extremely large.
2839    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
2840                       Addr, Op.getOperand(2), JTI, UId);
2841  }
2842  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2843    Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
2844                       MachinePointerInfo::getJumpTable(),
2845                       false, false, 0);
2846    Chain = Addr.getValue(1);
2847    Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
2848    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
2849  } else {
2850    Addr = DAG.getLoad(PTy, dl, Chain, Addr,
2851                       MachinePointerInfo::getJumpTable(), false, false, 0);
2852    Chain = Addr.getValue(1);
2853    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
2854  }
2855}
2856
2857static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
2858  DebugLoc dl = Op.getDebugLoc();
2859  unsigned Opc;
2860
2861  switch (Op.getOpcode()) {
2862  default:
2863    llvm_unreachable("Invalid opcode!");
2864  case ISD::FP_TO_SINT:
2865    Opc = ARMISD::FTOSI;
2866    break;
2867  case ISD::FP_TO_UINT:
2868    Opc = ARMISD::FTOUI;
2869    break;
2870  }
2871  Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
2872  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
2873}
2874
2875static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
2876  EVT VT = Op.getValueType();
2877  DebugLoc dl = Op.getDebugLoc();
2878
2879  EVT OperandVT = Op.getOperand(0).getValueType();
2880  assert(OperandVT == MVT::v4i16 && "Invalid type for custom lowering!");
2881  if (VT != MVT::v4f32)
2882    return DAG.UnrollVectorOp(Op.getNode());
2883
2884  unsigned CastOpc;
2885  unsigned Opc;
2886  switch (Op.getOpcode()) {
2887  default:
2888    llvm_unreachable("Invalid opcode!");
2889  case ISD::SINT_TO_FP:
2890    CastOpc = ISD::SIGN_EXTEND;
2891    Opc = ISD::SINT_TO_FP;
2892    break;
2893  case ISD::UINT_TO_FP:
2894    CastOpc = ISD::ZERO_EXTEND;
2895    Opc = ISD::UINT_TO_FP;
2896    break;
2897  }
2898
2899  Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
2900  return DAG.getNode(Opc, dl, VT, Op);
2901}
2902
2903static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
2904  EVT VT = Op.getValueType();
2905  if (VT.isVector())
2906    return LowerVectorINT_TO_FP(Op, DAG);
2907
2908  DebugLoc dl = Op.getDebugLoc();
2909  unsigned Opc;
2910
2911  switch (Op.getOpcode()) {
2912  default:
2913    llvm_unreachable("Invalid opcode!");
2914  case ISD::SINT_TO_FP:
2915    Opc = ARMISD::SITOF;
2916    break;
2917  case ISD::UINT_TO_FP:
2918    Opc = ARMISD::UITOF;
2919    break;
2920  }
2921
2922  Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
2923  return DAG.getNode(Opc, dl, VT, Op);
2924}
2925
2926SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
2927  // Implement fcopysign with a fabs and a conditional fneg.
2928  SDValue Tmp0 = Op.getOperand(0);
2929  SDValue Tmp1 = Op.getOperand(1);
2930  DebugLoc dl = Op.getDebugLoc();
2931  EVT VT = Op.getValueType();
2932  EVT SrcVT = Tmp1.getValueType();
2933  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
2934    Tmp0.getOpcode() == ARMISD::VMOVDRR;
2935  bool UseNEON = !InGPR && Subtarget->hasNEON();
2936
2937  if (UseNEON) {
2938    // Use VBSL to copy the sign bit.
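    // The modified immediate below (Op=0, Cmode=0110, Imm=0x80) encodes a
    // v2i32 splat of 0x80000000, i.e. just the f32 sign bit; for f64 the
    // mask is shifted left by 32 so it covers bit 63 instead.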
2939    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
2940    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
2941                               DAG.getTargetConstant(EncodedVal, MVT::i32));
2942    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
2943    if (VT == MVT::f64)
2944      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
2945                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
2946                         DAG.getConstant(32, MVT::i32));
2947    else /*if (VT == MVT::f32)*/
2948      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
2949    if (SrcVT == MVT::f32) {
2950      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
2951      if (VT == MVT::f64)
2952        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
2953                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
2954                           DAG.getConstant(32, MVT::i32));
2955    }
2956    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
2957    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
2958
2959    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
2960                                            MVT::i32);
2961    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
2962    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
2963                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
2964
2965    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
2966                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
2967                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
2968    if (VT == MVT::f32) {
2969      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
2970      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
2971                        DAG.getConstant(0, MVT::i32));
2972    } else {
2973      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
2974    }
2975
2976    return Res;
2977  }
2978
2979  // Bitcast operand 1 to i32.
2980  if (SrcVT == MVT::f64)
2981    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
2982                       &Tmp1, 1).getValue(1);
2983  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
2984
2985  // OR in the sign bit with integer operations.
2986  SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
2987  SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
2988  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
2989  if (VT == MVT::f32) {
2990    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
2991                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
2992    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
2993                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
2994  }
2995
2996  // f64: OR the high part with the sign bit, then combine the two parts.
2997  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
2998                     &Tmp0, 1);
2999  SDValue Lo = Tmp0.getValue(0);
3000  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
3001  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
3002  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
3003}
3004
3005SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
3006  MachineFunction &MF = DAG.getMachineFunction();
3007  MachineFrameInfo *MFI = MF.getFrameInfo();
3008  MFI->setReturnAddressIsTaken(true);
3009
3010  EVT VT = Op.getValueType();
3011  DebugLoc dl = Op.getDebugLoc();
3012  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3013  if (Depth) {
3014    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
3015    SDValue Offset = DAG.getConstant(4, MVT::i32);
3016    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
3017                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
3018                       MachinePointerInfo(), false, false, 0);
3019  }
3020
3021  // Return LR, which contains the return address. Mark it an implicit live-in.
3022  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3023  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
3024}
3025
3026SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
3027  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3028  MFI->setFrameAddressIsTaken(true);
3029
3030  EVT VT = Op.getValueType();
3031  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
3032  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3033  unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
3034    ? ARM::R7 : ARM::R11;
3035  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
3036  while (Depth--)
3037    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
3038                            MachinePointerInfo(),
3039                            false, false, 0);
3040  return FrameAddr;
3041}
3042
3043/// ExpandBITCAST - If the target supports VFP, this function is called to
3044/// expand a bit convert where either the source or destination type is i64 to
3045/// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
3046/// operand type is illegal (e.g., v2f32 for a target that doesn't support
3047/// vectors), since the legalizer won't know what to do with that.
3048static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
3049  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3050  DebugLoc dl = N->getDebugLoc();
3051  SDValue Op = N->getOperand(0);
3052
3053  // This function is only supposed to be called for i64 types, either as the
3054  // source or destination of the bit convert.
3055  EVT SrcVT = Op.getValueType();
3056  EVT DstVT = N->getValueType(0);
3057  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
3058         "ExpandBITCAST called for non-i64 type");
3059
3060  // Turn i64->f64 into VMOVDRR.
3061  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
3062    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
3063                             DAG.getConstant(0, MVT::i32));
3064    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
3065                             DAG.getConstant(1, MVT::i32));
3066    return DAG.getNode(ISD::BITCAST, dl, DstVT,
3067                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
3068  }
3069
3070  // Turn f64->i64 into VMOVRRD.
3071  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
3072    SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
3073                              DAG.getVTList(MVT::i32, MVT::i32), &Op, 1);
3074    // Merge the pieces into a single i64 value.
3075    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
3076  }
3077
3078  return SDValue();
3079}
3080
3081/// getZeroVector - Returns a vector of specified type with all zero elements.
3082/// Zero vectors are used to represent vector negation and in those cases
3083/// will be implemented with the NEON VNEG instruction.  However, VNEG does
3084/// not support i64 elements, so sometimes the zero vectors will need to be
3085/// explicitly constructed.  Regardless, use a canonical VMOV to create the
3086/// zero vector.
3087static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3088  assert(VT.isVector() && "Expected a vector type");
3089  // The canonical modified immediate encoding of a zero vector is... 0!
3090  SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
3091  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
3092  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
3093  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
3094}
3095
3096/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
3097/// i32 values and take a 2 x i32 value to shift plus a shift amount.
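/// Assuming a 32-bit VT, the lowering computes, roughly:
///   Lo = ShAmt < 32 ? (lo >> ShAmt) | (hi << (32 - ShAmt))
///                   : hi >> (ShAmt - 32)   (selected by CMOV on ShAmt - 32)
///   Hi = hi >> ShAmt  (SRL or SRA to match the opcode)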
3098SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
3099                                                SelectionDAG &DAG) const {
3100  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3101  EVT VT = Op.getValueType();
3102  unsigned VTBits = VT.getSizeInBits();
3103  DebugLoc dl = Op.getDebugLoc();
3104  SDValue ShOpLo = Op.getOperand(0);
3105  SDValue ShOpHi = Op.getOperand(1);
3106  SDValue ShAmt  = Op.getOperand(2);
3107  SDValue ARMcc;
3108  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
3109
3110  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
3111
3112  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
3113                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
3114  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
3115  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
3116                                   DAG.getConstant(VTBits, MVT::i32));
3117  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
3118  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3119  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
3120
3121  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3122  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
3123                          ARMcc, DAG, dl);
3124  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
3125  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
3126                           CCR, Cmp);
3127
3128  SDValue Ops[2] = { Lo, Hi };
3129  return DAG.getMergeValues(Ops, 2, dl);
3130}
3131
3132/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
3133/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
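/// Assuming a 32-bit VT, this mirrors LowerShiftRightParts:
///   Hi = ShAmt < 32 ? (hi << ShAmt) | (lo >> (32 - ShAmt))
///                   : lo << (ShAmt - 32)
///   Lo = lo << ShAmt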
3134SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
3135                                               SelectionDAG &DAG) const {
3136  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
3137  EVT VT = Op.getValueType();
3138  unsigned VTBits = VT.getSizeInBits();
3139  DebugLoc dl = Op.getDebugLoc();
3140  SDValue ShOpLo = Op.getOperand(0);
3141  SDValue ShOpHi = Op.getOperand(1);
3142  SDValue ShAmt  = Op.getOperand(2);
3143  SDValue ARMcc;
3144
3145  assert(Op.getOpcode() == ISD::SHL_PARTS);
3146  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
3147                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
3148  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
3149  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
3150                                   DAG.getConstant(VTBits, MVT::i32));
3151  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
3152  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
3153
3154  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
3155  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
3156  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
3157                          ARMcc, DAG, dl);
3158  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
3159  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
3160                           CCR, Cmp);
3161
3162  SDValue Ops[2] = { Lo, Hi };
3163  return DAG.getMergeValues(Ops, 2, dl);
3164}
3165
3166SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
3167                                            SelectionDAG &DAG) const {
3168  // The rounding mode is in bits 23:22 of the FPSCR.
3169  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
3170  // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
3171  // so that the shift and the AND get folded into a bitfield extract.
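  // For example, round-to-nearest has FPSCR bits 23:22 == 0, and
  // ((0 + (1 << 22)) >> 22) & 3 == 1, which is FLT_ROUNDS' to-nearest value.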
3172  DebugLoc dl = Op.getDebugLoc();
3173  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
3174                              DAG.getConstant(Intrinsic::arm_get_fpscr,
3175                                              MVT::i32));
3176  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
3177                                  DAG.getConstant(1U << 22, MVT::i32));
3178  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
3179                              DAG.getConstant(22, MVT::i32));
3180  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
3181                     DAG.getConstant(3, MVT::i32));
3182}
3183
3184static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
3185                         const ARMSubtarget *ST) {
3186  EVT VT = N->getValueType(0);
3187  DebugLoc dl = N->getDebugLoc();
3188
3189  if (!ST->hasV6T2Ops())
3190    return SDValue();
3191
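  // CTTZ(x) == CTLZ(bit-reverse(x)); RBIT computes the bit reverse and is
  // only available from ARMv6T2 on, hence the check above.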
3192  SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
3193  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
3194}
3195
3196static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
3197                          const ARMSubtarget *ST) {
3198  EVT VT = N->getValueType(0);
3199  DebugLoc dl = N->getDebugLoc();
3200
3201  if (!VT.isVector())
3202    return SDValue();
3203
3204  // Lower vector shifts on NEON to use VSHL.
3205  assert(ST->hasNEON() && "unexpected vector shift");
3206
3207  // Left shifts translate directly to the vshiftu intrinsic.
3208  if (N->getOpcode() == ISD::SHL)
3209    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
3210                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
3211                       N->getOperand(0), N->getOperand(1));
3212
3213  assert((N->getOpcode() == ISD::SRA ||
3214          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
3215
3216  // NEON uses the same intrinsics for both left and right shifts.  For
3217  // right shifts, the shift amounts are negative, so negate the vector of
3218  // shift amounts.
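  // For example, an SRL by a splat of 3 becomes vshiftu with a shift-amount
  // vector of all -3s.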
3219  EVT ShiftVT = N->getOperand(1).getValueType();
3220  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
3221                                     getZeroVector(ShiftVT, DAG, dl),
3222                                     N->getOperand(1));
3223  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
3224                             Intrinsic::arm_neon_vshifts :
3225                             Intrinsic::arm_neon_vshiftu);
3226  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
3227                     DAG.getConstant(vshiftInt, MVT::i32),
3228                     N->getOperand(0), NegatedCount);
3229}
3230
3231static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
3232                                const ARMSubtarget *ST) {
3233  EVT VT = N->getValueType(0);
3234  DebugLoc dl = N->getDebugLoc();
3235
3236  // We can get here for a node like i32 = ISD::SHL i32, i64
3237  if (VT != MVT::i64)
3238    return SDValue();
3239
3240  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
3241         "Unknown shift to lower!");
3242
3243  // We only lower SRA and SRL by 1 here; all others use generic lowering.
3244  if (!isa<ConstantSDNode>(N->getOperand(1)) ||
3245      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
3246    return SDValue();
3247
3248  // If we are in thumb mode, we don't have RRX.
3249  if (ST->isThumb1Only()) return SDValue();
3250
3251  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
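  // Conceptually, for SRL this is "lsrs hi, hi, #1" (bit 0 of hi lands in
  // the carry flag) followed by "rrx lo, lo" (the carry shifts into bit 31
  // of lo).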
3252  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
3253                           DAG.getConstant(0, MVT::i32));
3254  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
3255                           DAG.getConstant(1, MVT::i32));
3256
3257  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
3258  // captures the result into a carry flag.
3259  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
3260  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1);
3261
3262  // The low part is an ARMISD::RRX operand, which shifts the carry in.
3263  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
3264
3265  // Merge the pieces into a single i64 value.
3266  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
3267}
3268
3269static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
3270  SDValue TmpOp0, TmpOp1;
3271  bool Invert = false;
3272  bool Swap = false;
3273  unsigned Opc = 0;
3274
3275  SDValue Op0 = Op.getOperand(0);
3276  SDValue Op1 = Op.getOperand(1);
3277  SDValue CC = Op.getOperand(2);
3278  EVT VT = Op.getValueType();
3279  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
3280  DebugLoc dl = Op.getDebugLoc();
3281
3282  if (Op.getOperand(1).getValueType().isFloatingPoint()) {
3283    switch (SetCCOpcode) {
3284    default: llvm_unreachable("Illegal FP comparison");
3285    case ISD::SETUNE:
3286    case ISD::SETNE:  Invert = true; // Fallthrough
3287    case ISD::SETOEQ:
3288    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
3289    case ISD::SETOLT:
3290    case ISD::SETLT: Swap = true; // Fallthrough
3291    case ISD::SETOGT:
3292    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
3293    case ISD::SETOLE:
3294    case ISD::SETLE:  Swap = true; // Fallthrough
3295    case ISD::SETOGE:
3296    case ISD::SETGE: Opc = ARMISD::VCGE; break;
3297    case ISD::SETUGE: Swap = true; // Fallthrough
3298    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
3299    case ISD::SETUGT: Swap = true; // Fallthrough
3300    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
3301    case ISD::SETUEQ: Invert = true; // Fallthrough
3302    case ISD::SETONE:
3303      // Expand this to (OLT | OGT).
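      // (SETUEQ reaches here with Invert set, since UEQ == !(OLT | OGT).)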
3304      TmpOp0 = Op0;
3305      TmpOp1 = Op1;
3306      Opc = ISD::OR;
3307      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
3308      Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
3309      break;
3310    case ISD::SETUO: Invert = true; // Fallthrough
3311    case ISD::SETO:
3312      // Expand this to (OLT | OGE).
3313      TmpOp0 = Op0;
3314      TmpOp1 = Op1;
3315      Opc = ISD::OR;
3316      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
3317      Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
3318      break;
3319    }
3320  } else {
3321    // Integer comparisons.
3322    switch (SetCCOpcode) {
3323    default: llvm_unreachable("Illegal integer comparison");
3324    case ISD::SETNE:  Invert = true; // Fallthrough
3325    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
3326    case ISD::SETLT:  Swap = true; // Fallthrough
3327    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
3328    case ISD::SETLE:  Swap = true; // Fallthrough
3329    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
3330    case ISD::SETULT: Swap = true; // Fallthrough
3331    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
3332    case ISD::SETULE: Swap = true; // Fallthrough
3333    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
3334    }
3335
3336    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
3337    if (Opc == ARMISD::VCEQ) {
3338
3339      SDValue AndOp;
3340      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
3341        AndOp = Op0;
3342      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
3343        AndOp = Op1;
3344
3345      // Ignore bitconvert.
3346      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
3347        AndOp = AndOp.getOperand(0);
3348
3349      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
3350        Opc = ARMISD::VTST;
3351        Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
3352        Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
3353        Invert = !Invert;
3354      }
3355    }
3356  }
3357
3358  if (Swap)
3359    std::swap(Op0, Op1);
3360
3361  // If one of the operands is a constant vector zero, attempt to fold the
3362  // comparison to a specialized compare-against-zero form.
3363  SDValue SingleOp;
3364  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
3365    SingleOp = Op0;
3366  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
3367    if (Opc == ARMISD::VCGE)
3368      Opc = ARMISD::VCLEZ;
3369    else if (Opc == ARMISD::VCGT)
3370      Opc = ARMISD::VCLTZ;
3371    SingleOp = Op1;
3372  }
3373
3374  SDValue Result;
3375  if (SingleOp.getNode()) {
3376    switch (Opc) {
3377    case ARMISD::VCEQ:
3378      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
3379    case ARMISD::VCGE:
3380      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
3381    case ARMISD::VCLEZ:
3382      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
3383    case ARMISD::VCGT:
3384      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
3385    case ARMISD::VCLTZ:
3386      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
3387    default:
3388      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
3389    }
3390  } else {
3391     Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
3392  }
3393
3394  if (Invert)
3395    Result = DAG.getNOT(dl, Result, VT);
3396
3397  return Result;
3398}
3399
3400/// isNEONModifiedImm - Check if the specified splat value corresponds to a
3401/// valid vector constant for a NEON instruction with a "modified immediate"
3402/// operand (e.g., VMOV).  If so, return the encoded value.
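/// For example, a v2i32 splat of 0x00450000 is encodable with Op=0,
/// Cmode=010x and Imm=0x45 (see the 32-bit cases below), while a splat of
/// 0x00450045 matches no supported pattern and returns an empty SDValue.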
3403static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
3404                                 unsigned SplatBitSize, SelectionDAG &DAG,
3405                                 EVT &VT, bool is128Bits, NEONModImmType type) {
3406  unsigned OpCmode, Imm;
3407
3408  // SplatBitSize is set to the smallest size that splats the vector, so a
3409  // zero vector will always have SplatBitSize == 8.  However, NEON modified
3410  // immediate instructions other than VMOV do not support the 8-bit encoding
3411  // of a zero vector, and the default encoding of zero is supposed to be the
3412  // 32-bit version.
3413  if (SplatBits == 0)
3414    SplatBitSize = 32;
3415
3416  switch (SplatBitSize) {
3417  case 8:
3418    if (type != VMOVModImm)
3419      return SDValue();
3420    // Any 1-byte value is OK.  Op=0, Cmode=1110.
3421    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
3422    OpCmode = 0xe;
3423    Imm = SplatBits;
3424    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
3425    break;
3426
3427  case 16:
3428    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
3429    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
3430    if ((SplatBits & ~0xff) == 0) {
3431      // Value = 0x00nn: Op=x, Cmode=100x.
3432      OpCmode = 0x8;
3433      Imm = SplatBits;
3434      break;
3435    }
3436    if ((SplatBits & ~0xff00) == 0) {
3437      // Value = 0xnn00: Op=x, Cmode=101x.
3438      OpCmode = 0xa;
3439      Imm = SplatBits >> 8;
3440      break;
3441    }
3442    return SDValue();
3443
3444  case 32:
3445    // NEON's 32-bit VMOV supports splat values where:
3446    // * only one byte is nonzero, or
3447    // * the least significant byte is 0xff and the second byte is nonzero, or
3448    // * the least significant 2 bytes are 0xff and the third is nonzero.
3449    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
3450    if ((SplatBits & ~0xff) == 0) {
3451      // Value = 0x000000nn: Op=x, Cmode=000x.
3452      OpCmode = 0;
3453      Imm = SplatBits;
3454      break;
3455    }
3456    if ((SplatBits & ~0xff00) == 0) {
3457      // Value = 0x0000nn00: Op=x, Cmode=001x.
3458      OpCmode = 0x2;
3459      Imm = SplatBits >> 8;
3460      break;
3461    }
3462    if ((SplatBits & ~0xff0000) == 0) {
3463      // Value = 0x00nn0000: Op=x, Cmode=010x.
3464      OpCmode = 0x4;
3465      Imm = SplatBits >> 16;
3466      break;
3467    }
3468    if ((SplatBits & ~0xff000000) == 0) {
3469      // Value = 0xnn000000: Op=x, Cmode=011x.
3470      OpCmode = 0x6;
3471      Imm = SplatBits >> 24;
3472      break;
3473    }
3474
3475    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
3476    if (type == OtherModImm) return SDValue();
3477
3478    if ((SplatBits & ~0xffff) == 0 &&
3479        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
3480      // Value = 0x0000nnff: Op=x, Cmode=1100.
3481      OpCmode = 0xc;
3482      Imm = SplatBits >> 8;
3483      SplatBits |= 0xff;
3484      break;
3485    }
3486
3487    if ((SplatBits & ~0xffffff) == 0 &&
3488        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
3489      // Value = 0x00nnffff: Op=x, Cmode=1101.
3490      OpCmode = 0xd;
3491      Imm = SplatBits >> 16;
3492      SplatBits |= 0xffff;
3493      break;
3494    }
3495
3496    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
3497    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
3498    // VMOV.I32.  A (very) minor optimization would be to replicate the value
3499    // and fall through here to test for a valid 64-bit splat.  But, then the
3500    // caller would also need to check and handle the change in size.
3501    return SDValue();
3502
3503  case 64: {
3504    if (type != VMOVModImm)
3505      return SDValue();
3506    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
3507    uint64_t BitMask = 0xff;
3508    uint64_t Val = 0;
3509    unsigned ImmMask = 1;
3510    Imm = 0;
3511    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
3512      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
3513        Val |= BitMask;
3514        Imm |= ImmMask;
3515      } else if ((SplatBits & BitMask) != 0) {
3516        return SDValue();
3517      }
3518      BitMask <<= 8;
3519      ImmMask <<= 1;
3520    }
3521    // Op=1, Cmode=1110.
3522    OpCmode = 0x1e;
3523    SplatBits = Val;
3524    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
3525    break;
3526  }
3527
3528  default:
3529    llvm_unreachable("unexpected size for isNEONModifiedImm");
3530    return SDValue();
3531  }
3532
3533  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
3534  return DAG.getTargetConstant(EncodedVal, MVT::i32);
3535}
3536
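// isVEXTMask - Check if the shuffle mask selects a contiguous run of elements
// from the concatenation (V1, V2), which is what NEON's VEXT extracts.  E.g.,
// for v8i8, mask <1,2,3,4,5,6,7,8> is VEXT(V1, V2) with Imm = 1; a mask such
// as <9,10,11,12,13,14,15,0> wraps around, so ReverseVEXT is set and it
// becomes VEXT(V2, V1) with Imm = 1.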
3537static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT,
3538                       bool &ReverseVEXT, unsigned &Imm) {
3539  unsigned NumElts = VT.getVectorNumElements();
3540  ReverseVEXT = false;
3541
3542  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
3543  if (M[0] < 0)
3544    return false;
3545
3546  Imm = M[0];
3547
3548  // If this is a VEXT shuffle, the immediate value is the index of the first
3549  // element.  The other shuffle indices must be the successive elements after
3550  // the first one.
3551  unsigned ExpectedElt = Imm;
3552  for (unsigned i = 1; i < NumElts; ++i) {
3553    // Increment the expected index.  If it wraps around, it may still be
3554    // a VEXT but the source vectors must be swapped.
3555    ExpectedElt += 1;
3556    if (ExpectedElt == NumElts * 2) {
3557      ExpectedElt = 0;
3558      ReverseVEXT = true;
3559    }
3560
3561    if (M[i] < 0) continue; // ignore UNDEF indices
3562    if (ExpectedElt != static_cast<unsigned>(M[i]))
3563      return false;
3564  }
3565
3566  // Adjust the index value if the source operands will be swapped.
3567  if (ReverseVEXT)
3568    Imm -= NumElts;
3569
3570  return true;
3571}
3572
3573/// isVREVMask - Check if a vector shuffle corresponds to a VREV
3574/// instruction with the specified blocksize.  (The order of the elements
3575/// within each block of the vector is reversed.)
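/// For example, a v4i16 mask of <3, 2, 1, 0> corresponds to VREV64.16,
/// reversing the four 16-bit elements within the 64-bit block.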
3576static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
3577                       unsigned BlockSize) {
3578  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
3579         "Only possible block sizes for VREV are: 16, 32, 64");
3580
3581  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3582  if (EltSz == 64)
3583    return false;
3584
3585  unsigned NumElts = VT.getVectorNumElements();
3586  unsigned BlockElts = M[0] + 1;
3587  // If the first shuffle index is UNDEF, be optimistic.
3588  if (M[0] < 0)
3589    BlockElts = BlockSize / EltSz;
3590
3591  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
3592    return false;
3593
3594  for (unsigned i = 0; i < NumElts; ++i) {
3595    if (M[i] < 0) continue; // ignore UNDEF indices
3596    if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
3597      return false;
3598  }
3599
3600  return true;
3601}
3602
3603static bool isVTBLMask(const SmallVectorImpl<int> &M, EVT VT) {
3604  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
3605  // range, then 0 is placed into the resulting vector. So pretty much any mask
3606  // of 8 elements can work here.
3607  return VT == MVT::v8i8 && M.size() == 8;
3608}
3609
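// isVTRNMask - Check for a mask matching NEON's VTRN, which transposes pairs
// of elements from the two inputs.  For v4i32, result 0 is selected by mask
// <0, 4, 2, 6> and result 1 by <1, 5, 3, 7>; WhichResult reports which of the
// two VTRN results the shuffle matches.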
3610static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
3611                       unsigned &WhichResult) {
3612  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3613  if (EltSz == 64)
3614    return false;
3615
3616  unsigned NumElts = VT.getVectorNumElements();
3617  WhichResult = (M[0] == 0 ? 0 : 1);
3618  for (unsigned i = 0; i < NumElts; i += 2) {
3619    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
3620        (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult))
3621      return false;
3622  }
3623  return true;
3624}
3625
3626/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
3627/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
3628/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
3629static bool isVTRN_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
3630                                unsigned &WhichResult) {
3631  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3632  if (EltSz == 64)
3633    return false;
3634
3635  unsigned NumElts = VT.getVectorNumElements();
3636  WhichResult = (M[0] == 0 ? 0 : 1);
3637  for (unsigned i = 0; i < NumElts; i += 2) {
3638    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
3639        (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult))
3640      return false;
3641  }
3642  return true;
3643}
3644
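// isVUZPMask - Check for a mask matching NEON's VUZP, which de-interleaves a
// pair of vectors.  For v4i32, result 0 (the even elements) is mask
// <0, 2, 4, 6> and result 1 (the odd elements) is <1, 3, 5, 7>.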
3645static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
3646                       unsigned &WhichResult) {
3647  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3648  if (EltSz == 64)
3649    return false;
3650
3651  unsigned NumElts = VT.getVectorNumElements();
3652  WhichResult = (M[0] == 0 ? 0 : 1);
3653  for (unsigned i = 0; i != NumElts; ++i) {
3654    if (M[i] < 0) continue; // ignore UNDEF indices
3655    if ((unsigned) M[i] != 2 * i + WhichResult)
3656      return false;
3657  }
3658
3659  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
3660  if (VT.is64BitVector() && EltSz == 32)
3661    return false;
3662
3663  return true;
3664}
3665
3666/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
3667/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
3668/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
3669static bool isVUZP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
3670                                unsigned &WhichResult) {
3671  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3672  if (EltSz == 64)
3673    return false;
3674
3675  unsigned Half = VT.getVectorNumElements() / 2;
3676  WhichResult = (M[0] == 0 ? 0 : 1);
3677  for (unsigned j = 0; j != 2; ++j) {
3678    unsigned Idx = WhichResult;
3679    for (unsigned i = 0; i != Half; ++i) {
3680      int MIdx = M[i + j * Half];
3681      if (MIdx >= 0 && (unsigned) MIdx != Idx)
3682        return false;
3683      Idx += 2;
3684    }
3685  }
3686
3687  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
3688  if (VT.is64BitVector() && EltSz == 32)
3689    return false;
3690
3691  return true;
3692}
3693
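// isVZIPMask - Check for a mask matching NEON's VZIP, which interleaves the
// low (or high) halves of a pair of vectors.  For v4i32, result 0 (the low
// halves) is mask <0, 4, 1, 5> and result 1 (the high halves) is <2, 6, 3, 7>.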
3694static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
3695                       unsigned &WhichResult) {
3696  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3697  if (EltSz == 64)
3698    return false;
3699
3700  unsigned NumElts = VT.getVectorNumElements();
3701  WhichResult = (M[0] == 0 ? 0 : 1);
3702  unsigned Idx = WhichResult * NumElts / 2;
3703  for (unsigned i = 0; i != NumElts; i += 2) {
3704    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
3705        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts))
3706      return false;
3707    Idx += 1;
3708  }
3709
3710  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
3711  if (VT.is64BitVector() && EltSz == 32)
3712    return false;
3713
3714  return true;
3715}
3716
3717/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
3718/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
3719/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
3720static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
3721                                unsigned &WhichResult) {
3722  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
3723  if (EltSz == 64)
3724    return false;
3725
3726  unsigned NumElts = VT.getVectorNumElements();
3727  WhichResult = (M[0] == 0 ? 0 : 1);
3728  unsigned Idx = WhichResult * NumElts / 2;
3729  for (unsigned i = 0; i != NumElts; i += 2) {
3730    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
3731        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx))
3732      return false;
3733    Idx += 1;
3734  }
3735
3736  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
3737  if (VT.is64BitVector() && EltSz == 32)
3738    return false;
3739
3740  return true;
3741}
3742
3743// If N is an integer constant that can be moved into a register in one
3744// instruction, return an SDValue of such a constant (will become a MOV
3745// instruction).  Otherwise return null.
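// E.g., 255 and ~255 both work on Thumb1 (MOV / MVN of an 8-bit immediate),
// and a rotated value such as 0x00ab0000 works in ARM mode.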
3746static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
3747                                     const ARMSubtarget *ST, DebugLoc dl) {
3748  uint64_t Val;
3749  if (!isa<ConstantSDNode>(N))
3750    return SDValue();
3751  Val = cast<ConstantSDNode>(N)->getZExtValue();
3752
3753  if (ST->isThumb1Only()) {
3754    if (Val <= 255 || ~Val <= 255)
3755      return DAG.getConstant(Val, MVT::i32);
3756  } else {
3757    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
3758      return DAG.getConstant(Val, MVT::i32);
3759  }
3760  return SDValue();
3761}
3762
3763// If this is a case we can't handle, return null and let the default
3764// expansion code take care of it.
3765SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
3766                                             const ARMSubtarget *ST) const {
3767  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
3768  DebugLoc dl = Op.getDebugLoc();
3769  EVT VT = Op.getValueType();
3770
3771  APInt SplatBits, SplatUndef;
3772  unsigned SplatBitSize;
3773  bool HasAnyUndefs;
3774  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
3775    if (SplatBitSize <= 64) {
3776      // Check if an immediate VMOV works.
3777      EVT VmovVT;
3778      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
3779                                      SplatUndef.getZExtValue(), SplatBitSize,
3780                                      DAG, VmovVT, VT.is128BitVector(),
3781                                      VMOVModImm);
3782      if (Val.getNode()) {
3783        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
3784        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
3785      }
3786
3787      // Try an immediate VMVN.
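      // E.g., a splat of 0xffffff00 is not a valid VMOV immediate, but its
      // complement 0x000000ff is, so the constant can be built with VMVN.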
3788      uint64_t NegatedImm = (SplatBits.getZExtValue() ^
3789                             ((1LL << SplatBitSize) - 1));
3790      Val = isNEONModifiedImm(NegatedImm,
3791                                      SplatUndef.getZExtValue(), SplatBitSize,
3792                                      DAG, VmovVT, VT.is128BitVector(),
3793                                      VMVNModImm);
3794      if (Val.getNode()) {
3795        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
3796        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
3797      }
3798    }
3799  }
3800
3801  // Scan through the operands to see if only one value is used.
3802  unsigned NumElts = VT.getVectorNumElements();
3803  bool isOnlyLowElement = true;
3804  bool usesOnlyOneValue = true;
3805  bool isConstant = true;
3806  SDValue Value;
3807  for (unsigned i = 0; i < NumElts; ++i) {
3808    SDValue V = Op.getOperand(i);
3809    if (V.getOpcode() == ISD::UNDEF)
3810      continue;
3811    if (i > 0)
3812      isOnlyLowElement = false;
3813    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
3814      isConstant = false;
3815
3816    if (!Value.getNode())
3817      Value = V;
3818    else if (V != Value)
3819      usesOnlyOneValue = false;
3820  }
3821
3822  if (!Value.getNode())
3823    return DAG.getUNDEF(VT);
3824
3825  if (isOnlyLowElement)
3826    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
3827
3828  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
3829
3830  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
3831  // i32 and try again.
3832  if (usesOnlyOneValue && EltSize <= 32) {
3833    if (!isConstant)
3834      return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
3835    if (VT.getVectorElementType().isFloatingPoint()) {
3836      SmallVector<SDValue, 8> Ops;
3837      for (unsigned i = 0; i < NumElts; ++i)
3838        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
3839                                  Op.getOperand(i)));
3840      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
3841      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
3842      Val = LowerBUILD_VECTOR(Val, DAG, ST);
3843      if (Val.getNode())
3844        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
3845    }
3846    SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
3847    if (Val.getNode())
3848      return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
3849  }
3850
3851  // If all elements are constants and the case above didn't get hit, fall back
3852  // to the default expansion, which will generate a load from the constant
3853  // pool.
3854  if (isConstant)
3855    return SDValue();
3856
3857  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
3858  if (NumElts >= 4) {
3859    SDValue shuffle = ReconstructShuffle(Op, DAG);
3860    if (shuffle != SDValue())
3861      return shuffle;
3862  }
3863
3864  // Vectors with 32- or 64-bit elements can be built by directly assigning
3865  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
3866  // will be legalized.
3867  if (EltSize >= 32) {
3868    // Do the expansion with floating-point types, since that is what the VFP
3869    // registers are defined to use, and since i64 is not legal.
3870    EVT EltVT = EVT::getFloatingPointVT(EltSize);
3871    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
3872    SmallVector<SDValue, 8> Ops;
3873    for (unsigned i = 0; i < NumElts; ++i)
3874      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
3875    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
3876    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
3877  }
3878
3879  return SDValue();
3880}
3881
3882// Gather data to see if the operation can be modelled as a
3883// shuffle in combination with VEXTs.
3884SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
3885                                              SelectionDAG &DAG) const {
3886  DebugLoc dl = Op.getDebugLoc();
3887  EVT VT = Op.getValueType();
3888  unsigned NumElts = VT.getVectorNumElements();
3889
3890  SmallVector<SDValue, 2> SourceVecs;
3891  SmallVector<unsigned, 2> MinElts;
3892  SmallVector<unsigned, 2> MaxElts;
3893
3894  for (unsigned i = 0; i < NumElts; ++i) {
3895    SDValue V = Op.getOperand(i);
3896    if (V.getOpcode() == ISD::UNDEF)
3897      continue;
3898    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
3899      // A shuffle can only come from building a vector from various
3900      // elements of other vectors.
3901      return SDValue();
3902    }
3903
3904    // Record this extraction against the appropriate vector if possible...
3905    SDValue SourceVec = V.getOperand(0);
3906    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
3907    bool FoundSource = false;
3908    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
3909      if (SourceVecs[j] == SourceVec) {
3910        if (MinElts[j] > EltNo)
3911          MinElts[j] = EltNo;
3912        if (MaxElts[j] < EltNo)
3913          MaxElts[j] = EltNo;
3914        FoundSource = true;
3915        break;
3916      }
3917    }
3918
3919    // Or record a new source if not...
3920    if (!FoundSource) {
3921      SourceVecs.push_back(SourceVec);
3922      MinElts.push_back(EltNo);
3923      MaxElts.push_back(EltNo);
3924    }
3925  }
3926
3927  // Currently only do something sane when at most two source vectors
3928  // involved.
3929  if (SourceVecs.size() > 2)
3930    return SDValue();
3931
3932  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
3933  int VEXTOffsets[2] = {0, 0};
3934
3935  // This loop extracts the usage patterns of the source vectors
3936  // and prepares appropriate SDValues for a shuffle if possible.
3937  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
3938    if (SourceVecs[i].getValueType() == VT) {
3939      // No VEXT necessary
3940      ShuffleSrcs[i] = SourceVecs[i];
3941      VEXTOffsets[i] = 0;
3942      continue;
3943    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
3944      // It probably isn't worth padding out a smaller vector just to
3945      // break it down again in a shuffle.
3946      return SDValue();
3947    }
3948
3949    // Since only 64-bit and 128-bit vectors are legal on ARM and
3950    // we've eliminated the other cases...
3951    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
3952           "unexpected vector sizes in ReconstructShuffle");
3953
3954    if (MaxElts[i] - MinElts[i] >= NumElts) {
3955      // Span too large for a VEXT to cope
3956      return SDValue();
3957    }
3958
3959    if (MinElts[i] >= NumElts) {
3960      // The extraction can just take the second half
3961      VEXTOffsets[i] = NumElts;
3962      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
3963                                   SourceVecs[i],
3964                                   DAG.getIntPtrConstant(NumElts));
3965    } else if (MaxElts[i] < NumElts) {
3966      // The extraction can just take the first half
3967      VEXTOffsets[i] = 0;
3968      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
3969                                   SourceVecs[i],
3970                                   DAG.getIntPtrConstant(0));
3971    } else {
3972      // An actual VEXT is needed
3973      VEXTOffsets[i] = MinElts[i];
3974      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
3975                                     SourceVecs[i],
3976                                     DAG.getIntPtrConstant(0));
3977      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
3978                                     SourceVecs[i],
3979                                     DAG.getIntPtrConstant(NumElts));
3980      ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
3981                                   DAG.getConstant(VEXTOffsets[i], MVT::i32));
3982    }
3983  }
3984
3985  SmallVector<int, 8> Mask;
3986
3987  for (unsigned i = 0; i < NumElts; ++i) {
3988    SDValue Entry = Op.getOperand(i);
3989    if (Entry.getOpcode() == ISD::UNDEF) {
3990      Mask.push_back(-1);
3991      continue;
3992    }
3993
3994    SDValue ExtractVec = Entry.getOperand(0);
3995    int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
3996                                          .getOperand(1))->getSExtValue();
3997    if (ExtractVec == SourceVecs[0]) {
3998      Mask.push_back(ExtractElt - VEXTOffsets[0]);
3999    } else {
4000      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
4001    }
4002  }
4003
4004  // Final check before we try to produce nonsense...
4005  if (isShuffleMaskLegal(Mask, VT))
4006    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
4007                                &Mask[0]);
4008
4009  return SDValue();
4010}
4011
4012/// isShuffleMaskLegal - Targets can use this to indicate that they only
4013/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
4014/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
4015/// are assumed to be legal.
4016bool
4017ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
4018                                      EVT VT) const {
4019  if (VT.getVectorNumElements() == 4 &&
4020      (VT.is128BitVector() || VT.is64BitVector())) {
4021    unsigned PFIndexes[4];
4022    for (unsigned i = 0; i != 4; ++i) {
4023      if (M[i] < 0)
4024        PFIndexes[i] = 8;
4025      else
4026        PFIndexes[i] = M[i];
4027    }
4028
4029    // Compute the index in the perfect shuffle table.
4030    unsigned PFTableIndex =
4031      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
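    // Worked example (illustrative): the mask <0,4,1,5> yields
    // 0*729 + 4*81 + 1*9 + 5 = 338; an undef lane contributes a digit of 8.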
4032    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
4033    unsigned Cost = (PFEntry >> 30);
4034
4035    if (Cost <= 4)
4036      return true;
4037  }
4038
4039  bool ReverseVEXT;
4040  unsigned Imm, WhichResult;
4041
4042  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4043  return (EltSize >= 32 ||
4044          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
4045          isVREVMask(M, VT, 64) ||
4046          isVREVMask(M, VT, 32) ||
4047          isVREVMask(M, VT, 16) ||
4048          isVEXTMask(M, VT, ReverseVEXT, Imm) ||
4049          isVTBLMask(M, VT) ||
4050          isVTRNMask(M, VT, WhichResult) ||
4051          isVUZPMask(M, VT, WhichResult) ||
4052          isVZIPMask(M, VT, WhichResult) ||
4053          isVTRN_v_undef_Mask(M, VT, WhichResult) ||
4054          isVUZP_v_undef_Mask(M, VT, WhichResult) ||
4055          isVZIP_v_undef_Mask(M, VT, WhichResult));
4056}
4057
4058/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
4059/// the specified operations to build the shuffle.
4060static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
4061                                      SDValue RHS, SelectionDAG &DAG,
4062                                      DebugLoc dl) {
4063  unsigned OpNum = (PFEntry >> 26) & 0x0F;
4064  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
4065  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
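  // As decoded above, each PFEntry packs: bits 31-30 the cost, bits 29-26
  // the operation, bits 25-13 the LHS id and bits 12-0 the RHS id, where
  // the ids are the same base-9 mask encodings used to index the table
  // (with a digit of 8 meaning undef).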
4066
4067  enum {
4068    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
4069    OP_VREV,
4070    OP_VDUP0,
4071    OP_VDUP1,
4072    OP_VDUP2,
4073    OP_VDUP3,
4074    OP_VEXT1,
4075    OP_VEXT2,
4076    OP_VEXT3,
4077    OP_VUZPL, // VUZP, left result
4078    OP_VUZPR, // VUZP, right result
4079    OP_VZIPL, // VZIP, left result
4080    OP_VZIPR, // VZIP, right result
4081    OP_VTRNL, // VTRN, left result
4082    OP_VTRNR  // VTRN, right result
4083  };
4084
4085  if (OpNum == OP_COPY) {
4086    if (LHSID == (1*9+2)*9+3) return LHS;
4087    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
4088    return RHS;
4089  }
4090
4091  SDValue OpLHS, OpRHS;
4092  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
4093  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
4094  EVT VT = OpLHS.getValueType();
4095
4096  switch (OpNum) {
4097  default: llvm_unreachable("Unknown shuffle opcode!");
4098  case OP_VREV:
4099    return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
4100  case OP_VDUP0:
4101  case OP_VDUP1:
4102  case OP_VDUP2:
4103  case OP_VDUP3:
4104    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
4105                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
4106  case OP_VEXT1:
4107  case OP_VEXT2:
4108  case OP_VEXT3:
4109    return DAG.getNode(ARMISD::VEXT, dl, VT,
4110                       OpLHS, OpRHS,
4111                       DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
4112  case OP_VUZPL:
4113  case OP_VUZPR:
4114    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
4115                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
4116  case OP_VZIPL:
4117  case OP_VZIPR:
4118    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
4119                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
4120  case OP_VTRNL:
4121  case OP_VTRNR:
4122    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
4123                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
4124  }
4125}
4126
4127static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
4128                                       SmallVectorImpl<int> &ShuffleMask,
4129                                       SelectionDAG &DAG) {
4130  // Check to see if we can use the VTBL instruction.
4131  SDValue V1 = Op.getOperand(0);
4132  SDValue V2 = Op.getOperand(1);
4133  DebugLoc DL = Op.getDebugLoc();
4134
4135  SmallVector<SDValue, 8> VTBLMask;
4136  for (SmallVectorImpl<int>::iterator
4137         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
4138    VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
4139
4140  if (V2.getNode()->getOpcode() == ISD::UNDEF)
4141    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
4142                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
4143                                   &VTBLMask[0], 8));
4144
4145  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
4146                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
4147                                 &VTBLMask[0], 8));
4148}
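
// Illustrative example (not in the original): a one-input v8i8 shuffle with
// mask <7,6,5,4,3,2,1,0> takes the VTBL1 path above: the mask is materialized
// as a constant d-register operand and the whole shuffle becomes a single
//   vtbl.8  d16, {d16}, d17
// rather than a sequence of per-lane moves.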
4149
4150static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
4151  SDValue V1 = Op.getOperand(0);
4152  SDValue V2 = Op.getOperand(1);
4153  DebugLoc dl = Op.getDebugLoc();
4154  EVT VT = Op.getValueType();
4155  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
4156  SmallVector<int, 8> ShuffleMask;
4157
4158  // Convert shuffles that are directly supported on NEON to target-specific
4159  // DAG nodes, instead of keeping them as shuffles and matching them again
4160  // during code selection.  This is more efficient and avoids the possibility
4161  // of inconsistencies between legalization and selection.
4162  // FIXME: floating-point vectors should be canonicalized to integer vectors
4163  // of the same type so that they get CSEd properly.
4164  SVN->getMask(ShuffleMask);
4165
4166  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4167  if (EltSize <= 32) {
4168    if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
4169      int Lane = SVN->getSplatIndex();
4170      // If this is an undef splat, generate it via "just" vdup, if possible.
4171      if (Lane == -1) Lane = 0;
4172
4173      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4174        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
4175      }
4176      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
4177                         DAG.getConstant(Lane, MVT::i32));
4178    }
4179
4180    bool ReverseVEXT;
4181    unsigned Imm;
4182    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
4183      if (ReverseVEXT)
4184        std::swap(V1, V2);
4185      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
4186                         DAG.getConstant(Imm, MVT::i32));
4187    }
4188
4189    if (isVREVMask(ShuffleMask, VT, 64))
4190      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
4191    if (isVREVMask(ShuffleMask, VT, 32))
4192      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
4193    if (isVREVMask(ShuffleMask, VT, 16))
4194      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
4195
4196    // Check for Neon shuffles that modify both input vectors in place.
4197    // If both results are used, i.e., if there are two shuffles with the same
4198    // source operands and with masks corresponding to both results of one of
4199    // these operations, DAG memoization will ensure that a single node is
4200    // used for both shuffles.
4201    unsigned WhichResult;
4202    if (isVTRNMask(ShuffleMask, VT, WhichResult))
4203      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
4204                         V1, V2).getValue(WhichResult);
4205    if (isVUZPMask(ShuffleMask, VT, WhichResult))
4206      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
4207                         V1, V2).getValue(WhichResult);
4208    if (isVZIPMask(ShuffleMask, VT, WhichResult))
4209      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
4210                         V1, V2).getValue(WhichResult);
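
    // For reference (illustrative, not in the original): with v4i32 operands
    // the result pairs are VTRN <0,4,2,6>/<1,5,3,7>, VUZP <0,2,4,6>/<1,3,5,7>
    // and VZIP <0,4,1,5>/<2,6,3,7>.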
4211
4212    if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
4213      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
4214                         V1, V1).getValue(WhichResult);
4215    if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
4216      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
4217                         V1, V1).getValue(WhichResult);
4218    if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
4219      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
4220                         V1, V1).getValue(WhichResult);
4221  }
4222
4223  // If the shuffle is not directly supported and it has 4 elements, use
4224  // the PerfectShuffle-generated table to synthesize it from other shuffles.
4225  unsigned NumElts = VT.getVectorNumElements();
4226  if (NumElts == 4) {
4227    unsigned PFIndexes[4];
4228    for (unsigned i = 0; i != 4; ++i) {
4229      if (ShuffleMask[i] < 0)
4230        PFIndexes[i] = 8;
4231      else
4232        PFIndexes[i] = ShuffleMask[i];
4233    }
4234
4235    // Compute the index in the perfect shuffle table.
4236    unsigned PFTableIndex =
4237      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
4238    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
4239    unsigned Cost = (PFEntry >> 30);
4240
4241    if (Cost <= 4)
4242      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
4243  }
4244
4245  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
4246  if (EltSize >= 32) {
4247    // Do the expansion with floating-point types, since that is what the VFP
4248    // registers are defined to use, and since i64 is not legal.
4249    EVT EltVT = EVT::getFloatingPointVT(EltSize);
4250    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
4251    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
4252    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
4253    SmallVector<SDValue, 8> Ops;
4254    for (unsigned i = 0; i < NumElts; ++i) {
4255      if (ShuffleMask[i] < 0)
4256        Ops.push_back(DAG.getUNDEF(EltVT));
4257      else
4258        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
4259                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
4260                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
4261                                                  MVT::i32)));
4262    }
4263    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
4264    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
4265  }
4266
4267  if (VT == MVT::v8i8) {
4268    SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
4269    if (NewOp.getNode())
4270      return NewOp;
4271  }
4272
4273  return SDValue();
4274}
4275
4276static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4277  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
4278  SDValue Lane = Op.getOperand(1);
4279  if (!isa<ConstantSDNode>(Lane))
4280    return SDValue();
4281
4282  SDValue Vec = Op.getOperand(0);
4283  if (Op.getValueType() == MVT::i32 &&
4284      Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
4285    DebugLoc dl = Op.getDebugLoc();
4286    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
4287  }
4288
4289  return Op;
4290}
4291
4292static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
4293  // The only time a CONCAT_VECTORS operation can have legal types is when
4294  // two 64-bit vectors are concatenated to a 128-bit vector.
4295  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
4296         "unexpected CONCAT_VECTORS");
4297  DebugLoc dl = Op.getDebugLoc();
4298  SDValue Val = DAG.getUNDEF(MVT::v2f64);
4299  SDValue Op0 = Op.getOperand(0);
4300  SDValue Op1 = Op.getOperand(1);
4301  if (Op0.getOpcode() != ISD::UNDEF)
4302    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
4303                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
4304                      DAG.getIntPtrConstant(0));
4305  if (Op1.getOpcode() != ISD::UNDEF)
4306    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
4307                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
4308                      DAG.getIntPtrConstant(1));
4309  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
4310}
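
// Illustrative example (not in the original): concat_vectors of two v2f32
// values is lowered as two f64 INSERT_VECTOR_ELTs into an undef v2f64,
// followed by a bitcast to v4f32, so each 64-bit half lands directly in its
// D subregister of the Q result.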
4311
4312/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
4313/// element has been zero/sign-extended, depending on the isSigned parameter,
4314/// from an integer type half its size.
4315static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
4316                                   bool isSigned) {
4317  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
4318  EVT VT = N->getValueType(0);
4319  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
4320    SDNode *BVN = N->getOperand(0).getNode();
4321    if (BVN->getValueType(0) != MVT::v4i32 ||
4322        BVN->getOpcode() != ISD::BUILD_VECTOR)
4323      return false;
4324    unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
4325    unsigned HiElt = 1 - LoElt;
4326    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
4327    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
4328    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
4329    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
4330    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
4331      return false;
4332    if (isSigned) {
4333      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
4334          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
4335        return true;
4336    } else {
4337      if (Hi0->isNullValue() && Hi1->isNullValue())
4338        return true;
4339    }
4340    return false;
4341  }
4342
4343  if (N->getOpcode() != ISD::BUILD_VECTOR)
4344    return false;
4345
4346  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
4347    SDNode *Elt = N->getOperand(i).getNode();
4348    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4349      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4350      unsigned HalfSize = EltSize / 2;
4351      if (isSigned) {
4352        int64_t SExtVal = C->getSExtValue();
4353        if ((SExtVal >> HalfSize) != (SExtVal >> EltSize))
4354          return false;
4355      } else {
4356        if ((C->getZExtValue() >> HalfSize) != 0)
4357          return false;
4358      }
4359      continue;
4360    }
4361    return false;
4362  }
4363
4364  return true;
4365}
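
// Illustrative example (not in the original): the v4i32 BUILD_VECTOR
// <0x0000FFFF, 0x1, 0x8000, 0x2A> has the high half of every element clear,
// so the unsigned check above treats it as a zero-extended v4i16, allowing
// a multiply fed by it to use VMULLu.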
4366
4367/// isSignExtended - Check if a node is a vector value that is sign-extended
4368/// or a constant BUILD_VECTOR with sign-extended elements.
4369static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
4370  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
4371    return true;
4372  if (isExtendedBUILD_VECTOR(N, DAG, true))
4373    return true;
4374  return false;
4375}
4376
4377/// isZeroExtended - Check if a node is a vector value that is zero-extended
4378/// or a constant BUILD_VECTOR with zero-extended elements.
4379static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
4380  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
4381    return true;
4382  if (isExtendedBUILD_VECTOR(N, DAG, false))
4383    return true;
4384  return false;
4385}
4386
4387/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending
4388/// load, or BUILD_VECTOR with extended elements, return the unextended value.
4389static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
4390  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
4391    return N->getOperand(0);
4392  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
4393    return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
4394                       LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
4395                       LD->isNonTemporal(), LD->getAlignment());
4396  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
4397  // have been legalized as a BITCAST from v4i32.
4398  if (N->getOpcode() == ISD::BITCAST) {
4399    SDNode *BVN = N->getOperand(0).getNode();
4400    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
4401           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
4402    unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
4403    return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32,
4404                       BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
4405  }
4406  // Construct a new BUILD_VECTOR with elements truncated to half the size.
4407  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4408  EVT VT = N->getValueType(0);
4409  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
4410  unsigned NumElts = VT.getVectorNumElements();
4411  MVT TruncVT = MVT::getIntegerVT(EltSize);
4412  SmallVector<SDValue, 8> Ops;
4413  for (unsigned i = 0; i != NumElts; ++i) {
4414    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
4415    const APInt &CInt = C->getAPIntValue();
4416    Ops.push_back(DAG.getConstant(CInt.trunc(EltSize), TruncVT));
4417  }
4418  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
4419                     MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
4420}
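
// Illustrative example (not in the original): given the v4i32 constant
// vector <0x0000FFFF, 0x1, 0x8000, 0x2A> from above, SkipExtension rebuilds
// it as the v4i16 BUILD_VECTOR <0xFFFF, 0x1, 0x8000, 0x2A>, the narrow
// operand form that VMULL expects.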
4421
4422static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
4423  unsigned Opcode = N->getOpcode();
4424  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4425    SDNode *N0 = N->getOperand(0).getNode();
4426    SDNode *N1 = N->getOperand(1).getNode();
4427    return N0->hasOneUse() && N1->hasOneUse() &&
4428      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4429  }
4430  return false;
4431}
4432
4433static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
4434  unsigned Opcode = N->getOpcode();
4435  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4436    SDNode *N0 = N->getOperand(0).getNode();
4437    SDNode *N1 = N->getOperand(1).getNode();
4438    return N0->hasOneUse() && N1->hasOneUse() &&
4439      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4440  }
4441  return false;
4442}
4443
4444static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
4445  // Multiplications are only custom-lowered for 128-bit vectors so that
4446  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
4447  EVT VT = Op.getValueType();
4448  assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL");
4449  SDNode *N0 = Op.getOperand(0).getNode();
4450  SDNode *N1 = Op.getOperand(1).getNode();
4451  unsigned NewOpc = 0;
4452  bool isMLA = false;
4453  bool isN0SExt = isSignExtended(N0, DAG);
4454  bool isN1SExt = isSignExtended(N1, DAG);
4455  if (isN0SExt && isN1SExt)
4456    NewOpc = ARMISD::VMULLs;
4457  else {
4458    bool isN0ZExt = isZeroExtended(N0, DAG);
4459    bool isN1ZExt = isZeroExtended(N1, DAG);
4460    if (isN0ZExt && isN1ZExt)
4461      NewOpc = ARMISD::VMULLu;
4462    else if (isN1SExt || isN1ZExt) {
4463      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4464      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4465      if (isN1SExt && isAddSubSExt(N0, DAG)) {
4466        NewOpc = ARMISD::VMULLs;
4467        isMLA = true;
4468      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
4469        NewOpc = ARMISD::VMULLu;
4470        isMLA = true;
4471      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
4472        std::swap(N0, N1);
4473        NewOpc = ARMISD::VMULLu;
4474        isMLA = true;
4475      }
4476    }
4477
4478    if (!NewOpc) {
4479      if (VT == MVT::v2i64)
4480        // Fall through to expand this.  It is not legal.
4481        return SDValue();
4482      else
4483        // Other vector multiplications are legal.
4484        return Op;
4485    }
4486  }
4487
4488  // Legalize to a VMULL instruction.
4489  DebugLoc DL = Op.getDebugLoc();
4490  SDValue Op0;
4491  SDValue Op1 = SkipExtension(N1, DAG);
4492  if (!isMLA) {
4493    Op0 = SkipExtension(N0, DAG);
4494    assert(Op0.getValueType().is64BitVector() &&
4495           Op1.getValueType().is64BitVector() &&
4496           "unexpected types for extended operands to VMULL");
4497    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
4498  }
4499
4500  // Optimize (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
4501  // isel lowering to take advantage of no-stall back-to-back vmul + vmla.
4502  //   vmull q0, d4, d6
4503  //   vmlal q0, d5, d6
4504  // is faster than
4505  //   vaddl q0, d4, d5
4506  //   vmovl q1, d6
4507  //   vmul  q0, q0, q1
4508  SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG);
4509  SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG);
4510  EVT Op1VT = Op1.getValueType();
4511  return DAG.getNode(N0->getOpcode(), DL, VT,
4512                     DAG.getNode(NewOpc, DL, VT,
4513                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
4514                     DAG.getNode(NewOpc, DL, VT,
4515                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
4516}
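
// Illustrative example of the non-MLA path (not in the original): for,
// roughly in IR terms,
//   %m = mul <4 x i32> (sext <4 x i16> %a to <4 x i32>),
//                      (sext <4 x i16> %b to <4 x i32>)
// both operands pass isSignExtended, SkipExtension strips the extensions,
// and the whole multiply selects to a single "vmull.s16 q8, d16, d17".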
4517
4518static SDValue
4519LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
4520  // Convert to float
4521  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
4522  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
4523  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
4524  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
4525  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
4526  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
4527  // Get reciprocal estimate.
4528  // float4 recip = vrecpeq_f32(yf);
4529  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
4530                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
4531  // Because char has a smaller range than uchar, we can actually get away
4532  // without any Newton steps.  This requires that we use a weird bias
4533  // of 0xb000, however (again, this has been exhaustively tested).
4534  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
4535  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
4536  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
4537  Y = DAG.getConstant(0xb000, MVT::i32);
4538  Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
4539  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
4540  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
4541  // Convert back to short.
4542  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
4543  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
4544  return X;
4545}
4546
4547static SDValue
4548LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
4549  SDValue N2;
4550  // Convert to float.
4551  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
4552  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
4553  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
4554  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
4555  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
4556  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
4557
4558  // Use reciprocal estimate and one refinement step.
4559  // float4 recip = vrecpeq_f32(yf);
4560  // recip *= vrecpsq_f32(yf, recip);
4561  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
4562                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
4563  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
4564                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
4565                   N1, N2);
4566  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
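  // VRECPS(y, r) returns 2 - y*r, so the multiply above performs one
  // Newton-Raphson step: recip = recip * (2 - y*recip), converging on 1/y.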
4567  // Because short has a smaller range than ushort, we can actually get away
4568  // with only a single Newton step.  This requires that we use a weird bias
4569  // of 89, however (again, this has been exhaustively tested).
4570  // float4 result = as_float4(as_int4(xf*recip) + 89);
4571  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
4572  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
4573  N1 = DAG.getConstant(89, MVT::i32);
4574  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
4575  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
4576  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
4577  // Convert back to integer and return.
4578  // return vmovn_s32(vcvt_s32_f32(result));
4579  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
4580  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
4581  return N0;
4582}
4583
4584static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
4585  EVT VT = Op.getValueType();
4586  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
4587         "unexpected type for custom-lowering ISD::SDIV");
4588
4589  DebugLoc dl = Op.getDebugLoc();
4590  SDValue N0 = Op.getOperand(0);
4591  SDValue N1 = Op.getOperand(1);
4592  SDValue N2, N3;
4593
4594  if (VT == MVT::v8i8) {
4595    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
4596    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
4597
4598    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
4599                     DAG.getIntPtrConstant(4));
4600    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
4601                     DAG.getIntPtrConstant(4));
4602    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
4603                     DAG.getIntPtrConstant(0));
4604    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
4605                     DAG.getIntPtrConstant(0));
4606
4607    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
4608    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
4609
4610    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
4611    N0 = LowerCONCAT_VECTORS(N0, DAG);
4612
4613    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
4614    return N0;
4615  }
4616  return LowerSDIV_v4i16(N0, N1, dl, DAG);
4617}
4618
4619static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
4620  EVT VT = Op.getValueType();
4621  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
4622         "unexpected type for custom-lowering ISD::UDIV");
4623
4624  DebugLoc dl = Op.getDebugLoc();
4625  SDValue N0 = Op.getOperand(0);
4626  SDValue N1 = Op.getOperand(1);
4627  SDValue N2, N3;
4628
4629  if (VT == MVT::v8i8) {
4630    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
4631    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
4632
4633    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
4634                     DAG.getIntPtrConstant(4));
4635    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
4636                     DAG.getIntPtrConstant(4));
4637    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
4638                     DAG.getIntPtrConstant(0));
4639    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
4640                     DAG.getIntPtrConstant(0));
4641
4642    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
4643    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
4644
4645    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
4646    N0 = LowerCONCAT_VECTORS(N0, DAG);
4647
4648    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
4649                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
4650                     N0);
4651    return N0;
4652  }
4653
4654  // v4i16 udiv ... Convert to float.
4655  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
4656  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
4657  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
4658  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
4659  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
4660  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
4661
4662  // Use reciprocal estimate and two refinement steps.
4663  // float4 recip = vrecpeq_f32(yf);
4664  // recip *= vrecpsq_f32(yf, recip);
4665  // recip *= vrecpsq_f32(yf, recip);
4666  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
4667                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
4668  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
4669                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
4670                   N1, N2);
4671  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
4672  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
4673                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
4674                   N1, N2);
4675  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
4676  // Simply multiplying by the reciprocal estimate can leave us a few ulps
4677  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
4678  // and that it will never cause us to return an answer too large).
4679  // float4 result = as_float4(as_int4(xf*recip) + 2);
4680  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
4681  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
4682  N1 = DAG.getConstant(2, MVT::i32);
4683  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
4684  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
4685  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
4686  // Convert back to integer and return.
4687  // return vmovn_u32(vcvt_s32_f32(result));
4688  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
4689  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
4690  return N0;
4691}
4692
4693SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
4694  switch (Op.getOpcode()) {
4695  default: llvm_unreachable("Don't know how to custom lower this!");
4696  case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
4697  case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
4698  case ISD::GlobalAddress:
4699    return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
4700      LowerGlobalAddressELF(Op, DAG);
4701  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
4702  case ISD::SELECT:        return LowerSELECT(Op, DAG);
4703  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
4704  case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
4705  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
4706  case ISD::VASTART:       return LowerVASTART(Op, DAG);
4707  case ISD::MEMBARRIER:    return LowerMEMBARRIER(Op, DAG, Subtarget);
4708  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
4709  case ISD::SINT_TO_FP:
4710  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
4711  case ISD::FP_TO_SINT:
4712  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
4713  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
4714  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
4715  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
4716  case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
4717  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
4718  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
4719  case ISD::EH_SJLJ_DISPATCHSETUP: return LowerEH_SJLJ_DISPATCHSETUP(Op, DAG);
4720  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
4721                                                               Subtarget);
4722  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
4723  case ISD::SHL:
4724  case ISD::SRL:
4725  case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
4726  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
4727  case ISD::SRL_PARTS:
4728  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
4729  case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
4730  case ISD::VSETCC:        return LowerVSETCC(Op, DAG);
4731  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
4732  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
4733  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
4734  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
4735  case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
4736  case ISD::MUL:           return LowerMUL(Op, DAG);
4737  case ISD::SDIV:          return LowerSDIV(Op, DAG);
4738  case ISD::UDIV:          return LowerUDIV(Op, DAG);
4739  }
4740  return SDValue();
4741}
4742
4743/// ReplaceNodeResults - Replace the results of node with an illegal result
4744/// type with new values built out of custom code.
4745void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
4746                                           SmallVectorImpl<SDValue>&Results,
4747                                           SelectionDAG &DAG) const {
4748  SDValue Res;
4749  switch (N->getOpcode()) {
4750  default:
4751    llvm_unreachable("Don't know how to custom expand this!");
4752    break;
4753  case ISD::BITCAST:
4754    Res = ExpandBITCAST(N, DAG);
4755    break;
4756  case ISD::SRL:
4757  case ISD::SRA:
4758    Res = Expand64BitShift(N, DAG, Subtarget);
4759    break;
4760  }
4761  if (Res.getNode())
4762    Results.push_back(Res);
4763}
4764
4765//===----------------------------------------------------------------------===//
4766//                           ARM Scheduler Hooks
4767//===----------------------------------------------------------------------===//
4768
4769MachineBasicBlock *
4770ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
4771                                     MachineBasicBlock *BB,
4772                                     unsigned Size) const {
4773  unsigned dest    = MI->getOperand(0).getReg();
4774  unsigned ptr     = MI->getOperand(1).getReg();
4775  unsigned oldval  = MI->getOperand(2).getReg();
4776  unsigned newval  = MI->getOperand(3).getReg();
4777  unsigned scratch = BB->getParent()->getRegInfo()
4778    .createVirtualRegister(ARM::GPRRegisterClass);
4779  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
4780  DebugLoc dl = MI->getDebugLoc();
4781  bool isThumb2 = Subtarget->isThumb2();
4782
4783  unsigned ldrOpc, strOpc;
4784  switch (Size) {
4785  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
4786  case 1:
4787    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
4788    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
4789    break;
4790  case 2:
4791    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
4792    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
4793    break;
4794  case 4:
4795    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
4796    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
4797    break;
4798  }
4799
4800  MachineFunction *MF = BB->getParent();
4801  const BasicBlock *LLVM_BB = BB->getBasicBlock();
4802  MachineFunction::iterator It = BB;
4803  ++It; // insert the new blocks after the current block
4804
4805  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
4806  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
4807  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
4808  MF->insert(It, loop1MBB);
4809  MF->insert(It, loop2MBB);
4810  MF->insert(It, exitMBB);
4811
4812  // Transfer the remainder of BB and its successor edges to exitMBB.
4813  exitMBB->splice(exitMBB->begin(), BB,
4814                  llvm::next(MachineBasicBlock::iterator(MI)),
4815                  BB->end());
4816  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
4817
4818  //  thisMBB:
4819  //   ...
4820  //   fallthrough --> loop1MBB
4821  BB->addSuccessor(loop1MBB);
4822
4823  // loop1MBB:
4824  //   ldrex dest, [ptr]
4825  //   cmp dest, oldval
4826  //   bne exitMBB
4827  BB = loop1MBB;
4828  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
4829  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
4830                 .addReg(dest).addReg(oldval));
4831  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
4832    .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
4833  BB->addSuccessor(loop2MBB);
4834  BB->addSuccessor(exitMBB);
4835
4836  // loop2MBB:
4837  //   strex scratch, newval, [ptr]
4838  //   cmp scratch, #0
4839  //   bne loop1MBB
4840  BB = loop2MBB;
4841  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval)
4842                 .addReg(ptr));
4843  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
4844                 .addReg(scratch).addImm(0));
4845  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
4846    .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
4847  BB->addSuccessor(loop1MBB);
4848  BB->addSuccessor(exitMBB);
4849
4850  //  exitMBB:
4851  //   ...
4852  BB = exitMBB;
4853
4854  MI->eraseFromParent();   // The instruction is gone now.
4855
4856  return BB;
4857}
4858
4859MachineBasicBlock *
4860ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
4861                                    unsigned Size, unsigned BinOpcode) const {
4862  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
4863  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
4864
4865  const BasicBlock *LLVM_BB = BB->getBasicBlock();
4866  MachineFunction *MF = BB->getParent();
4867  MachineFunction::iterator It = BB;
4868  ++It;
4869
4870  unsigned dest = MI->getOperand(0).getReg();
4871  unsigned ptr = MI->getOperand(1).getReg();
4872  unsigned incr = MI->getOperand(2).getReg();
4873  DebugLoc dl = MI->getDebugLoc();
4874
4875  bool isThumb2 = Subtarget->isThumb2();
4876  unsigned ldrOpc, strOpc;
4877  switch (Size) {
4878  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
4879  case 1:
4880    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
4881    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
4882    break;
4883  case 2:
4884    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
4885    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
4886    break;
4887  case 4:
4888    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
4889    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
4890    break;
4891  }
4892
4893  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
4894  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
4895  MF->insert(It, loopMBB);
4896  MF->insert(It, exitMBB);
4897
4898  // Transfer the remainder of BB and its successor edges to exitMBB.
4899  exitMBB->splice(exitMBB->begin(), BB,
4900                  llvm::next(MachineBasicBlock::iterator(MI)),
4901                  BB->end());
4902  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
4903
4904  MachineRegisterInfo &RegInfo = MF->getRegInfo();
4905  unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
4906  unsigned scratch2 = (!BinOpcode) ? incr :
4907    RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
4908
4909  //  thisMBB:
4910  //   ...
4911  //   fallthrough --> loopMBB
4912  BB->addSuccessor(loopMBB);
4913
4914  //  loopMBB:
4915  //   ldrex dest, ptr
4916  //   <binop> scratch2, dest, incr
4917  //   strex scratch, scratch2, ptr
4918  //   cmp scratch, #0
4919  //   bne- loopMBB
4920  //   fallthrough --> exitMBB
4921  BB = loopMBB;
4922  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
4923  if (BinOpcode) {
4924    // Operand order needs to go the other way for NAND (BIC computes x & ~y).
4925    if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
4926      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
4927                     addReg(incr).addReg(dest)).addReg(0);
4928    else
4929      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
4930                     addReg(dest).addReg(incr)).addReg(0);
4931  }
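
  // Note (clarifying, not in the original): the trailing addReg(0) above
  // fills the instruction's optional cc_out operand with noreg, so the
  // binop leaves CPSR alone inside the ldrex/strex loop.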
4932
4933  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
4934                 .addReg(ptr));
4935  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
4936                 .addReg(scratch).addImm(0));
4937  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
4938    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
4939
4940  BB->addSuccessor(loopMBB);
4941  BB->addSuccessor(exitMBB);
4942
4943  //  exitMBB:
4944  //   ...
4945  BB = exitMBB;
4946
4947  MI->eraseFromParent();   // The instruction is gone now.
4948
4949  return BB;
4950}
4951
4952static
4953MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
4954  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
4955       E = MBB->succ_end(); I != E; ++I)
4956    if (*I != Succ)
4957      return *I;
4958  llvm_unreachable("Expecting a BB with two successors!");
4959}
4960
4961MachineBasicBlock *
4962ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
4963                                               MachineBasicBlock *BB) const {
4964  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
4965  DebugLoc dl = MI->getDebugLoc();
4966  bool isThumb2 = Subtarget->isThumb2();
4967  switch (MI->getOpcode()) {
4968  default:
4969    MI->dump();
4970    llvm_unreachable("Unexpected instr type to insert");
4971
4972  case ARM::ATOMIC_LOAD_ADD_I8:
4973     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
4974  case ARM::ATOMIC_LOAD_ADD_I16:
4975     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
4976  case ARM::ATOMIC_LOAD_ADD_I32:
4977     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
4978
4979  case ARM::ATOMIC_LOAD_AND_I8:
4980     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
4981  case ARM::ATOMIC_LOAD_AND_I16:
4982     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
4983  case ARM::ATOMIC_LOAD_AND_I32:
4984     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
4985
4986  case ARM::ATOMIC_LOAD_OR_I8:
4987     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
4988  case ARM::ATOMIC_LOAD_OR_I16:
4989     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
4990  case ARM::ATOMIC_LOAD_OR_I32:
4991     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
4992
4993  case ARM::ATOMIC_LOAD_XOR_I8:
4994     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
4995  case ARM::ATOMIC_LOAD_XOR_I16:
4996     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
4997  case ARM::ATOMIC_LOAD_XOR_I32:
4998     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
4999
5000  case ARM::ATOMIC_LOAD_NAND_I8:
5001     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
5002  case ARM::ATOMIC_LOAD_NAND_I16:
5003     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
5004  case ARM::ATOMIC_LOAD_NAND_I32:
5005     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
5006
5007  case ARM::ATOMIC_LOAD_SUB_I8:
5008     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
5009  case ARM::ATOMIC_LOAD_SUB_I16:
5010     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
5011  case ARM::ATOMIC_LOAD_SUB_I32:
5012     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
5013
5014  case ARM::ATOMIC_SWAP_I8:  return EmitAtomicBinary(MI, BB, 1, 0);
5015  case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0);
5016  case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0);
5017
5018  case ARM::ATOMIC_CMP_SWAP_I8:  return EmitAtomicCmpSwap(MI, BB, 1);
5019  case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
5020  case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
5021
5022  case ARM::ADCSSri:
5023  case ARM::ADCSSrr:
5024  case ARM::ADCSSrs:
5025  case ARM::SBCSSri:
5026  case ARM::SBCSSrr:
5027  case ARM::SBCSSrs:
5028  case ARM::RSBSri:
5029  case ARM::RSBSrr:
5030  case ARM::RSBSrs:
5031  case ARM::RSCSri:
5032  case ARM::RSCSrs: {
5033    unsigned OldOpc = MI->getOpcode();
5034    unsigned Opc = 0;
5035    switch (OldOpc) {
5036      case ARM::ADCSSrr:
5037        Opc = ARM::ADCrr;
5038        break;
5039      case ARM::ADCSSri:
5040        Opc = ARM::ADCri;
5041        break;
5042      case ARM::ADCSSrs:
5043        Opc = ARM::ADCrs;
5044        break;
5045      case ARM::SBCSSrr:
5046        Opc = ARM::SBCrr;
5047        break;
5048      case ARM::SBCSSri:
5049        Opc = ARM::SBCri;
5050        break;
5051      case ARM::SBCSSrs:
5052        Opc = ARM::SBCrs;
5053        break;
5054      case ARM::RSBSri:
5055        Opc = ARM::RSBri;
5056        break;
5057      case ARM::RSBSrr:
5058        Opc = ARM::RSBrr;
5059        break;
5060      case ARM::RSBSrs:
5061        Opc = ARM::RSBrs;
5062        break;
5063      case ARM::RSCSri:
5064        Opc = ARM::RSCri;
5065        break;
5066      case ARM::RSCSrs:
5067        Opc = ARM::RSCrs;
5068        break;
5069      default:
5070        llvm_unreachable("Unknown opcode?");
5071    }
5072
5073    MachineInstrBuilder MIB =
5074      BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(Opc));
5075    for (unsigned i = 0; i < MI->getNumOperands(); ++i)
5076      MIB.addOperand(MI->getOperand(i));
5077    AddDefaultPred(MIB);
5078    MIB.addReg(ARM::CPSR, RegState::Define); // S bit
5079    MI->eraseFromParent();
5080    return BB;
5081  }
5082
5083
5084  case ARM::tMOVCCr_pseudo: {
5085    // To "insert" a SELECT_CC instruction, we actually have to insert the
5086    // diamond control-flow pattern.  The incoming instruction knows the
5087    // destination vreg to set, the condition code register to branch on, the
5088    // true/false values to select between, and a branch opcode to use.
5089    const BasicBlock *LLVM_BB = BB->getBasicBlock();
5090    MachineFunction::iterator It = BB;
5091    ++It;
5092
5093    //  thisMBB:
5094    //  ...
5095    //   TrueVal = ...
5096    //   cmpTY ccX, r1, r2
5097    //   bCC copy1MBB
5098    //   fallthrough --> copy0MBB
5099    MachineBasicBlock *thisMBB  = BB;
5100    MachineFunction *F = BB->getParent();
5101    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
5102    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
5103    F->insert(It, copy0MBB);
5104    F->insert(It, sinkMBB);
5105
5106    // Transfer the remainder of BB and its successor edges to sinkMBB.
5107    sinkMBB->splice(sinkMBB->begin(), BB,
5108                    llvm::next(MachineBasicBlock::iterator(MI)),
5109                    BB->end());
5110    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
5111
5112    BB->addSuccessor(copy0MBB);
5113    BB->addSuccessor(sinkMBB);
5114
5115    BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
5116      .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
5117
5118    //  copy0MBB:
5119    //   %FalseValue = ...
5120    //   # fallthrough to sinkMBB
5121    BB = copy0MBB;
5122
5123    // Update machine-CFG edges
5124    BB->addSuccessor(sinkMBB);
5125
5126    //  sinkMBB:
5127    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
5128    //  ...
5129    BB = sinkMBB;
5130    BuildMI(*BB, BB->begin(), dl,
5131            TII->get(ARM::PHI), MI->getOperand(0).getReg())
5132      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
5133      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
5134
5135    MI->eraseFromParent();   // The pseudo instruction is gone now.
5136    return BB;
5137  }
5138
5139  case ARM::BCCi64:
5140  case ARM::BCCZi64: {
5141    // If there is an unconditional branch to the other successor, remove it.
5142    BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
5143
5144    // Compare both parts that make up the double comparison separately for
5145    // equality.
5146    bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
5147
5148    unsigned LHS1 = MI->getOperand(1).getReg();
5149    unsigned LHS2 = MI->getOperand(2).getReg();
5150    if (RHSisZero) {
5151      AddDefaultPred(BuildMI(BB, dl,
5152                             TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
5153                     .addReg(LHS1).addImm(0));
5154      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
5155        .addReg(LHS2).addImm(0)
5156        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
5157    } else {
5158      unsigned RHS1 = MI->getOperand(3).getReg();
5159      unsigned RHS2 = MI->getOperand(4).getReg();
5160      AddDefaultPred(BuildMI(BB, dl,
5161                             TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
5162                     .addReg(LHS1).addReg(RHS1));
5163      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
5164        .addReg(LHS2).addReg(RHS2)
5165        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
5166    }
5167
5168    MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
5169    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
5170    if (MI->getOperand(0).getImm() == ARMCC::NE)
5171      std::swap(destMBB, exitMBB);
5172
5173    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
5174      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
5175    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2B : ARM::B))
5176      .addMBB(exitMBB);
5177
5178    MI->eraseFromParent();   // The pseudo instruction is gone now.
5179    return BB;
5180  }
5181  }
5182}
5183
5184//===----------------------------------------------------------------------===//
5185//                           ARM Optimization Hooks
5186//===----------------------------------------------------------------------===//
5187
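/// combineSelectAndUse - Fold a binary operator into one arm of a select when
/// the other arm selects zero, e.g. (illustrative):
///   (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
/// so the arithmetic is only materialized on the path that needs it.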
5188static
5189SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
5190                            TargetLowering::DAGCombinerInfo &DCI) {
5191  SelectionDAG &DAG = DCI.DAG;
5192  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5193  EVT VT = N->getValueType(0);
5194  unsigned Opc = N->getOpcode();
5195  bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC;
5196  SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1);
5197  SDValue RHS = isSlctCC ? Slct.getOperand(3) : Slct.getOperand(2);
5198  ISD::CondCode CC = ISD::SETCC_INVALID;
5199
5200  if (isSlctCC) {
5201    CC = cast<CondCodeSDNode>(Slct.getOperand(4))->get();
5202  } else {
5203    SDValue CCOp = Slct.getOperand(0);
5204    if (CCOp.getOpcode() == ISD::SETCC)
5205      CC = cast<CondCodeSDNode>(CCOp.getOperand(2))->get();
5206  }
5207
5208  bool DoXform = false;
5209  bool InvCC = false;
5210  assert((Opc == ISD::ADD || (Opc == ISD::SUB && Slct == N->getOperand(1))) &&
5211          "Bad input!");
5212
5213  if (LHS.getOpcode() == ISD::Constant &&
5214      cast<ConstantSDNode>(LHS)->isNullValue()) {
5215    DoXform = true;
5216  } else if (CC != ISD::SETCC_INVALID &&
5217             RHS.getOpcode() == ISD::Constant &&
5218             cast<ConstantSDNode>(RHS)->isNullValue()) {
5219    std::swap(LHS, RHS);
5220    SDValue Op0 = Slct.getOperand(0);
5221    EVT OpVT = isSlctCC ? Op0.getValueType() :
5222                          Op0.getOperand(0).getValueType();
5223    bool isInt = OpVT.isInteger();
5224    CC = ISD::getSetCCInverse(CC, isInt);
5225
5226    if (!TLI.isCondCodeLegal(CC, OpVT))
5227      return SDValue();         // Inverse operator isn't legal.
5228
5229    DoXform = true;
5230    InvCC = true;
5231  }
5232
5233  if (DoXform) {
5234    SDValue Result = DAG.getNode(Opc, RHS.getDebugLoc(), VT, OtherOp, RHS);
5235    if (isSlctCC)
5236      return DAG.getSelectCC(N->getDebugLoc(), OtherOp, Result,
5237                             Slct.getOperand(0), Slct.getOperand(1), CC);
5238    SDValue CCOp = Slct.getOperand(0);
5239    if (InvCC)
5240      CCOp = DAG.getSetCC(Slct.getDebugLoc(), CCOp.getValueType(),
5241                          CCOp.getOperand(0), CCOp.getOperand(1), CC);
5242    return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
5243                       CCOp, OtherOp, Result);
5244  }
5245  return SDValue();
5246}
5247
5248/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5249/// operands N0 and N1.  This is a helper for PerformADDCombine that is
5250/// called with the default operands, and if that fails, with commuted
5251/// operands.
5252static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5253                                         TargetLowering::DAGCombinerInfo &DCI) {
5254  // fold (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
5255  if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) {
5256    SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
5257    if (Result.getNode()) return Result;
5258  }
5259  return SDValue();
5260}
5261
5262/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5263///
5264static SDValue PerformADDCombine(SDNode *N,
5265                                 TargetLowering::DAGCombinerInfo &DCI) {
5266  SDValue N0 = N->getOperand(0);
5267  SDValue N1 = N->getOperand(1);
5268
5269  // First try with the default operand order.
5270  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI);
5271  if (Result.getNode())
5272    return Result;
5273
5274  // If that didn't work, try again with the operands commuted.
5275  return PerformADDCombineWithOperands(N, N1, N0, DCI);
5276}
5277
5278/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
5279///
5280static SDValue PerformSUBCombine(SDNode *N,
5281                                 TargetLowering::DAGCombinerInfo &DCI) {
5282  SDValue N0 = N->getOperand(0);
5283  SDValue N1 = N->getOperand(1);
5284
5285  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
5286  if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) {
5287    SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
5288    if (Result.getNode()) return Result;
5289  }
5290
5291  return SDValue();
5292}
5293
5294/// PerformVMULCombine
5295/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
5296/// special multiplier accumulator forwarding.
5297///   vmul d3, d0, d2
5298///   vmla d3, d1, d2
5299/// is faster than
5300///   vadd d3, d0, d1
5301///   vmul d3, d3, d2
5302static SDValue PerformVMULCombine(SDNode *N,
5303                                  TargetLowering::DAGCombinerInfo &DCI,
5304                                  const ARMSubtarget *Subtarget) {
5305  if (!Subtarget->hasVMLxForwarding())
5306    return SDValue();
5307
5308  SelectionDAG &DAG = DCI.DAG;
5309  SDValue N0 = N->getOperand(0);
5310  SDValue N1 = N->getOperand(1);
5311  unsigned Opcode = N0.getOpcode();
5312  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
5313      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
5314    Opcode = N1.getOpcode();
5315    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
5316        Opcode != ISD::FADD && Opcode != ISD::FSUB)
5317      return SDValue();
5318    std::swap(N0, N1);
5319  }
5320
5321  EVT VT = N->getValueType(0);
5322  DebugLoc DL = N->getDebugLoc();
5323  SDValue N00 = N0->getOperand(0);
5324  SDValue N01 = N0->getOperand(1);
5325  return DAG.getNode(Opcode, DL, VT,
5326                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
5327                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
5328}
5329
5330static SDValue PerformMULCombine(SDNode *N,
5331                                 TargetLowering::DAGCombinerInfo &DCI,
5332                                 const ARMSubtarget *Subtarget) {
5333  SelectionDAG &DAG = DCI.DAG;
5334
5335  if (Subtarget->isThumb1Only())
5336    return SDValue();
5337
5338  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
5339    return SDValue();
5340
5341  EVT VT = N->getValueType(0);
5342  if (VT.is64BitVector() || VT.is128BitVector())
5343    return PerformVMULCombine(N, DCI, Subtarget);
5344  if (VT != MVT::i32)
5345    return SDValue();
5346
5347  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
5348  if (!C)
5349    return SDValue();
5350
5351  uint64_t MulAmt = C->getZExtValue();
5352  unsigned ShiftAmt = CountTrailingZeros_64(MulAmt);
5353  ShiftAmt = ShiftAmt & (32 - 1);
5354  SDValue V = N->getOperand(0);
5355  DebugLoc DL = N->getDebugLoc();
5356
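  // Worked example (illustrative): MulAmt = 10 = 0b1010 gives ShiftAmt = 1 and
  // a reduced MulAmt of 5; since 5 - 1 = 4 is a power of two, the result is
  // (shl (add x, (shl x, 2)), 1), i.e. x*10 computed as two shifts and an add.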
5357  SDValue Res;
5358  MulAmt >>= ShiftAmt;
5359  if (isPowerOf2_32(MulAmt - 1)) {
5360    // (mul x, 2^N + 1) => (add (shl x, N), x)
5361    Res = DAG.getNode(ISD::ADD, DL, VT,
5362                      V, DAG.getNode(ISD::SHL, DL, VT,
5363                                     V, DAG.getConstant(Log2_32(MulAmt-1),
5364                                                        MVT::i32)));
5365  } else if (isPowerOf2_32(MulAmt + 1)) {
5366    // (mul x, 2^N - 1) => (sub (shl x, N), x)
5367    Res = DAG.getNode(ISD::SUB, DL, VT,
5368                      DAG.getNode(ISD::SHL, DL, VT,
5369                                  V, DAG.getConstant(Log2_32(MulAmt+1),
5370                                                     MVT::i32)),
5371                      V);
5372  } else
5373    return SDValue();
5374
5375  if (ShiftAmt != 0)
5376    Res = DAG.getNode(ISD::SHL, DL, VT, Res,
5377                      DAG.getConstant(ShiftAmt, MVT::i32));
5378
5379  // Do not add new nodes to DAG combiner worklist.
5380  DCI.CombineTo(N, Res, false);
5381  return SDValue();
5382}
5383
5384static SDValue PerformANDCombine(SDNode *N,
5385                                TargetLowering::DAGCombinerInfo &DCI) {
5386
5387  // Attempt to use immediate-form VBIC
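  // E.g. (assuming the inverted splat is encodable as a NEON modified
  // immediate), a v4i32 AND with a splat of 0xffffff00 becomes a VBICIMM
  // with immediate 0x000000ff, i.e. "vbic.i32 q0, #0xff".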
5388  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
5389  DebugLoc dl = N->getDebugLoc();
5390  EVT VT = N->getValueType(0);
5391  SelectionDAG &DAG = DCI.DAG;
5392
5393  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
5394    return SDValue();
5395
5396  APInt SplatBits, SplatUndef;
5397  unsigned SplatBitSize;
5398  bool HasAnyUndefs;
5399  if (BVN &&
5400      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
5401    if (SplatBitSize <= 64) {
5402      EVT VbicVT;
5403      SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
5404                                      SplatUndef.getZExtValue(), SplatBitSize,
5405                                      DAG, VbicVT, VT.is128BitVector(),
5406                                      OtherModImm);
5407      if (Val.getNode()) {
5408        SDValue Input =
5409          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
5410        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
5411        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
5412      }
5413    }
5414  }
5415
5416  return SDValue();
5417}
5418
5419/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
5420static SDValue PerformORCombine(SDNode *N,
5421                                TargetLowering::DAGCombinerInfo &DCI,
5422                                const ARMSubtarget *Subtarget) {
5423  // Attempt to use immediate-form VORR
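  // E.g. a v4i32 OR with a splat of 0x00ff0000 (an encodable modified
  // immediate) becomes a VORRIMM, i.e. "vorr.i32 q0, #0xff0000".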
5424  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
5425  DebugLoc dl = N->getDebugLoc();
5426  EVT VT = N->getValueType(0);
5427  SelectionDAG &DAG = DCI.DAG;
5428
5429  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
5430    return SDValue();
5431
5432  APInt SplatBits, SplatUndef;
5433  unsigned SplatBitSize;
5434  bool HasAnyUndefs;
5435  if (BVN && Subtarget->hasNEON() &&
5436      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
5437    if (SplatBitSize <= 64) {
5438      EVT VorrVT;
5439      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
5440                                      SplatUndef.getZExtValue(), SplatBitSize,
5441                                      DAG, VorrVT, VT.is128BitVector(),
5442                                      OtherModImm);
5443      if (Val.getNode()) {
5444        SDValue Input =
5445          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
5446        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
5447        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
5448      }
5449    }
5450  }
5451
5452  SDValue N0 = N->getOperand(0);
5453  if (N0.getOpcode() != ISD::AND)
5454    return SDValue();
5455  SDValue N1 = N->getOperand(1);
5456
5457  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
5458  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
5459      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
5460    APInt SplatUndef;
5461    unsigned SplatBitSize;
5462    bool HasAnyUndefs;
5463
5464    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
5465    APInt SplatBits0;
5466    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
5467                                  HasAnyUndefs) && !HasAnyUndefs) {
5468      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
5469      APInt SplatBits1;
5470      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
5471                                    HasAnyUndefs) && !HasAnyUndefs &&
5472          SplatBits0 == ~SplatBits1) {
5473        // Canonicalize the vector type to make instruction selection simpler.
5474        EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
5475        SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
5476                                     N0->getOperand(1), N0->getOperand(0),
5477                                     N1->getOperand(0));
5478        return DAG.getNode(ISD::BITCAST, dl, VT, Result);
5479      }
5480    }
5481  }
5482
5483  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
5484  // reasonable.
5485
5486  // BFI is only available on V6T2+
5487  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
5488    return SDValue();
5489
5490  DebugLoc DL = N->getDebugLoc();
5491  // 1) or (and A, mask), val => ARMbfi A, val, mask
5492  //      iff (val & mask) == val
5493  //
5494  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
5495  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
5496  //          && mask == ~mask2
5497  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
5498  //          && ~mask == mask2
5499  //  (i.e., copy a bitfield value into another bitfield of the same width)
5500
5501  if (VT != MVT::i32)
5502    return SDValue();
5503
5504  SDValue N00 = N0.getOperand(0);
5505
5506  // The value and the mask need to be constants so we can verify this is
5507  // actually a bitfield set. If the mask is 0xffff, we can do better
5508  // via a movt instruction, so don't use BFI in that case.
5509  SDValue MaskOp = N0.getOperand(1);
5510  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
5511  if (!MaskC)
5512    return SDValue();
5513  unsigned Mask = MaskC->getZExtValue();
5514  if (Mask == 0xffff)
5515    return SDValue();
5516  SDValue Res;
5517  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
5518  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
5519  if (N1C) {
5520    unsigned Val = N1C->getZExtValue();
5521    if ((Val & ~Mask) != Val)
5522      return SDValue();
5523
5524    if (ARM::isBitFieldInvertedMask(Mask)) {
5525      Val >>= CountTrailingZeros_32(~Mask);
5526
5527      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
5528                        DAG.getConstant(Val, MVT::i32),
5529                        DAG.getConstant(Mask, MVT::i32));
5530
5531      // Do not add new nodes to DAG combiner worklist.
5532      DCI.CombineTo(N, Res, false);
5533      return SDValue();
5534    }
5535  } else if (N1.getOpcode() == ISD::AND) {
5536    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
5537    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
5538    if (!N11C)
5539      return SDValue();
5540    unsigned Mask2 = N11C->getZExtValue();
5541
5542    // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI
5543    // pattern to match as-is.
5544    if (ARM::isBitFieldInvertedMask(Mask) &&
5545        (Mask == ~Mask2)) {
5546      // The pack halfword instruction works better for masks that fit it,
5547      // so use that when it's available.
5548      if (Subtarget->hasT2ExtractPack() &&
5549          (Mask == 0xffff || Mask == 0xffff0000))
5550        return SDValue();
5551      // 2a
5552      unsigned amt = CountTrailingZeros_32(Mask2);
5553      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
5554                        DAG.getConstant(amt, MVT::i32));
5555      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
5556                        DAG.getConstant(Mask, MVT::i32));
5557      // Do not add new nodes to DAG combiner worklist.
5558      DCI.CombineTo(N, Res, false);
5559      return SDValue();
5560    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
5561               (~Mask == Mask2)) {
5562      // The pack halfword instruction works better for masks that fit it,
5563      // so use that when it's available.
5564      if (Subtarget->hasT2ExtractPack() &&
5565          (Mask2 == 0xffff || Mask2 == 0xffff0000))
5566        return SDValue();
5567      // 2b
5568      unsigned lsb = CountTrailingZeros_32(Mask);
5569      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
5570                        DAG.getConstant(lsb, MVT::i32));
5571      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
5572                        DAG.getConstant(Mask2, MVT::i32));
5573      // Do not add new nodes to DAG combiner worklist.
5574      DCI.CombineTo(N, Res, false);
5575      return SDValue();
5576    }
5577  }
5578
5579  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
5580      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
5581      ARM::isBitFieldInvertedMask(~Mask)) {
5582    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
5583    // where lsb(mask) == #shamt and masked bits of B are known zero.
5584    SDValue ShAmt = N00.getOperand(1);
5585    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
5586    unsigned LSB = CountTrailingZeros_32(Mask);
5587    if (ShAmtC != LSB)
5588      return SDValue();
5589
5590    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
5591                      DAG.getConstant(~Mask, MVT::i32));
5592
5593    // Do not add new nodes to DAG combiner worklist.
5594    DCI.CombineTo(N, Res, false);
5595  }
5596
5597  return SDValue();
5598}
5599
5600/// PerformBFICombine - (bfi A, (and B, C1), C2) -> (bfi A, B, C2) iff
5601/// C1 & C2 == C1.
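/// For instance, C1 = 0x0000ff00 and C2 = 0xffffff00 satisfy the condition
/// (0x0000ff00 & 0xffffff00 == 0x0000ff00), so the AND would be dropped.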
5602static SDValue PerformBFICombine(SDNode *N,
5603                                 TargetLowering::DAGCombinerInfo &DCI) {
5604  SDValue N1 = N->getOperand(1);
5605  if (N1.getOpcode() == ISD::AND) {
5606    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
5607    if (!N11C)
5608      return SDValue();
5609    unsigned Mask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
5610    unsigned Mask2 = N11C->getZExtValue();
5611    if ((Mask & Mask2) == Mask2)
5612      return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0),
5613                             N->getOperand(0), N1.getOperand(0),
5614                             N->getOperand(2));
5615  }
5616  return SDValue();
5617}
5618
5619/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
5620/// ARMISD::VMOVRRD.
5621static SDValue PerformVMOVRRDCombine(SDNode *N,
5622                                     TargetLowering::DAGCombinerInfo &DCI) {
5623  // vmovrrd(vmovdrr x, y) -> x,y
5624  SDValue InDouble = N->getOperand(0);
5625  if (InDouble.getOpcode() == ARMISD::VMOVDRR)
5626    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
5627
5628  // vmovrrd(load f64) -> (load i32), (load i32)
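  // (Sketch: the second i32 is loaded from BasePtr + 4, with its alignment
  // conservatively capped at min(4, original alignment / 2).)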
5629  SDNode *InNode = InDouble.getNode();
5630  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
5631      InNode->getValueType(0) == MVT::f64 &&
5632      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
5633      !cast<LoadSDNode>(InNode)->isVolatile()) {
5634    // TODO: Should this be done for non-FrameIndex operands?
5635    LoadSDNode *LD = cast<LoadSDNode>(InNode);
5636
5637    SelectionDAG &DAG = DCI.DAG;
5638    DebugLoc DL = LD->getDebugLoc();
5639    SDValue BasePtr = LD->getBasePtr();
5640    SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
5641                                 LD->getPointerInfo(), LD->isVolatile(),
5642                                 LD->isNonTemporal(), LD->getAlignment());
5643
5644    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
5645                                    DAG.getConstant(4, MVT::i32));
5646    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
5647                                 LD->getPointerInfo(), LD->isVolatile(),
5648                                 LD->isNonTemporal(),
5649                                 std::min(4U, LD->getAlignment() / 2));
5650
5651    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
5652    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
5653    DCI.RemoveFromWorklist(LD);
5654    DAG.DeleteNode(LD);
5655    return Result;
5656  }
5657
5658  return SDValue();
5659}
5660
5661/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
5662/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
5663static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
5664  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bitcast(X)
5665  SDValue Op0 = N->getOperand(0);
5666  SDValue Op1 = N->getOperand(1);
5667  if (Op0.getOpcode() == ISD::BITCAST)
5668    Op0 = Op0.getOperand(0);
5669  if (Op1.getOpcode() == ISD::BITCAST)
5670    Op1 = Op1.getOperand(0);
5671  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
5672      Op0.getNode() == Op1.getNode() &&
5673      Op0.getResNo() == 0 && Op1.getResNo() == 1)
5674    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
5675                       N->getValueType(0), Op0.getOperand(0));
5676  return SDValue();
5677}
5678
5679/// PerformSTORECombine - Target-specific dag combine xforms for
5680/// ISD::STORE.
5681static SDValue PerformSTORECombine(SDNode *N,
5682                                   TargetLowering::DAGCombinerInfo &DCI) {
5683  // Bitcast an i64 store extracted from a vector to f64.
5684  // Otherwise, the i64 value will be legalized to a pair of i32 values.
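  // E.g. (store (extract_vector_elt v2i64 %v, 0)) is rewritten to extract an
  // f64 element from the bitcast vector, so a single VFP store of a D register
  // covers both words.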
5685  StoreSDNode *St = cast<StoreSDNode>(N);
5686  SDValue StVal = St->getValue();
5687  if (!ISD::isNormalStore(St) || St->isVolatile())
5688    return SDValue();
5689
5690  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
5691      StVal.getNode()->hasOneUse()) {
5692    SelectionDAG  &DAG = DCI.DAG;
5693    DebugLoc DL = St->getDebugLoc();
5694    SDValue BasePtr = St->getBasePtr();
5695    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
5696                                  StVal.getNode()->getOperand(0), BasePtr,
5697                                  St->getPointerInfo(), St->isVolatile(),
5698                                  St->isNonTemporal(), St->getAlignment());
5699
5700    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
5701                                    DAG.getConstant(4, MVT::i32));
5702    return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
5703                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
5704                        St->isNonTemporal(),
5705                        std::min(4U, St->getAlignment() / 2));
5706  }
5707
5708  if (StVal.getValueType() != MVT::i64 ||
5709      StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5710    return SDValue();
5711
5712  SelectionDAG &DAG = DCI.DAG;
5713  DebugLoc dl = StVal.getDebugLoc();
5714  SDValue IntVec = StVal.getOperand(0);
5715  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
5716                                 IntVec.getValueType().getVectorNumElements());
5717  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
5718  SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5719                               Vec, StVal.getOperand(1));
5720  dl = N->getDebugLoc();
5721  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
5722  // Make the DAGCombiner fold the bitcasts.
5723  DCI.AddToWorklist(Vec.getNode());
5724  DCI.AddToWorklist(ExtElt.getNode());
5725  DCI.AddToWorklist(V.getNode());
5726  return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
5727                      St->getPointerInfo(), St->isVolatile(),
5728                      St->isNonTemporal(), St->getAlignment(),
5729                      St->getTBAAInfo());
5730}
5731
5732/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
5733/// are normal, non-volatile loads.  If so, it is profitable to bitcast an
5734/// i64 vector to have f64 elements, since the value can then be loaded
5735/// directly into a VFP register.
5736static bool hasNormalLoadOperand(SDNode *N) {
5737  unsigned NumElts = N->getValueType(0).getVectorNumElements();
5738  for (unsigned i = 0; i < NumElts; ++i) {
5739    SDNode *Elt = N->getOperand(i).getNode();
5740    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
5741      return true;
5742  }
5743  return false;
5744}
5745
5746/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
5747/// ISD::BUILD_VECTOR.
5748static SDValue PerformBUILD_VECTORCombine(SDNode *N,
5749                                          TargetLowering::DAGCombinerInfo &DCI) {
5750  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bitcast(X):
5751  // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
5752  // into a pair of GPRs, which is fine when the value is used as a scalar,
5753  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
5754  SelectionDAG &DAG = DCI.DAG;
5755  if (N->getNumOperands() == 2) {
5756    SDValue RV = PerformVMOVDRRCombine(N, DAG);
5757    if (RV.getNode())
5758      return RV;
5759  }
5760
5761  // Load i64 elements as f64 values so that type legalization does not split
5762  // them up into i32 values.
5763  EVT VT = N->getValueType(0);
5764  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
5765    return SDValue();
5766  DebugLoc dl = N->getDebugLoc();
5767  SmallVector<SDValue, 8> Ops;
5768  unsigned NumElts = VT.getVectorNumElements();
5769  for (unsigned i = 0; i < NumElts; ++i) {
5770    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
5771    Ops.push_back(V);
5772    // Make the DAGCombiner fold the bitcast.
5773    DCI.AddToWorklist(V.getNode());
5774  }
5775  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
5776  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts);
5777  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
5778}
5779
5780/// PerformInsertEltCombine - Target-specific dag combine xforms for
5781/// ISD::INSERT_VECTOR_ELT.
5782static SDValue PerformInsertEltCombine(SDNode *N,
5783                                       TargetLowering::DAGCombinerInfo &DCI) {
5784  // Bitcast an i64 load inserted into a vector to f64.
5785  // Otherwise, the i64 value will be legalized to a pair of i32 values.
5786  EVT VT = N->getValueType(0);
5787  SDNode *Elt = N->getOperand(1).getNode();
5788  if (VT.getVectorElementType() != MVT::i64 ||
5789      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
5790    return SDValue();
5791
5792  SelectionDAG &DAG = DCI.DAG;
5793  DebugLoc dl = N->getDebugLoc();
5794  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
5795                                 VT.getVectorNumElements());
5796  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
5797  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
5798  // Make the DAGCombiner fold the bitcasts.
5799  DCI.AddToWorklist(Vec.getNode());
5800  DCI.AddToWorklist(V.getNode());
5801  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
5802                               Vec, V, N->getOperand(2));
5803  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
5804}
5805
5806/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
5807/// ISD::VECTOR_SHUFFLE.
5808static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
5809  // The LLVM shufflevector instruction does not require the shuffle mask
5810  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
5811  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
5812  // operands do not match the mask length, they are extended by concatenating
5813  // them with undef vectors.  That is probably the right thing for other
5814  // targets, but for NEON it is better to concatenate two double-register
5815  // size vector operands into a single quad-register size vector.  Do that
5816  // transformation here:
5817  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
5818  //   shuffle(concat(v1, v2), undef)
5819  SDValue Op0 = N->getOperand(0);
5820  SDValue Op1 = N->getOperand(1);
5821  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
5822      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
5823      Op0.getNumOperands() != 2 ||
5824      Op1.getNumOperands() != 2)
5825    return SDValue();
5826  SDValue Concat0Op1 = Op0.getOperand(1);
5827  SDValue Concat1Op1 = Op1.getOperand(1);
5828  if (Concat0Op1.getOpcode() != ISD::UNDEF ||
5829      Concat1Op1.getOpcode() != ISD::UNDEF)
5830    return SDValue();
5831  // Skip the transformation if any of the types are illegal.
5832  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5833  EVT VT = N->getValueType(0);
5834  if (!TLI.isTypeLegal(VT) ||
5835      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
5836      !TLI.isTypeLegal(Concat1Op1.getValueType()))
5837    return SDValue();
5838
5839  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
5840                                  Op0.getOperand(0), Op1.getOperand(0));
5841  // Translate the shuffle mask.
5842  SmallVector<int, 16> NewMask;
5843  unsigned NumElts = VT.getVectorNumElements();
5844  unsigned HalfElts = NumElts/2;
5845  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
5846  for (unsigned n = 0; n < NumElts; ++n) {
5847    int MaskElt = SVN->getMaskElt(n);
5848    int NewElt = -1;
5849    if (MaskElt < (int)HalfElts)
5850      NewElt = MaskElt;
5851    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
5852      NewElt = HalfElts + MaskElt - NumElts;
5853    NewMask.push_back(NewElt);
5854  }
5855  return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat,
5856                              DAG.getUNDEF(VT), NewMask.data());
5857}
5858
5859/// CombineBaseUpdate - Target-specific DAG combine function for vldN-dup
5860/// nodes and NEON load/store intrinsics, to merge base address updates.
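/// Illustrative pattern: a vld1 of 16 bytes whose base address is also
/// incremented by 16 can fold the add into a post-incrementing form, e.g.
/// "vld1.32 {d16, d17}, [r0]!" instead of a separate vld1 and add.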
5861static SDValue CombineBaseUpdate(SDNode *N,
5862                                 TargetLowering::DAGCombinerInfo &DCI) {
5863  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
5864    return SDValue();
5865
5866  SelectionDAG &DAG = DCI.DAG;
5867  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
5868                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
5869  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
5870  SDValue Addr = N->getOperand(AddrOpIdx);
5871
5872  // Search for a use of the address operand that is an increment.
5873  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
5874         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
5875    SDNode *User = *UI;
5876    if (User->getOpcode() != ISD::ADD ||
5877        UI.getUse().getResNo() != Addr.getResNo())
5878      continue;
5879
5880    // Check that the add is independent of the load/store.  Otherwise, folding
5881    // it would create a cycle.
5882    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
5883      continue;
5884
5885    // Find the new opcode for the updating load/store.
5886    bool isLoad = true;
5887    bool isLaneOp = false;
5888    unsigned NewOpc = 0;
5889    unsigned NumVecs = 0;
5890    if (isIntrinsic) {
5891      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
5892      switch (IntNo) {
5893      default: llvm_unreachable("unexpected intrinsic for Neon base update");
5894      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
5895        NumVecs = 1; break;
5896      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
5897        NumVecs = 2; break;
5898      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
5899        NumVecs = 3; break;
5900      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
5901        NumVecs = 4; break;
5902      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
5903        NumVecs = 2; isLaneOp = true; break;
5904      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
5905        NumVecs = 3; isLaneOp = true; break;
5906      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
5907        NumVecs = 4; isLaneOp = true; break;
5908      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
5909        NumVecs = 1; isLoad = false; break;
5910      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
5911        NumVecs = 2; isLoad = false; break;
5912      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
5913        NumVecs = 3; isLoad = false; break;
5914      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
5915        NumVecs = 4; isLoad = false; break;
5916      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
5917        NumVecs = 2; isLoad = false; isLaneOp = true; break;
5918      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
5919        NumVecs = 3; isLoad = false; isLaneOp = true; break;
5920      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
5921        NumVecs = 4; isLoad = false; isLaneOp = true; break;
5922      }
5923    } else {
5924      isLaneOp = true;
5925      switch (N->getOpcode()) {
5926      default: llvm_unreachable("unexpected opcode for Neon base update");
5927      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
5928      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
5929      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
5930      }
5931    }
5932
5933    // Find the size of memory referenced by the load/store.
5934    EVT VecTy;
5935    if (isLoad)
5936      VecTy = N->getValueType(0);
5937    else
5938      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
5939    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
5940    if (isLaneOp)
5941      NumBytes /= VecTy.getVectorNumElements();
5942
5943    // If the increment is a constant, it must match the memory ref size.
5944    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
5945    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
5946      uint64_t IncVal = CInc->getZExtValue();
5947      if (IncVal != NumBytes)
5948        continue;
5949    } else if (NumBytes >= 3 * 16) {
5950      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
5951      // separate instructions that make it harder to use a non-constant update.
5952      continue;
5953    }
5954
5955    // Create the new updating load/store node.
5956    EVT Tys[6];
5957    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
5958    unsigned n;
5959    for (n = 0; n < NumResultVecs; ++n)
5960      Tys[n] = VecTy;
5961    Tys[n++] = MVT::i32;
5962    Tys[n] = MVT::Other;
5963    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
5964    SmallVector<SDValue, 8> Ops;
5965    Ops.push_back(N->getOperand(0)); // incoming chain
5966    Ops.push_back(N->getOperand(AddrOpIdx));
5967    Ops.push_back(Inc);
5968    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
5969      Ops.push_back(N->getOperand(i));
5970    }
5971    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
5972    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
5973                                           Ops.data(), Ops.size(),
5974                                           MemInt->getMemoryVT(),
5975                                           MemInt->getMemOperand());
5976
5977    // Update the uses.
5978    std::vector<SDValue> NewResults;
5979    for (unsigned i = 0; i < NumResultVecs; ++i) {
5980      NewResults.push_back(SDValue(UpdN.getNode(), i));
5981    }
5982    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
5983    DCI.CombineTo(N, NewResults);
5984    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
5985
5986    break;
5987  }
5988  return SDValue();
5989}
5990
5991/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
5992/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
5993/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
5994/// return true.
5995static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
5996  SelectionDAG &DAG = DCI.DAG;
5997  EVT VT = N->getValueType(0);
5998  // vldN-dup instructions only support 64-bit vectors for N > 1.
5999  if (!VT.is64BitVector())
6000    return false;
6001
6002  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
6003  SDNode *VLD = N->getOperand(0).getNode();
6004  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
6005    return false;
6006  unsigned NumVecs = 0;
6007  unsigned NewOpc = 0;
6008  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
6009  if (IntNo == Intrinsic::arm_neon_vld2lane) {
6010    NumVecs = 2;
6011    NewOpc = ARMISD::VLD2DUP;
6012  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
6013    NumVecs = 3;
6014    NewOpc = ARMISD::VLD3DUP;
6015  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
6016    NumVecs = 4;
6017    NewOpc = ARMISD::VLD4DUP;
6018  } else {
6019    return false;
6020  }
6021
6022  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
6023  // numbers match the load.
6024  unsigned VLDLaneNo =
6025    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
6026  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
6027       UI != UE; ++UI) {
6028    // Ignore uses of the chain result.
6029    if (UI.getUse().getResNo() == NumVecs)
6030      continue;
6031    SDNode *User = *UI;
6032    if (User->getOpcode() != ARMISD::VDUPLANE ||
6033        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
6034      return false;
6035  }
6036
6037  // Create the vldN-dup node.
6038  EVT Tys[5];
6039  unsigned n;
6040  for (n = 0; n < NumVecs; ++n)
6041    Tys[n] = VT;
6042  Tys[n] = MVT::Other;
6043  SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
6044  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
6045  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
6046  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
6047                                           Ops, 2, VLDMemInt->getMemoryVT(),
6048                                           VLDMemInt->getMemOperand());
6049
6050  // Update the uses.
6051  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
6052       UI != UE; ++UI) {
6053    unsigned ResNo = UI.getUse().getResNo();
6054    // Ignore uses of the chain result.
6055    if (ResNo == NumVecs)
6056      continue;
6057    SDNode *User = *UI;
6058    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
6059  }
6060
6061  // Now the vldN-lane intrinsic is dead except for its chain result.
6062  // Update uses of the chain.
6063  std::vector<SDValue> VLDDupResults;
6064  for (unsigned n = 0; n < NumVecs; ++n)
6065    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
6066  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
6067  DCI.CombineTo(VLD, VLDDupResults);
6068
6069  return true;
6070}
6071
6072/// PerformVDUPLANECombine - Target-specific dag combine xforms for
6073/// ARMISD::VDUPLANE.
6074static SDValue PerformVDUPLANECombine(SDNode *N,
6075                                      TargetLowering::DAGCombinerInfo &DCI) {
6076  SDValue Op = N->getOperand(0);
6077
6078  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
6079  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
6080  if (CombineVLDDUP(N, DCI))
6081    return SDValue(N, 0);
6082
6083  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
6084  // redundant.  Ignore bitcasts for now; element sizes are checked below.
6085  while (Op.getOpcode() == ISD::BITCAST)
6086    Op = Op.getOperand(0);
6087  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
6088    return SDValue();
6089
6090  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
6091  unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
6092  // The canonical VMOV for a zero vector uses a 32-bit element size.
6093  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6094  unsigned EltBits;
6095  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
6096    EltSize = 8;
6097  EVT VT = N->getValueType(0);
6098  if (EltSize > VT.getVectorElementType().getSizeInBits())
6099    return SDValue();
6100
6101  return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
6102}
6103
6104/// getVShiftImm - Check if this is a valid build_vector for the immediate
6105/// operand of a vector shift operation, where all the elements of the
6106/// build_vector must have the same constant integer value.
6107static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6108  // Ignore bitcasts.
6109  while (Op.getOpcode() == ISD::BITCAST)
6110    Op = Op.getOperand(0);
6111  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6112  APInt SplatBits, SplatUndef;
6113  unsigned SplatBitSize;
6114  bool HasAnyUndefs;
6115  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
6116                                    HasAnyUndefs, ElementBits) ||
6117      SplatBitSize > ElementBits)
6118    return false;
6119  Cnt = SplatBits.getSExtValue();
6120  return true;
6121}
6122
6123/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6124/// operand of a vector shift left operation.  That value must be in the range:
6125///   0 <= Value < ElementBits for a left shift; or
6126///   0 <= Value <= ElementBits for a long left shift.
6127static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6128  assert(VT.isVector() && "vector shift count is not a vector type");
6129  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
6130  if (!getVShiftImm(Op, ElementBits, Cnt))
6131    return false;
6132  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
6133}
6134
6135/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6136/// operand of a vector shift right operation.  For a shift opcode, the value
6137/// is positive, but for an intrinsic the shift count must be negative. The
6138/// absolute value must be in the range:
6139///   1 <= |Value| <= ElementBits for a right shift; or
6140///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
6141static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6142                         int64_t &Cnt) {
6143  assert(VT.isVector() && "vector shift count is not a vector type");
6144  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
6145  if (!getVShiftImm(Op, ElementBits, Cnt))
6146    return false;
6147  if (isIntrinsic)
6148    Cnt = -Cnt;
6149  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
6150}
6151
6152/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
6153static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
6154  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
6155  switch (IntNo) {
6156  default:
6157    // Don't do anything for most intrinsics.
6158    break;
6159
6160  // Vector shifts: check for immediate versions and lower them.
6161  // Note: This is done during DAG combining instead of DAG legalizing because
6162  // the build_vectors for 64-bit vector element shift counts are generally
6163  // not legal, and it is hard to see their values after they get legalized to
6164  // loads from a constant pool.
6165  case Intrinsic::arm_neon_vshifts:
6166  case Intrinsic::arm_neon_vshiftu:
6167  case Intrinsic::arm_neon_vshiftls:
6168  case Intrinsic::arm_neon_vshiftlu:
6169  case Intrinsic::arm_neon_vshiftn:
6170  case Intrinsic::arm_neon_vrshifts:
6171  case Intrinsic::arm_neon_vrshiftu:
6172  case Intrinsic::arm_neon_vrshiftn:
6173  case Intrinsic::arm_neon_vqshifts:
6174  case Intrinsic::arm_neon_vqshiftu:
6175  case Intrinsic::arm_neon_vqshiftsu:
6176  case Intrinsic::arm_neon_vqshiftns:
6177  case Intrinsic::arm_neon_vqshiftnu:
6178  case Intrinsic::arm_neon_vqshiftnsu:
6179  case Intrinsic::arm_neon_vqrshiftns:
6180  case Intrinsic::arm_neon_vqrshiftnu:
6181  case Intrinsic::arm_neon_vqrshiftnsu: {
6182    EVT VT = N->getOperand(1).getValueType();
6183    int64_t Cnt;
6184    unsigned VShiftOpc = 0;
6185
6186    switch (IntNo) {
6187    case Intrinsic::arm_neon_vshifts:
6188    case Intrinsic::arm_neon_vshiftu:
6189      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
6190        VShiftOpc = ARMISD::VSHL;
6191        break;
6192      }
6193      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
6194        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
6195                     ARMISD::VSHRs : ARMISD::VSHRu);
6196        break;
6197      }
6198      return SDValue();
6199
6200    case Intrinsic::arm_neon_vshiftls:
6201    case Intrinsic::arm_neon_vshiftlu:
6202      if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
6203        break;
6204      llvm_unreachable("invalid shift count for vshll intrinsic");
6205
6206    case Intrinsic::arm_neon_vrshifts:
6207    case Intrinsic::arm_neon_vrshiftu:
6208      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
6209        break;
6210      return SDValue();
6211
6212    case Intrinsic::arm_neon_vqshifts:
6213    case Intrinsic::arm_neon_vqshiftu:
6214      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
6215        break;
6216      return SDValue();
6217
6218    case Intrinsic::arm_neon_vqshiftsu:
6219      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
6220        break;
6221      llvm_unreachable("invalid shift count for vqshlu intrinsic");
6222
6223    case Intrinsic::arm_neon_vshiftn:
6224    case Intrinsic::arm_neon_vrshiftn:
6225    case Intrinsic::arm_neon_vqshiftns:
6226    case Intrinsic::arm_neon_vqshiftnu:
6227    case Intrinsic::arm_neon_vqshiftnsu:
6228    case Intrinsic::arm_neon_vqrshiftns:
6229    case Intrinsic::arm_neon_vqrshiftnu:
6230    case Intrinsic::arm_neon_vqrshiftnsu:
6231      // Narrowing shifts require an immediate right shift.
6232      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
6233        break;
6234      llvm_unreachable("invalid shift count for narrowing vector shift "
6235                       "intrinsic");
6236
6237    default:
6238      llvm_unreachable("unhandled vector shift");
6239    }
6240
6241    switch (IntNo) {
6242    case Intrinsic::arm_neon_vshifts:
6243    case Intrinsic::arm_neon_vshiftu:
6244      // Opcode already set above.
6245      break;
6246    case Intrinsic::arm_neon_vshiftls:
6247    case Intrinsic::arm_neon_vshiftlu:
6248      if (Cnt == VT.getVectorElementType().getSizeInBits())
6249        VShiftOpc = ARMISD::VSHLLi;
6250      else
6251        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
6252                     ARMISD::VSHLLs : ARMISD::VSHLLu);
6253      break;
6254    case Intrinsic::arm_neon_vshiftn:
6255      VShiftOpc = ARMISD::VSHRN; break;
6256    case Intrinsic::arm_neon_vrshifts:
6257      VShiftOpc = ARMISD::VRSHRs; break;
6258    case Intrinsic::arm_neon_vrshiftu:
6259      VShiftOpc = ARMISD::VRSHRu; break;
6260    case Intrinsic::arm_neon_vrshiftn:
6261      VShiftOpc = ARMISD::VRSHRN; break;
6262    case Intrinsic::arm_neon_vqshifts:
6263      VShiftOpc = ARMISD::VQSHLs; break;
6264    case Intrinsic::arm_neon_vqshiftu:
6265      VShiftOpc = ARMISD::VQSHLu; break;
6266    case Intrinsic::arm_neon_vqshiftsu:
6267      VShiftOpc = ARMISD::VQSHLsu; break;
6268    case Intrinsic::arm_neon_vqshiftns:
6269      VShiftOpc = ARMISD::VQSHRNs; break;
6270    case Intrinsic::arm_neon_vqshiftnu:
6271      VShiftOpc = ARMISD::VQSHRNu; break;
6272    case Intrinsic::arm_neon_vqshiftnsu:
6273      VShiftOpc = ARMISD::VQSHRNsu; break;
6274    case Intrinsic::arm_neon_vqrshiftns:
6275      VShiftOpc = ARMISD::VQRSHRNs; break;
6276    case Intrinsic::arm_neon_vqrshiftnu:
6277      VShiftOpc = ARMISD::VQRSHRNu; break;
6278    case Intrinsic::arm_neon_vqrshiftnsu:
6279      VShiftOpc = ARMISD::VQRSHRNsu; break;
6280    }
6281
6282    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
6283                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
6284  }
6285
6286  case Intrinsic::arm_neon_vshiftins: {
6287    EVT VT = N->getOperand(1).getValueType();
6288    int64_t Cnt;
6289    unsigned VShiftOpc = 0;
6290
6291    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
6292      VShiftOpc = ARMISD::VSLI;
6293    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
6294      VShiftOpc = ARMISD::VSRI;
6295    else {
6296      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
6297    }
6298
6299    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
6300                       N->getOperand(1), N->getOperand(2),
6301                       DAG.getConstant(Cnt, MVT::i32));
6302  }
6303
6304  case Intrinsic::arm_neon_vqrshifts:
6305  case Intrinsic::arm_neon_vqrshiftu:
6306    // No immediate versions of these to check for.
6307    break;
6308  }
6309
6310  return SDValue();
6311}
6312
6313/// PerformShiftCombine - Checks for immediate versions of vector shifts and
6314/// lowers them.  As with the vector shift intrinsics, this is done during DAG
6315/// combining instead of DAG legalizing because the build_vectors for 64-bit
6316/// vector element shift counts are generally not legal, and it is hard to see
6317/// their values after they get legalized to loads from a constant pool.
6318static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
6319                                   const ARMSubtarget *ST) {
6320  EVT VT = N->getValueType(0);
6321
6322  // Nothing to be done for scalar shifts.
6323  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6324  if (!VT.isVector() || !TLI.isTypeLegal(VT))
6325    return SDValue();
6326
6327  assert(ST->hasNEON() && "unexpected vector shift");
6328  int64_t Cnt;
6329
6330  switch (N->getOpcode()) {
6331  default: llvm_unreachable("unexpected shift opcode");
6332
6333  case ISD::SHL:
6334    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6335      return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
6336                         DAG.getConstant(Cnt, MVT::i32));
6337    break;
6338
6339  case ISD::SRA:
6340  case ISD::SRL:
6341    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6342      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
6343                            ARMISD::VSHRs : ARMISD::VSHRu);
6344      return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
6345                         DAG.getConstant(Cnt, MVT::i32));
6346    }
6347  }
6348  return SDValue();
6349}
6350
6351/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
6352/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
6353static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
6354                                    const ARMSubtarget *ST) {
6355  SDValue N0 = N->getOperand(0);
6356
6357  // Check for sign- and zero-extensions of vector extract operations of 8-
6358  // and 16-bit vector elements.  NEON supports these directly.  They are
6359  // handled during DAG combining because type legalization will promote them
6360  // to 32-bit types and it is messy to recognize the operations after that.
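  // E.g. (sext (extract_vector_elt v8i16 %v, 2)) becomes a VGETLANEs node,
  // which can select to a single "vmov.s16 r0, d0[2]"-style lane move.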
6361  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
6362    SDValue Vec = N0.getOperand(0);
6363    SDValue Lane = N0.getOperand(1);
6364    EVT VT = N->getValueType(0);
6365    EVT EltVT = N0.getValueType();
6366    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6367
6368    if (VT == MVT::i32 &&
6369        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
6370        TLI.isTypeLegal(Vec.getValueType()) &&
6371        isa<ConstantSDNode>(Lane)) {
6372
6373      unsigned Opc = 0;
6374      switch (N->getOpcode()) {
6375      default: llvm_unreachable("unexpected opcode");
6376      case ISD::SIGN_EXTEND:
6377        Opc = ARMISD::VGETLANEs;
6378        break;
6379      case ISD::ZERO_EXTEND:
6380      case ISD::ANY_EXTEND:
6381        Opc = ARMISD::VGETLANEu;
6382        break;
6383      }
6384      return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
6385    }
6386  }
6387
6388  return SDValue();
6389}
6390
6391/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
6392/// to match f32 max/min patterns to use NEON vmax/vmin instructions.
6393static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
6394                                       const ARMSubtarget *ST) {
6395  // If the target supports NEON, try to use vmax/vmin instructions for f32
6396  // selects like "x < y ? x : y".  Unless the NoNaNsFPMath option is set,
6397  // be careful about NaNs:  NEON's vmax/vmin return NaN if either operand is
6398  // a NaN; only do the transformation when it matches that behavior.
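  //
  // Sketch: for "x < y ? x : y" with an ordered compare, if x were NaN the
  // select would yield y while vmin.f32 would yield NaN, so the fold below
  // requires x to be known non-NaN (and y for the unordered variants).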
6399
6400  // For now only do this when using NEON for FP operations; if using VFP, it
6401  // is not obvious that the benefit outweighs the cost of switching to the
6402  // NEON pipeline.
6403  if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
6404      N->getValueType(0) != MVT::f32)
6405    return SDValue();
6406
6407  SDValue CondLHS = N->getOperand(0);
6408  SDValue CondRHS = N->getOperand(1);
6409  SDValue LHS = N->getOperand(2);
6410  SDValue RHS = N->getOperand(3);
6411  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
6412
6413  unsigned Opcode = 0;
6414  bool IsReversed;
6415  if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) {
6416    IsReversed = false; // x CC y ? x : y
6417  } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) {
6418    IsReversed = true;  // x CC y ? y : x
6419  } else {
6420    return SDValue();
6421  }
6422
6423  bool IsUnordered;
6424  switch (CC) {
6425  default: break;
6426  case ISD::SETOLT:
6427  case ISD::SETOLE:
6428  case ISD::SETLT:
6429  case ISD::SETLE:
6430  case ISD::SETULT:
6431  case ISD::SETULE:
6432    // If LHS is NaN, an ordered comparison will be false and the result will
6433    // be the RHS, but vmin(NaN, RHS) = NaN.  Avoid this by checking that LHS
6434    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
6435    IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE);
6436    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
6437      break;
6438    // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin
6439    // will return -0, so vmin can only be used for unsafe math or if one of
6440    // the operands is known to be nonzero.
6441    if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) &&
6442        !UnsafeFPMath &&
6443        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
6444      break;
6445    Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
6446    break;
6447
6448  case ISD::SETOGT:
6449  case ISD::SETOGE:
6450  case ISD::SETGT:
6451  case ISD::SETGE:
6452  case ISD::SETUGT:
6453  case ISD::SETUGE:
6454    // If LHS is NaN, an ordered comparison will be false and the result will
6455    // be the RHS, but vmax(NaN, RHS) = NaN.  Avoid this by checking that LHS
6456    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
6457    IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE);
6458    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
6459      break;
6460    // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax
6461    // will return +0, so vmax can only be used for unsafe math or if one of
6462    // the operands is known to be nonzero.
6463    if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) &&
6464        !UnsafeFPMath &&
6465        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
6466      break;
6467    Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
6468    break;
6469  }
6470
6471  if (!Opcode)
6472    return SDValue();
6473  return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS);
6474}
6475
6476SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
6477                                             DAGCombinerInfo &DCI) const {
6478  switch (N->getOpcode()) {
6479  default: break;
6480  case ISD::ADD:        return PerformADDCombine(N, DCI);
6481  case ISD::SUB:        return PerformSUBCombine(N, DCI);
6482  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
6483  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
6484  case ISD::AND:        return PerformANDCombine(N, DCI);
6485  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
6486  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
6487  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
6488  case ISD::STORE:      return PerformSTORECombine(N, DCI);
6489  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI);
6490  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
6491  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
6492  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
6493  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
6494  case ISD::SHL:
6495  case ISD::SRA:
6496  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
6497  case ISD::SIGN_EXTEND:
6498  case ISD::ZERO_EXTEND:
6499  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
6500  case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
6501  case ARMISD::VLD2DUP:
6502  case ARMISD::VLD3DUP:
6503  case ARMISD::VLD4DUP:
6504    return CombineBaseUpdate(N, DCI);
6505  case ISD::INTRINSIC_VOID:
6506  case ISD::INTRINSIC_W_CHAIN:
6507    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
6508    case Intrinsic::arm_neon_vld1:
6509    case Intrinsic::arm_neon_vld2:
6510    case Intrinsic::arm_neon_vld3:
6511    case Intrinsic::arm_neon_vld4:
6512    case Intrinsic::arm_neon_vld2lane:
6513    case Intrinsic::arm_neon_vld3lane:
6514    case Intrinsic::arm_neon_vld4lane:
6515    case Intrinsic::arm_neon_vst1:
6516    case Intrinsic::arm_neon_vst2:
6517    case Intrinsic::arm_neon_vst3:
6518    case Intrinsic::arm_neon_vst4:
6519    case Intrinsic::arm_neon_vst2lane:
6520    case Intrinsic::arm_neon_vst3lane:
6521    case Intrinsic::arm_neon_vst4lane:
6522      return CombineBaseUpdate(N, DCI);
6523    default: break;
6524    }
6525    break;
6526  }
6527  return SDValue();
6528}
6529
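/// isDesirableToTransformToIntegerOp - Only f32 loads and stores are worth
/// transforming into integer operations; this presumably lets values that
/// are merely copied through memory stay out of the VFP register file.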
6530bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
6531                                                          EVT VT) const {
6532  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
6533}
6534
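/// allowsUnalignedMemoryAccesses - Unaligned i8, i16 and i32 accesses are
/// allowed whenever the subtarget permits unaligned memory; all other types
/// still require natural alignment (but see the FIXME below for vectors).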
6535bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
6536  if (!Subtarget->allowsUnalignedMem())
6537    return false;
6538
6539  switch (VT.getSimpleVT().SimpleTy) {
6540  default:
6541    return false;
6542  case MVT::i8:
6543  case MVT::i16:
6544  case MVT::i32:
6545    return true;
6546  // FIXME: VLD1 etc with standard alignment is legal.
6547  }
6548}
6549
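/// isLegalT1AddressImmediate - Thumb1 loads and stores take an unsigned
/// 5-bit immediate offset scaled by the access size, e.g. 0-31 for i8 but
/// 0-124 in steps of 4 for i32.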
6550static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
6551  if (V < 0)
6552    return false;
6553
6554  unsigned Scale = 1;
6555  switch (VT.getSimpleVT().SimpleTy) {
6556  default: return false;
6557  case MVT::i1:
6558  case MVT::i8:
6559    // Scale == 1.
6560    break;
6561  case MVT::i16:
6562    // Scale == 2.
6563    Scale = 2;
6564    break;
6565  case MVT::i32:
6566    // Scale == 4.
6567    Scale = 4;
6568    break;
6569  }
6570
6571  if ((V & (Scale - 1)) != 0)
6572    return false;
6573  V /= Scale;
6574  return V == (V & ((1LL << 5) - 1));
6575}
6576
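/// isLegalT2AddressImmediate - Thumb2 integer loads and stores accept a
/// +imm12 or -imm8 offset; VFP loads and stores accept an 8-bit immediate
/// scaled by 4 in either direction.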
6577static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
6578                                      const ARMSubtarget *Subtarget) {
6579  bool isNeg = false;
6580  if (V < 0) {
6581    isNeg = true;
6582    V = - V;
6583  }
6584
6585  switch (VT.getSimpleVT().SimpleTy) {
6586  default: return false;
6587  case MVT::i1:
6588  case MVT::i8:
6589  case MVT::i16:
6590  case MVT::i32:
6591    // + imm12 or - imm8
6592    if (isNeg)
6593      return V == (V & ((1LL << 8) - 1));
6594    return V == (V & ((1LL << 12) - 1));
6595  case MVT::f32:
6596  case MVT::f64:
6597    // Same as ARM mode. FIXME: NEON?
6598    if (!Subtarget->hasVFP2())
6599      return false;
6600    if ((V & 3) != 0)
6601      return false;
6602    V >>= 2;
6603    return V == (V & ((1LL << 8) - 1));
6604  }
6605}
6606
6607/// isLegalAddressImmediate - Return true if the integer value can be used
6608/// as the offset of the target addressing mode for load / store of the
6609/// given type.
6610static bool isLegalAddressImmediate(int64_t V, EVT VT,
6611                                    const ARMSubtarget *Subtarget) {
6612  if (V == 0)
6613    return true;
6614
6615  if (!VT.isSimple())
6616    return false;
6617
6618  if (Subtarget->isThumb1Only())
6619    return isLegalT1AddressImmediate(V, VT);
6620  else if (Subtarget->isThumb2())
6621    return isLegalT2AddressImmediate(V, VT, Subtarget);
6622
6623  // ARM mode.
6624  if (V < 0)
6625    V = - V;
6626  switch (VT.getSimpleVT().SimpleTy) {
6627  default: return false;
6628  case MVT::i1:
6629  case MVT::i8:
6630  case MVT::i32:
6631    // +- imm12
6632    return V == (V & ((1LL << 12) - 1));
6633  case MVT::i16:
6634    // +- imm8
6635    return V == (V & ((1LL << 8) - 1));
6636  case MVT::f32:
6637  case MVT::f64:
6638    if (!Subtarget->hasVFP2()) // FIXME: NEON?
6639      return false;
6640    if ((V & 3) != 0)
6641      return false;
6642    V >>= 2;
6643    return V == (V & ((1LL << 8) - 1));
6644  }
6645}
6646
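/// isLegalT2ScaledAddressingMode - Return true if the scaled addressing
/// mode AM (reg + reg << imm) is legal in Thumb2 mode for a load / store
/// of the given type.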
6647bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
6648                                                      EVT VT) const {
6649  int Scale = AM.Scale;
6650  if (Scale < 0)
6651    return false;
6652
6653  switch (VT.getSimpleVT().SimpleTy) {
6654  default: return false;
6655  case MVT::i1:
6656  case MVT::i8:
6657  case MVT::i16:
6658  case MVT::i32:
6659    if (Scale == 1)
6660      return true;
6661    // r + r << imm
6662    Scale = Scale & ~1;
6663    return Scale == 2 || Scale == 4 || Scale == 8;
6664  case MVT::i64:
6665    // r + r
6666    if (((unsigned)AM.HasBaseReg + Scale) <= 2)
6667      return true;
6668    return false;
6669  case MVT::isVoid:
6670    // Note, we allow "void" uses (basically, uses that aren't loads or
6671  // stores), because ARM allows folding a scale into many arithmetic
6672    // operations.  This should be made more precise and revisited later.
6673
6674    // Allow r << imm, but the imm has to be a multiple of two.
6675    if (Scale & 1) return false;
6676    return isPowerOf2_32(Scale);
6677  }
6678}
6679
6680/// isLegalAddressingMode - Return true if the addressing mode represented
6681/// by AM is legal for this target, for a load/store of the specified type.
6682bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
6683                                              const Type *Ty) const {
6684  EVT VT = getValueType(Ty, true);
6685  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
6686    return false;
6687
6688  // Can never fold addr of global into load/store.
6689  if (AM.BaseGV)
6690    return false;
6691
6692  switch (AM.Scale) {
6693  case 0:  // no scale reg, must be "r+i" or "r", or "i".
6694    break;
6695  case 1:
6696    if (Subtarget->isThumb1Only())
6697      return false;
6698    // FALL THROUGH.
6699  default:
6700    // ARM doesn't support any R+R*scale+imm addr modes.
6701    if (AM.BaseOffs)
6702      return false;
6703
6704    if (!VT.isSimple())
6705      return false;
6706
6707    if (Subtarget->isThumb2())
6708      return isLegalT2ScaledAddressingMode(AM, VT);
6709
6710    int Scale = AM.Scale;
6711    switch (VT.getSimpleVT().SimpleTy) {
6712    default: return false;
6713    case MVT::i1:
6714    case MVT::i8:
6715    case MVT::i32:
6716      if (Scale < 0) Scale = -Scale;
6717      if (Scale == 1)
6718        return true;
6719      // r + r << imm
6720      return isPowerOf2_32(Scale & ~1);
6721    case MVT::i16:
6722    case MVT::i64:
6723      // r + r
6724      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
6725        return true;
6726      return false;
6727
6728    case MVT::isVoid:
6729      // Note, we allow "void" uses (basically, uses that aren't loads or
6730      // stores), because ARM allows folding a scale into many arithmetic
6731      // operations.  This should be made more precise and revisited later.
6732
6733      // Allow r << imm, but the imm has to be a multiple of two.
6734      if (Scale & 1) return false;
6735      return isPowerOf2_32(Scale);
6736    }
6737    break;
6738  }
6739  return true;
6740}
6741
6742/// isLegalICmpImmediate - Return true if the specified immediate is a legal
6743/// icmp immediate, i.e., the target has icmp instructions that can compare
6744/// a register against the immediate without having to materialize the
6745/// immediate into a register.
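/// For example, ARM mode accepts any 8-bit value rotated right by an even
/// number of bits (e.g. 0xab0000), while Thumb1 only accepts 0-255.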
6746bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
6747  if (!Subtarget->isThumb())
6748    return ARM_AM::getSOImmVal(Imm) != -1;
6749  if (Subtarget->isThumb2())
6750    return ARM_AM::getT2SOImmVal(Imm) != -1;
6751  return Imm >= 0 && Imm <= 255;
6752}
6753
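/// getARMIndexedAddressParts - Split an (add/sub base, offset) pointer into
/// the base and offset of an ARM indexed load / store. i16 and sign-extended
/// i8 / i1 accesses use addressing mode 3 (reg +/- imm8 or reg); the other
/// integer types use addressing mode 2 (reg +/- imm12 or shifted reg).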
6754static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
6755                                      bool isSEXTLoad, SDValue &Base,
6756                                      SDValue &Offset, bool &isInc,
6757                                      SelectionDAG &DAG) {
6758  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
6759    return false;
6760
6761  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
6762    // AddressingMode 3
6763    Base = Ptr->getOperand(0);
6764    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
6765      int RHSC = (int)RHS->getZExtValue();
6766      if (RHSC < 0 && RHSC > -256) {
6767        assert(Ptr->getOpcode() == ISD::ADD);
6768        isInc = false;
6769        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
6770        return true;
6771      }
6772    }
6773    isInc = (Ptr->getOpcode() == ISD::ADD);
6774    Offset = Ptr->getOperand(1);
6775    return true;
6776  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
6777    // AddressingMode 2
6778    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
6779      int RHSC = (int)RHS->getZExtValue();
6780      if (RHSC < 0 && RHSC > -0x1000) {
6781        assert(Ptr->getOpcode() == ISD::ADD);
6782        isInc = false;
6783        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
6784        Base = Ptr->getOperand(0);
6785        return true;
6786      }
6787    }
6788
6789    if (Ptr->getOpcode() == ISD::ADD) {
6790      isInc = true;
6791      ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0));
6792      if (ShOpcVal != ARM_AM::no_shift) {
6793        Base = Ptr->getOperand(1);
6794        Offset = Ptr->getOperand(0);
6795      } else {
6796        Base = Ptr->getOperand(0);
6797        Offset = Ptr->getOperand(1);
6798      }
6799      return true;
6800    }
6801
6802    isInc = (Ptr->getOpcode() == ISD::ADD);
6803    Base = Ptr->getOperand(0);
6804    Offset = Ptr->getOperand(1);
6805    return true;
6806  }
6807
6808  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
6809  return false;
6810}
6811
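/// getT2IndexedAddressParts - Split an (add/sub base, imm) pointer into the
/// base and offset of a Thumb2 indexed load / store, which accepts an 8-bit
/// immediate offset in either direction (but not zero).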
6812static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
6813                                     bool isSEXTLoad, SDValue &Base,
6814                                     SDValue &Offset, bool &isInc,
6815                                     SelectionDAG &DAG) {
6816  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
6817    return false;
6818
6819  Base = Ptr->getOperand(0);
6820  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
6821    int RHSC = (int)RHS->getZExtValue();
6822    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
6823      assert(Ptr->getOpcode() == ISD::ADD);
6824      isInc = false;
6825      Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
6826      return true;
6827    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
6828      isInc = Ptr->getOpcode() == ISD::ADD;
6829      Offset = DAG.getConstant(RHSC, RHS->getValueType(0));
6830      return true;
6831    }
6832  }
6833
6834  return false;
6835}
6836
6837/// getPreIndexedAddressParts - Return true if the node's address can be
6838/// legally represented as a pre-indexed load / store address, setting the
6839/// base pointer, offset, and addressing mode by reference.
6840bool
6841ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
6842                                             SDValue &Offset,
6843                                             ISD::MemIndexedMode &AM,
6844                                             SelectionDAG &DAG) const {
6845  if (Subtarget->isThumb1Only())
6846    return false;
6847
6848  EVT VT;
6849  SDValue Ptr;
6850  bool isSEXTLoad = false;
6851  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
6852    Ptr = LD->getBasePtr();
6853    VT  = LD->getMemoryVT();
6854    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
6855  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
6856    Ptr = ST->getBasePtr();
6857    VT  = ST->getMemoryVT();
6858  } else
6859    return false;
6860
6861  bool isInc;
6862  bool isLegal = false;
6863  if (Subtarget->isThumb2())
6864    isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
6865                                       Offset, isInc, DAG);
6866  else
6867    isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
6868                                        Offset, isInc, DAG);
6869  if (!isLegal)
6870    return false;
6871
6872  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
6873  return true;
6874}
6875
6876/// getPostIndexedAddressParts - Return true if this node can be combined
6877/// with a load / store to form a post-indexed load / store, setting the
6878/// base pointer, offset, and addressing mode by reference.
6879bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
6880                                                   SDValue &Base,
6881                                                   SDValue &Offset,
6882                                                   ISD::MemIndexedMode &AM,
6883                                                   SelectionDAG &DAG) const {
6884  if (Subtarget->isThumb1Only())
6885    return false;
6886
6887  EVT VT;
6888  SDValue Ptr;
6889  bool isSEXTLoad = false;
6890  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
6891    VT  = LD->getMemoryVT();
6892    Ptr = LD->getBasePtr();
6893    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
6894  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
6895    VT  = ST->getMemoryVT();
6896    Ptr = ST->getBasePtr();
6897  } else
6898    return false;
6899
6900  bool isInc;
6901  bool isLegal = false;
6902  if (Subtarget->isThumb2())
6903    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
6904                                       isInc, DAG);
6905  else
6906    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
6907                                        isInc, DAG);
6908  if (!isLegal)
6909    return false;
6910
6911  if (Ptr != Base) {
6912    // Swap base ptr and offset to catch more post-index load / store when
6913    // it's legal. In Thumb2 mode, offset must be an immediate.
6914    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
6915        !Subtarget->isThumb2())
6916      std::swap(Base, Offset);
6917
6918    // Post-indexed load / store update the base pointer.
6919    if (Ptr != Base)
6920      return false;
6921  }
6922
6923  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
6924  return true;
6925}
6926
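/// computeMaskedBitsForTargetNode - Determine which bits of an ARMISD node
/// are known to be zero or one. For CMOV a bit is known only if it is known
/// to have the same value on both operands, since either one may be chosen.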
6927void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
6928                                                       const APInt &Mask,
6929                                                       APInt &KnownZero,
6930                                                       APInt &KnownOne,
6931                                                       const SelectionDAG &DAG,
6932                                                       unsigned Depth) const {
6933  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
6934  switch (Op.getOpcode()) {
6935  default: break;
6936  case ARMISD::CMOV: {
6937    // Bits are known zero/one if known on the LHS and RHS.
6938    DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
6939    if (KnownZero == 0 && KnownOne == 0) return;
6940
6941    APInt KnownZeroRHS, KnownOneRHS;
6942    DAG.ComputeMaskedBits(Op.getOperand(1), Mask,
6943                          KnownZeroRHS, KnownOneRHS, Depth+1);
6944    KnownZero &= KnownZeroRHS;
6945    KnownOne  &= KnownOneRHS;
6946    return;
6947  }
6948  }
6949}
6950
6951//===----------------------------------------------------------------------===//
6952//                           ARM Inline Assembly Support
6953//===----------------------------------------------------------------------===//
6954
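/// ExpandInlineAsm - Recognize a single-instruction "rev $0, $1" asm block,
/// e.g. asm("rev $0, $1" : "=l"(Out) : "l"(In)), and replace it with a call
/// to the llvm.bswap intrinsic so it can be selected and optimized like any
/// other byte swap. REV requires ARMv6 or later.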
6955bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
6956  // Looking for "rev" which is V6+.
6957  if (!Subtarget->hasV6Ops())
6958    return false;
6959
6960  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
6961  std::string AsmStr = IA->getAsmString();
6962  SmallVector<StringRef, 4> AsmPieces;
6963  SplitString(AsmStr, AsmPieces, ";\n");
6964
6965  switch (AsmPieces.size()) {
6966  default: return false;
6967  case 1:
6968    AsmStr = AsmPieces[0];
6969    AsmPieces.clear();
6970    SplitString(AsmStr, AsmPieces, " \t,");
6971
6972    // rev $0, $1
6973    if (AsmPieces.size() == 3 &&
6974        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
6975        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
6976      const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
6977      if (Ty && Ty->getBitWidth() == 32)
6978        return IntrinsicLowering::LowerToByteSwap(CI);
6979    }
6980    break;
6981  }
6982
6983  return false;
6984}
6985
6986/// getConstraintType - Given a constraint letter, return the type of
6987/// constraint it is for this target.
6988ARMTargetLowering::ConstraintType
6989ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
6990  if (Constraint.size() == 1) {
6991    switch (Constraint[0]) {
6992    default:  break;
6993    case 'l': return C_RegisterClass;
6994    case 'w': return C_RegisterClass;
6995    }
6996  }
6997  return TargetLowering::getConstraintType(Constraint);
6998}
6999
7000/// Examine constraint type and operand type and determine a weight value.
7001/// This object must already have been set up with the operand type
7002/// and the current alternative constraint selected.
7003TargetLowering::ConstraintWeight
7004ARMTargetLowering::getSingleConstraintMatchWeight(
7005    AsmOperandInfo &info, const char *constraint) const {
7006  ConstraintWeight weight = CW_Invalid;
7007  Value *CallOperandVal = info.CallOperandVal;
7008  // If we don't have a value, we can't do a match,
7009  // but allow it at the lowest weight.
7010  if (CallOperandVal == NULL)
7011    return CW_Default;
7012  const Type *type = CallOperandVal->getType();
7013  // Look at the constraint type.
7014  switch (*constraint) {
7015  default:
7016    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
7017    break;
7018  case 'l':
7019    if (type->isIntegerTy()) {
7020      if (Subtarget->isThumb())
7021        weight = CW_SpecificReg;
7022      else
7023        weight = CW_Register;
7024    }
7025    break;
7026  case 'w':
7027    if (type->isFloatingPointTy())
7028      weight = CW_Register;
7029    break;
7030  }
7031  return weight;
7032}
7033
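/// getRegForInlineAsmConstraint - Map the GCC register constraints 'l', 'r'
/// and 'w' (and the explicit "{cc}" register) onto ARM register classes:
/// low GPRs for 'l' in Thumb mode, the full GPR set for 'r', and SPR / DPR /
/// QPR for 'w' depending on the operand width.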
7034std::pair<unsigned, const TargetRegisterClass*>
7035ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
7036                                                EVT VT) const {
7037  if (Constraint.size() == 1) {
7038    // GCC ARM Constraint Letters
7039    switch (Constraint[0]) {
7040    case 'l':
7041      if (Subtarget->isThumb())
7042        return std::make_pair(0U, ARM::tGPRRegisterClass);
7043      else
7044        return std::make_pair(0U, ARM::GPRRegisterClass);
7045    case 'r':
7046      return std::make_pair(0U, ARM::GPRRegisterClass);
7047    case 'w':
7048      if (VT == MVT::f32)
7049        return std::make_pair(0U, ARM::SPRRegisterClass);
7050      if (VT.getSizeInBits() == 64)
7051        return std::make_pair(0U, ARM::DPRRegisterClass);
7052      if (VT.getSizeInBits() == 128)
7053        return std::make_pair(0U, ARM::QPRRegisterClass);
7054      break;
7055    }
7056  }
7057  if (StringRef("{cc}").equals_lower(Constraint))
7058    return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass);
7059
7060  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
7061}
7062
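/// getRegClassForInlineAsmConstraint - Return the explicit list of registers
/// that may be used to satisfy the 'l', 'r' and 'w' constraints.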
7063std::vector<unsigned> ARMTargetLowering::
7064getRegClassForInlineAsmConstraint(const std::string &Constraint,
7065                                  EVT VT) const {
7066  if (Constraint.size() != 1)
7067    return std::vector<unsigned>();
7068
7069  switch (Constraint[0]) {      // GCC ARM Constraint Letters
7070  default: break;
7071  case 'l':
7072    return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3,
7073                                 ARM::R4, ARM::R5, ARM::R6, ARM::R7,
7074                                 0);
7075  case 'r':
7076    return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3,
7077                                 ARM::R4, ARM::R5, ARM::R6, ARM::R7,
7078                                 ARM::R8, ARM::R9, ARM::R10, ARM::R11,
7079                                 ARM::R12, ARM::LR, 0);
7080  case 'w':
7081    if (VT == MVT::f32)
7082      return make_vector<unsigned>(ARM::S0, ARM::S1, ARM::S2, ARM::S3,
7083                                   ARM::S4, ARM::S5, ARM::S6, ARM::S7,
7084                                   ARM::S8, ARM::S9, ARM::S10, ARM::S11,
7085                                   ARM::S12,ARM::S13,ARM::S14,ARM::S15,
7086                                   ARM::S16,ARM::S17,ARM::S18,ARM::S19,
7087                                   ARM::S20,ARM::S21,ARM::S22,ARM::S23,
7088                                   ARM::S24,ARM::S25,ARM::S26,ARM::S27,
7089                                   ARM::S28,ARM::S29,ARM::S30,ARM::S31, 0);
7090    if (VT.getSizeInBits() == 64)
7091      return make_vector<unsigned>(ARM::D0, ARM::D1, ARM::D2, ARM::D3,
7092                                   ARM::D4, ARM::D5, ARM::D6, ARM::D7,
7093                                   ARM::D8, ARM::D9, ARM::D10,ARM::D11,
7094                                   ARM::D12,ARM::D13,ARM::D14,ARM::D15, 0);
7095    if (VT.getSizeInBits() == 128)
7096      return make_vector<unsigned>(ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3,
7097                                   ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7, 0);
7098    break;
7099  }
7100
7101  return std::vector<unsigned>();
7102}
7103
7104/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
7105/// vector.  If it is invalid, don't add anything to Ops.
7106void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
7107                                                     char Constraint,
7108                                                     std::vector<SDValue>&Ops,
7109                                                     SelectionDAG &DAG) const {
7110  SDValue Result(0, 0);
7111
7112  switch (Constraint) {
7113  default: break;
7114  case 'I': case 'J': case 'K': case 'L':
7115  case 'M': case 'N': case 'O':
7116    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
7117    if (!C)
7118      return;
7119
7120    int64_t CVal64 = C->getSExtValue();
7121    int CVal = (int) CVal64;
7122    // None of these constraints allow values larger than 32 bits.  Check
7123    // that the value fits in an int.
7124    if (CVal != CVal64)
7125      return;
7126
7127    switch (Constraint) {
7128      case 'I':
7129        if (Subtarget->isThumb1Only()) {
7130          // This must be a constant between 0 and 255, for ADD
7131          // immediates.
7132          if (CVal >= 0 && CVal <= 255)
7133            break;
7134        } else if (Subtarget->isThumb2()) {
7135          // A constant that can be used as an immediate value in a
7136          // data-processing instruction.
7137          if (ARM_AM::getT2SOImmVal(CVal) != -1)
7138            break;
7139        } else {
7140          // A constant that can be used as an immediate value in a
7141          // data-processing instruction.
7142          if (ARM_AM::getSOImmVal(CVal) != -1)
7143            break;
7144        }
7145        return;
7146
7147      case 'J':
7148        if (Subtarget->isThumb()) {  // FIXME thumb2
7149          // This must be a constant between -255 and -1, for negated ADD
7150          // immediates. This can be used in GCC with an "n" modifier that
7151          // prints the negated value, for use with SUB instructions. It is
7152          // not useful otherwise but is implemented for compatibility.
7153          if (CVal >= -255 && CVal <= -1)
7154            break;
7155        } else {
7156          // This must be a constant between -4095 and 4095. It is not clear
7157          // what this constraint is intended for. Implemented for
7158          // compatibility with GCC.
7159          if (CVal >= -4095 && CVal <= 4095)
7160            break;
7161        }
7162        return;
7163
7164      case 'K':
7165        if (Subtarget->isThumb1Only()) {
7166          // A 32-bit value where only one byte has a nonzero value. Exclude
7167          // zero to match GCC. This constraint is used by GCC internally for
7168          // constants that can be loaded with a move/shift combination.
7169          // It is not useful otherwise but is implemented for compatibility.
7170          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
7171            break;
7172        } else if (Subtarget->isThumb2()) {
7173          // A constant whose bitwise inverse can be used as an immediate
7174          // value in a data-processing instruction. This can be used in GCC
7175          // with a "B" modifier that prints the inverted value, for use with
7176          // BIC and MVN instructions. It is not useful otherwise but is
7177          // implemented for compatibility.
7178          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
7179            break;
7180        } else {
7181          // A constant whose bitwise inverse can be used as an immediate
7182          // value in a data-processing instruction. This can be used in GCC
7183          // with a "B" modifier that prints the inverted value, for use with
7184          // BIC and MVN instructions. It is not useful otherwise but is
7185          // implemented for compatibility.
7186          if (ARM_AM::getSOImmVal(~CVal) != -1)
7187            break;
7188        }
7189        return;
7190
7191      case 'L':
7192        if (Subtarget->isThumb1Only()) {
7193          // This must be a constant between -7 and 7,
7194          // for 3-operand ADD/SUB immediate instructions.
7195          if (CVal >= -7 && CVal <= 7)
7196            break;
7197        } else if (Subtarget->isThumb2()) {
7198          // A constant whose negation can be used as an immediate value in a
7199          // data-processing instruction. This can be used in GCC with an "n"
7200          // modifier that prints the negated value, for use with SUB
7201          // instructions. It is not useful otherwise but is implemented for
7202          // compatibility.
7203          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
7204            break;
7205        } else {
7206          // A constant whose negation can be used as an immediate value in a
7207          // data-processing instruction. This can be used in GCC with an "n"
7208          // modifier that prints the negated value, for use with SUB
7209          // instructions. It is not useful otherwise but is implemented for
7210          // compatibility.
7211          if (ARM_AM::getSOImmVal(-CVal) != -1)
7212            break;
7213        }
7214        return;
7215
7216      case 'M':
7217        if (Subtarget->isThumb()) { // FIXME thumb2
7218          // This must be a multiple of 4 between 0 and 1020, for
7219          // ADD sp + immediate.
7220          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
7221            break;
7222        } else {
7223          // A power of two or a constant between 0 and 32.  This is used in
7224          // GCC for the shift amount on shifted register operands, but it is
7225          // useful in general for any shift amounts.
7226          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
7227            break;
7228        }
7229        return;
7230
7231      case 'N':
7232        if (Subtarget->isThumb()) {  // FIXME thumb2
7233          // This must be a constant between 0 and 31, for shift amounts.
7234          if (CVal >= 0 && CVal <= 31)
7235            break;
7236        }
7237        return;
7238
7239      case 'O':
7240        if (Subtarget->isThumb()) {  // FIXME thumb2
7241          // This must be a multiple of 4 between -508 and 508, for
7242          // ADD/SUB sp = sp + immediate.
7243          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
7244            break;
7245        }
7246        return;
7247    }
7248    Result = DAG.getTargetConstant(CVal, Op.getValueType());
7249    break;
7250  }
7251
7252  if (Result.getNode()) {
7253    Ops.push_back(Result);
7254    return;
7255  }
7256  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
7257}
7258
7259bool
7260ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7261  // The ARM target isn't yet aware of offsets.
7262  return false;
7263}
7264
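/// getVFPf32Imm - Return the 8-bit encoding of FPImm for a VFPv3 FCONSTS
/// (vmov.f32 #imm), or -1 if the value cannot be encoded. The immediate
/// represents (-1)^s * (16+efgh)/16 * 2^exp with exp in [-3, 4], so e.g.
/// 1.0f (0x3f800000) encodes as 0x70.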
7265int ARM::getVFPf32Imm(const APFloat &FPImm) {
7266  APInt Imm = FPImm.bitcastToAPInt();
7267  uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
7268  int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127;  // -126 to 127
7269  int64_t Mantissa = Imm.getZExtValue() & 0x7fffff;  // 23 bits
7270
7271  // We can handle 4 bits of mantissa.
7272  // mantissa = (16+UInt(e:f:g:h))/16.
7273  if (Mantissa & 0x7ffff)
7274    return -1;
7275  Mantissa >>= 19;
7276  if ((Mantissa & 0xf) != Mantissa)
7277    return -1;
7278
7279  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
7280  if (Exp < -3 || Exp > 4)
7281    return -1;
7282  Exp = ((Exp+3) & 0x7) ^ 4;
7283
7284  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
7285}
7286
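/// getVFPf64Imm - Return the 8-bit encoding of FPImm for a VFPv3 FCONSTD
/// (vmov.f64 #imm), or -1 if the value cannot be encoded; this is the same
/// 1-bit sign, 3-bit exponent, 4-bit mantissa format as the f32 case above.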
7287int ARM::getVFPf64Imm(const APFloat &FPImm) {
7288  APInt Imm = FPImm.bitcastToAPInt();
7289  uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
7290  int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023;   // -1022 to 1023
7291  uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffLL;
7292
7293  // We can handle 4 bits of mantissa.
7294  // mantissa = (16+UInt(e:f:g:h))/16.
7295  if (Mantissa & 0xffffffffffffLL)
7296    return -1;
7297  Mantissa >>= 48;
7298  if ((Mantissa & 0xf) != Mantissa)
7299    return -1;
7300
7301  // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
7302  if (Exp < -3 || Exp > 4)
7303    return -1;
7304  Exp = ((Exp+3) & 0x7) ^ 4;
7305
7306  return ((int)Sign << 7) | (Exp << 4) | Mantissa;
7307}
7308
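/// isBitFieldInvertedMask - Return true if the complement of v is a single
/// contiguous run of set bits, i.e. v is all 1's except for a contiguous
/// block of 0's, e.g. 0xff0000ff. Masks of this form are matched when
/// forming bitfield clear (BFC) and insert (BFI) instructions.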
7309bool ARM::isBitFieldInvertedMask(unsigned v) {
7310  if (v == 0xffffffff)
7311    return false;
7312  // There can be 1's on either or both "outsides"; all the "inside"
7313  // bits must be 0's.
7314  unsigned int lsb = 0, msb = 31;
7315  while (v & (1 << msb)) --msb;
7316  while (v & (1 << lsb)) ++lsb;
7317  for (unsigned int i = lsb; i <= msb; ++i) {
7318    if (v & (1 << i))
7319      return false;
7320  }
7321  return true;
7322}
7323
7324/// isFPImmLegal - Returns true if the target can instruction select the
7325/// specified FP immediate natively. If false, the legalizer will
7326/// materialize the FP immediate as a load from a constant pool.
7327bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
7328  if (!Subtarget->hasVFP3())
7329    return false;
7330  if (VT == MVT::f32)
7331    return ARM::getVFPf32Imm(Imm) != -1;
7332  if (VT == MVT::f64)
7333    return ARM::getVFPf64Imm(Imm) != -1;
7334  return false;
7335}
7336
7337/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
7338/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
7339/// specified in the intrinsic calls.
7340bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
7341                                           const CallInst &I,
7342                                           unsigned Intrinsic) const {
7343  switch (Intrinsic) {
7344  case Intrinsic::arm_neon_vld1:
7345  case Intrinsic::arm_neon_vld2:
7346  case Intrinsic::arm_neon_vld3:
7347  case Intrinsic::arm_neon_vld4:
7348  case Intrinsic::arm_neon_vld2lane:
7349  case Intrinsic::arm_neon_vld3lane:
7350  case Intrinsic::arm_neon_vld4lane: {
7351    Info.opc = ISD::INTRINSIC_W_CHAIN;
7352    // Conservatively set memVT to the entire set of vectors loaded.
7353    uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8;
7354    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
7355    Info.ptrVal = I.getArgOperand(0);
7356    Info.offset = 0;
7357    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
7358    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
7359    Info.vol = false; // volatile loads with NEON intrinsics not supported
7360    Info.readMem = true;
7361    Info.writeMem = false;
7362    return true;
7363  }
7364  case Intrinsic::arm_neon_vst1:
7365  case Intrinsic::arm_neon_vst2:
7366  case Intrinsic::arm_neon_vst3:
7367  case Intrinsic::arm_neon_vst4:
7368  case Intrinsic::arm_neon_vst2lane:
7369  case Intrinsic::arm_neon_vst3lane:
7370  case Intrinsic::arm_neon_vst4lane: {
7371    Info.opc = ISD::INTRINSIC_VOID;
7372    // Conservatively set memVT to the entire set of vectors stored.
7373    unsigned NumElts = 0;
7374    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
7375      const Type *ArgTy = I.getArgOperand(ArgI)->getType();
7376      if (!ArgTy->isVectorTy())
7377        break;
7378      NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8;
7379    }
7380    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
7381    Info.ptrVal = I.getArgOperand(0);
7382    Info.offset = 0;
7383    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
7384    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
7385    Info.vol = false; // volatile stores with NEON intrinsics not supported
7386    Info.readMem = false;
7387    Info.writeMem = true;
7388    return true;
7389  }
7390  default:
7391    break;
7392  }
7393
7394  return false;
7395}
7396