AArch64ISelLowering.cpp revision 36c7806f4eacd676932ba630246f88e0e37b1cd4
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that AArch64 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "aarch64-isel" 16#include "AArch64.h" 17#include "AArch64ISelLowering.h" 18#include "AArch64MachineFunctionInfo.h" 19#include "AArch64TargetMachine.h" 20#include "AArch64TargetObjectFile.h" 21#include "Utils/AArch64BaseInfo.h" 22#include "llvm/CodeGen/Analysis.h" 23#include "llvm/CodeGen/CallingConvLower.h" 24#include "llvm/CodeGen/MachineFrameInfo.h" 25#include "llvm/CodeGen/MachineInstrBuilder.h" 26#include "llvm/CodeGen/MachineRegisterInfo.h" 27#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" 28#include "llvm/IR/CallingConv.h" 29 30using namespace llvm; 31 32static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) { 33 const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>(); 34 35 if (Subtarget->isTargetLinux()) 36 return new AArch64LinuxTargetObjectFile(); 37 if (Subtarget->isTargetELF()) 38 return new TargetLoweringObjectFileELF(); 39 llvm_unreachable("unknown subtarget type"); 40} 41 42AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) 43 : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) { 44 45 const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>(); 46 47 // SIMD compares set the entire lane's bits to 1 48 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 49 50 // Scalar register <-> type mapping 51 addRegisterClass(MVT::i32, &AArch64::GPR32RegClass); 52 addRegisterClass(MVT::i64, &AArch64::GPR64RegClass); 53 54 if (Subtarget->hasFPARMv8()) { 55 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); 56 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); 57 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); 58 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); 59 } 60 61 if (Subtarget->hasNEON()) { 62 // And the vectors 63 addRegisterClass(MVT::v1i8, &AArch64::FPR8RegClass); 64 addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass); 65 addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass); 66 addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass); 67 addRegisterClass(MVT::v1f32, &AArch64::FPR32RegClass); 68 addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass); 69 addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass); 70 addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass); 71 addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass); 72 addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass); 73 addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass); 74 addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass); 75 addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass); 76 addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass); 77 addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass); 78 addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass); 79 addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass); 80 } 81 82 computeRegisterProperties(); 83 84 // We combine OR nodes for bitfield and NEON BSL operations. 85 setTargetDAGCombine(ISD::OR); 86 87 setTargetDAGCombine(ISD::AND); 88 setTargetDAGCombine(ISD::SRA); 89 setTargetDAGCombine(ISD::SRL); 90 setTargetDAGCombine(ISD::SHL); 91 92 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 93 setTargetDAGCombine(ISD::INTRINSIC_VOID); 94 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 95 96 // AArch64 does not have i1 loads, or much of anything for i1 really. 97 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 98 setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); 99 setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); 100 101 setStackPointerRegisterToSaveRestore(AArch64::XSP); 102 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 103 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 104 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 105 106 // We'll lower globals to wrappers for selection. 107 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 108 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 109 110 // A64 instructions have the comparison predicate attached to the user of the 111 // result, but having a separate comparison is valuable for matching. 112 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 113 setOperationAction(ISD::BR_CC, MVT::i64, Custom); 114 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 115 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 116 117 setOperationAction(ISD::SELECT, MVT::i32, Custom); 118 setOperationAction(ISD::SELECT, MVT::i64, Custom); 119 setOperationAction(ISD::SELECT, MVT::f32, Custom); 120 setOperationAction(ISD::SELECT, MVT::f64, Custom); 121 122 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 123 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); 124 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 125 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 126 127 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 128 129 setOperationAction(ISD::SETCC, MVT::i32, Custom); 130 setOperationAction(ISD::SETCC, MVT::i64, Custom); 131 setOperationAction(ISD::SETCC, MVT::f32, Custom); 132 setOperationAction(ISD::SETCC, MVT::f64, Custom); 133 134 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 135 setOperationAction(ISD::JumpTable, MVT::i32, Custom); 136 setOperationAction(ISD::JumpTable, MVT::i64, Custom); 137 138 setOperationAction(ISD::VASTART, MVT::Other, Custom); 139 setOperationAction(ISD::VACOPY, MVT::Other, Custom); 140 setOperationAction(ISD::VAEND, MVT::Other, Expand); 141 setOperationAction(ISD::VAARG, MVT::Other, Expand); 142 143 setOperationAction(ISD::BlockAddress, MVT::i64, Custom); 144 145 setOperationAction(ISD::ROTL, MVT::i32, Expand); 146 setOperationAction(ISD::ROTL, MVT::i64, Expand); 147 148 setOperationAction(ISD::UREM, MVT::i32, Expand); 149 setOperationAction(ISD::UREM, MVT::i64, Expand); 150 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 151 setOperationAction(ISD::UDIVREM, MVT::i64, Expand); 152 153 setOperationAction(ISD::SREM, MVT::i32, Expand); 154 setOperationAction(ISD::SREM, MVT::i64, Expand); 155 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 156 setOperationAction(ISD::SDIVREM, MVT::i64, Expand); 157 158 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 159 setOperationAction(ISD::CTPOP, MVT::i64, Expand); 160 161 // Legal floating-point operations. 162 setOperationAction(ISD::FABS, MVT::f32, Legal); 163 setOperationAction(ISD::FABS, MVT::f64, Legal); 164 165 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 166 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 167 168 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 169 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 170 171 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 172 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 173 174 setOperationAction(ISD::FNEG, MVT::f32, Legal); 175 setOperationAction(ISD::FNEG, MVT::f64, Legal); 176 177 setOperationAction(ISD::FRINT, MVT::f32, Legal); 178 setOperationAction(ISD::FRINT, MVT::f64, Legal); 179 180 setOperationAction(ISD::FSQRT, MVT::f32, Legal); 181 setOperationAction(ISD::FSQRT, MVT::f64, Legal); 182 183 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 184 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 185 186 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 187 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 188 setOperationAction(ISD::ConstantFP, MVT::f128, Legal); 189 190 // Illegal floating-point operations. 191 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 192 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 193 194 setOperationAction(ISD::FCOS, MVT::f32, Expand); 195 setOperationAction(ISD::FCOS, MVT::f64, Expand); 196 197 setOperationAction(ISD::FEXP, MVT::f32, Expand); 198 setOperationAction(ISD::FEXP, MVT::f64, Expand); 199 200 setOperationAction(ISD::FEXP2, MVT::f32, Expand); 201 setOperationAction(ISD::FEXP2, MVT::f64, Expand); 202 203 setOperationAction(ISD::FLOG, MVT::f32, Expand); 204 setOperationAction(ISD::FLOG, MVT::f64, Expand); 205 206 setOperationAction(ISD::FLOG2, MVT::f32, Expand); 207 setOperationAction(ISD::FLOG2, MVT::f64, Expand); 208 209 setOperationAction(ISD::FLOG10, MVT::f32, Expand); 210 setOperationAction(ISD::FLOG10, MVT::f64, Expand); 211 212 setOperationAction(ISD::FPOW, MVT::f32, Expand); 213 setOperationAction(ISD::FPOW, MVT::f64, Expand); 214 215 setOperationAction(ISD::FPOWI, MVT::f32, Expand); 216 setOperationAction(ISD::FPOWI, MVT::f64, Expand); 217 218 setOperationAction(ISD::FREM, MVT::f32, Expand); 219 setOperationAction(ISD::FREM, MVT::f64, Expand); 220 221 setOperationAction(ISD::FSIN, MVT::f32, Expand); 222 setOperationAction(ISD::FSIN, MVT::f64, Expand); 223 224 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 225 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 226 227 // Virtually no operation on f128 is legal, but LLVM can't expand them when 228 // there's a valid register class, so we need custom operations in most cases. 229 setOperationAction(ISD::FABS, MVT::f128, Expand); 230 setOperationAction(ISD::FADD, MVT::f128, Custom); 231 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); 232 setOperationAction(ISD::FCOS, MVT::f128, Expand); 233 setOperationAction(ISD::FDIV, MVT::f128, Custom); 234 setOperationAction(ISD::FMA, MVT::f128, Expand); 235 setOperationAction(ISD::FMUL, MVT::f128, Custom); 236 setOperationAction(ISD::FNEG, MVT::f128, Expand); 237 setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand); 238 setOperationAction(ISD::FP_ROUND, MVT::f128, Expand); 239 setOperationAction(ISD::FPOW, MVT::f128, Expand); 240 setOperationAction(ISD::FREM, MVT::f128, Expand); 241 setOperationAction(ISD::FRINT, MVT::f128, Expand); 242 setOperationAction(ISD::FSIN, MVT::f128, Expand); 243 setOperationAction(ISD::FSINCOS, MVT::f128, Expand); 244 setOperationAction(ISD::FSQRT, MVT::f128, Expand); 245 setOperationAction(ISD::FSUB, MVT::f128, Custom); 246 setOperationAction(ISD::FTRUNC, MVT::f128, Expand); 247 setOperationAction(ISD::SETCC, MVT::f128, Custom); 248 setOperationAction(ISD::BR_CC, MVT::f128, Custom); 249 setOperationAction(ISD::SELECT, MVT::f128, Expand); 250 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); 251 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); 252 253 // Lowering for many of the conversions is actually specified by the non-f128 254 // type. The LowerXXX function will be trivial when f128 isn't involved. 255 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 256 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 257 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); 258 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 259 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 260 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); 261 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 262 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 263 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); 264 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 265 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 266 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); 267 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 268 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); 269 270 // This prevents LLVM trying to compress double constants into a floating 271 // constant-pool entry and trying to load from there. It's of doubtful benefit 272 // for A64: we'd need LDR followed by FCVT, I believe. 273 setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); 274 setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); 275 setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); 276 277 setTruncStoreAction(MVT::f128, MVT::f64, Expand); 278 setTruncStoreAction(MVT::f128, MVT::f32, Expand); 279 setTruncStoreAction(MVT::f128, MVT::f16, Expand); 280 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 281 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 282 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 283 284 setExceptionPointerRegister(AArch64::X0); 285 setExceptionSelectorRegister(AArch64::X1); 286 287 if (Subtarget->hasNEON()) { 288 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom); 289 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); 290 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); 291 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom); 292 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); 293 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); 294 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom); 295 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); 296 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); 297 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); 298 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 299 setOperationAction(ISD::BUILD_VECTOR, MVT::v1f32, Custom); 300 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); 301 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 302 setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom); 303 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 304 305 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); 306 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); 307 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); 308 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); 309 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); 310 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom); 311 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); 312 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 313 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom); 314 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 315 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom); 316 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 317 318 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal); 319 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal); 320 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal); 321 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal); 322 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal); 323 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal); 324 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal); 325 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal); 326 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal); 327 328 setOperationAction(ISD::SETCC, MVT::v8i8, Custom); 329 setOperationAction(ISD::SETCC, MVT::v16i8, Custom); 330 setOperationAction(ISD::SETCC, MVT::v4i16, Custom); 331 setOperationAction(ISD::SETCC, MVT::v8i16, Custom); 332 setOperationAction(ISD::SETCC, MVT::v2i32, Custom); 333 setOperationAction(ISD::SETCC, MVT::v4i32, Custom); 334 setOperationAction(ISD::SETCC, MVT::v1i64, Custom); 335 setOperationAction(ISD::SETCC, MVT::v2i64, Custom); 336 setOperationAction(ISD::SETCC, MVT::v1f32, Custom); 337 setOperationAction(ISD::SETCC, MVT::v2f32, Custom); 338 setOperationAction(ISD::SETCC, MVT::v4f32, Custom); 339 setOperationAction(ISD::SETCC, MVT::v1f64, Custom); 340 setOperationAction(ISD::SETCC, MVT::v2f64, Custom); 341 342 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal); 343 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); 344 setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); 345 346 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal); 347 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); 348 setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); 349 350 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal); 351 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); 352 setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); 353 354 setOperationAction(ISD::FRINT, MVT::v2f32, Legal); 355 setOperationAction(ISD::FRINT, MVT::v4f32, Legal); 356 setOperationAction(ISD::FRINT, MVT::v2f64, Legal); 357 358 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal); 359 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); 360 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); 361 362 setOperationAction(ISD::FROUND, MVT::v2f32, Legal); 363 setOperationAction(ISD::FROUND, MVT::v4f32, Legal); 364 setOperationAction(ISD::FROUND, MVT::v2f64, Legal); 365 } 366} 367 368EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { 369 // It's reasonably important that this value matches the "natural" legal 370 // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself 371 // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64). 372 if (!VT.isVector()) return MVT::i32; 373 return VT.changeVectorElementTypeToInteger(); 374} 375 376static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord, 377 unsigned &LdrOpc, 378 unsigned &StrOpc) { 379 static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword, 380 AArch64::LDXR_word, AArch64::LDXR_dword}; 381 static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword, 382 AArch64::LDAXR_word, AArch64::LDAXR_dword}; 383 static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword, 384 AArch64::STXR_word, AArch64::STXR_dword}; 385 static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword, 386 AArch64::STLXR_word, AArch64::STLXR_dword}; 387 388 const unsigned *LoadOps, *StoreOps; 389 if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent) 390 LoadOps = LoadAcqs; 391 else 392 LoadOps = LoadBares; 393 394 if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) 395 StoreOps = StoreRels; 396 else 397 StoreOps = StoreBares; 398 399 assert(isPowerOf2_32(Size) && Size <= 8 && 400 "unsupported size for atomic binary op!"); 401 402 LdrOpc = LoadOps[Log2_32(Size)]; 403 StrOpc = StoreOps[Log2_32(Size)]; 404} 405 406MachineBasicBlock * 407AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, 408 unsigned Size, 409 unsigned BinOpcode) const { 410 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 411 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 412 413 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 414 MachineFunction *MF = BB->getParent(); 415 MachineFunction::iterator It = BB; 416 ++It; 417 418 unsigned dest = MI->getOperand(0).getReg(); 419 unsigned ptr = MI->getOperand(1).getReg(); 420 unsigned incr = MI->getOperand(2).getReg(); 421 AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm()); 422 DebugLoc dl = MI->getDebugLoc(); 423 424 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 425 426 unsigned ldrOpc, strOpc; 427 getExclusiveOperation(Size, Ord, ldrOpc, strOpc); 428 429 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 430 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 431 MF->insert(It, loopMBB); 432 MF->insert(It, exitMBB); 433 434 // Transfer the remainder of BB and its successor edges to exitMBB. 435 exitMBB->splice(exitMBB->begin(), BB, 436 llvm::next(MachineBasicBlock::iterator(MI)), 437 BB->end()); 438 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 439 440 const TargetRegisterClass *TRC 441 = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; 442 unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC); 443 444 // thisMBB: 445 // ... 446 // fallthrough --> loopMBB 447 BB->addSuccessor(loopMBB); 448 449 // loopMBB: 450 // ldxr dest, ptr 451 // <binop> scratch, dest, incr 452 // stxr stxr_status, scratch, ptr 453 // cbnz stxr_status, loopMBB 454 // fallthrough --> exitMBB 455 BB = loopMBB; 456 BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); 457 if (BinOpcode) { 458 // All arithmetic operations we'll be creating are designed to take an extra 459 // shift or extend operand, which we can conveniently set to zero. 460 461 // Operand order needs to go the other way for NAND. 462 if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl) 463 BuildMI(BB, dl, TII->get(BinOpcode), scratch) 464 .addReg(incr).addReg(dest).addImm(0); 465 else 466 BuildMI(BB, dl, TII->get(BinOpcode), scratch) 467 .addReg(dest).addReg(incr).addImm(0); 468 } 469 470 // From the stxr, the register is GPR32; from the cmp it's GPR32wsp 471 unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 472 MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); 473 474 BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr); 475 BuildMI(BB, dl, TII->get(AArch64::CBNZw)) 476 .addReg(stxr_status).addMBB(loopMBB); 477 478 BB->addSuccessor(loopMBB); 479 BB->addSuccessor(exitMBB); 480 481 // exitMBB: 482 // ... 483 BB = exitMBB; 484 485 MI->eraseFromParent(); // The instruction is gone now. 486 487 return BB; 488} 489 490MachineBasicBlock * 491AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI, 492 MachineBasicBlock *BB, 493 unsigned Size, 494 unsigned CmpOp, 495 A64CC::CondCodes Cond) const { 496 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 497 498 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 499 MachineFunction *MF = BB->getParent(); 500 MachineFunction::iterator It = BB; 501 ++It; 502 503 unsigned dest = MI->getOperand(0).getReg(); 504 unsigned ptr = MI->getOperand(1).getReg(); 505 unsigned incr = MI->getOperand(2).getReg(); 506 AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm()); 507 508 unsigned oldval = dest; 509 DebugLoc dl = MI->getDebugLoc(); 510 511 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 512 const TargetRegisterClass *TRC, *TRCsp; 513 if (Size == 8) { 514 TRC = &AArch64::GPR64RegClass; 515 TRCsp = &AArch64::GPR64xspRegClass; 516 } else { 517 TRC = &AArch64::GPR32RegClass; 518 TRCsp = &AArch64::GPR32wspRegClass; 519 } 520 521 unsigned ldrOpc, strOpc; 522 getExclusiveOperation(Size, Ord, ldrOpc, strOpc); 523 524 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 525 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 526 MF->insert(It, loopMBB); 527 MF->insert(It, exitMBB); 528 529 // Transfer the remainder of BB and its successor edges to exitMBB. 530 exitMBB->splice(exitMBB->begin(), BB, 531 llvm::next(MachineBasicBlock::iterator(MI)), 532 BB->end()); 533 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 534 535 unsigned scratch = MRI.createVirtualRegister(TRC); 536 MRI.constrainRegClass(scratch, TRCsp); 537 538 // thisMBB: 539 // ... 540 // fallthrough --> loopMBB 541 BB->addSuccessor(loopMBB); 542 543 // loopMBB: 544 // ldxr dest, ptr 545 // cmp incr, dest (, sign extend if necessary) 546 // csel scratch, dest, incr, cond 547 // stxr stxr_status, scratch, ptr 548 // cbnz stxr_status, loopMBB 549 // fallthrough --> exitMBB 550 BB = loopMBB; 551 BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); 552 553 // Build compare and cmov instructions. 554 MRI.constrainRegClass(incr, TRCsp); 555 BuildMI(BB, dl, TII->get(CmpOp)) 556 .addReg(incr).addReg(oldval).addImm(0); 557 558 BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc), 559 scratch) 560 .addReg(oldval).addReg(incr).addImm(Cond); 561 562 unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 563 MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); 564 565 BuildMI(BB, dl, TII->get(strOpc), stxr_status) 566 .addReg(scratch).addReg(ptr); 567 BuildMI(BB, dl, TII->get(AArch64::CBNZw)) 568 .addReg(stxr_status).addMBB(loopMBB); 569 570 BB->addSuccessor(loopMBB); 571 BB->addSuccessor(exitMBB); 572 573 // exitMBB: 574 // ... 575 BB = exitMBB; 576 577 MI->eraseFromParent(); // The instruction is gone now. 578 579 return BB; 580} 581 582MachineBasicBlock * 583AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI, 584 MachineBasicBlock *BB, 585 unsigned Size) const { 586 unsigned dest = MI->getOperand(0).getReg(); 587 unsigned ptr = MI->getOperand(1).getReg(); 588 unsigned oldval = MI->getOperand(2).getReg(); 589 unsigned newval = MI->getOperand(3).getReg(); 590 AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm()); 591 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 592 DebugLoc dl = MI->getDebugLoc(); 593 594 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 595 const TargetRegisterClass *TRCsp; 596 TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass; 597 598 unsigned ldrOpc, strOpc; 599 getExclusiveOperation(Size, Ord, ldrOpc, strOpc); 600 601 MachineFunction *MF = BB->getParent(); 602 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 603 MachineFunction::iterator It = BB; 604 ++It; // insert the new blocks after the current block 605 606 MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); 607 MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); 608 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 609 MF->insert(It, loop1MBB); 610 MF->insert(It, loop2MBB); 611 MF->insert(It, exitMBB); 612 613 // Transfer the remainder of BB and its successor edges to exitMBB. 614 exitMBB->splice(exitMBB->begin(), BB, 615 llvm::next(MachineBasicBlock::iterator(MI)), 616 BB->end()); 617 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 618 619 // thisMBB: 620 // ... 621 // fallthrough --> loop1MBB 622 BB->addSuccessor(loop1MBB); 623 624 // loop1MBB: 625 // ldxr dest, [ptr] 626 // cmp dest, oldval 627 // b.ne exitMBB 628 BB = loop1MBB; 629 BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); 630 631 unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl; 632 MRI.constrainRegClass(dest, TRCsp); 633 BuildMI(BB, dl, TII->get(CmpOp)) 634 .addReg(dest).addReg(oldval).addImm(0); 635 BuildMI(BB, dl, TII->get(AArch64::Bcc)) 636 .addImm(A64CC::NE).addMBB(exitMBB); 637 BB->addSuccessor(loop2MBB); 638 BB->addSuccessor(exitMBB); 639 640 // loop2MBB: 641 // strex stxr_status, newval, [ptr] 642 // cbnz stxr_status, loop1MBB 643 BB = loop2MBB; 644 unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 645 MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); 646 647 BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr); 648 BuildMI(BB, dl, TII->get(AArch64::CBNZw)) 649 .addReg(stxr_status).addMBB(loop1MBB); 650 BB->addSuccessor(loop1MBB); 651 BB->addSuccessor(exitMBB); 652 653 // exitMBB: 654 // ... 655 BB = exitMBB; 656 657 MI->eraseFromParent(); // The instruction is gone now. 658 659 return BB; 660} 661 662MachineBasicBlock * 663AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, 664 MachineBasicBlock *MBB) const { 665 // We materialise the F128CSEL pseudo-instruction using conditional branches 666 // and loads, giving an instruciton sequence like: 667 // str q0, [sp] 668 // b.ne IfTrue 669 // b Finish 670 // IfTrue: 671 // str q1, [sp] 672 // Finish: 673 // ldr q0, [sp] 674 // 675 // Using virtual registers would probably not be beneficial since COPY 676 // instructions are expensive for f128 (there's no actual instruction to 677 // implement them). 678 // 679 // An alternative would be to do an integer-CSEL on some address. E.g.: 680 // mov x0, sp 681 // add x1, sp, #16 682 // str q0, [x0] 683 // str q1, [x1] 684 // csel x0, x0, x1, ne 685 // ldr q0, [x0] 686 // 687 // It's unclear which approach is actually optimal. 688 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 689 MachineFunction *MF = MBB->getParent(); 690 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 691 DebugLoc DL = MI->getDebugLoc(); 692 MachineFunction::iterator It = MBB; 693 ++It; 694 695 unsigned DestReg = MI->getOperand(0).getReg(); 696 unsigned IfTrueReg = MI->getOperand(1).getReg(); 697 unsigned IfFalseReg = MI->getOperand(2).getReg(); 698 unsigned CondCode = MI->getOperand(3).getImm(); 699 bool NZCVKilled = MI->getOperand(4).isKill(); 700 701 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); 702 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); 703 MF->insert(It, TrueBB); 704 MF->insert(It, EndBB); 705 706 // Transfer rest of current basic-block to EndBB 707 EndBB->splice(EndBB->begin(), MBB, 708 llvm::next(MachineBasicBlock::iterator(MI)), 709 MBB->end()); 710 EndBB->transferSuccessorsAndUpdatePHIs(MBB); 711 712 // We need somewhere to store the f128 value needed. 713 int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16); 714 715 // [... start of incoming MBB ...] 716 // str qIFFALSE, [sp] 717 // b.cc IfTrue 718 // b Done 719 BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR)) 720 .addReg(IfFalseReg) 721 .addFrameIndex(ScratchFI) 722 .addImm(0); 723 BuildMI(MBB, DL, TII->get(AArch64::Bcc)) 724 .addImm(CondCode) 725 .addMBB(TrueBB); 726 BuildMI(MBB, DL, TII->get(AArch64::Bimm)) 727 .addMBB(EndBB); 728 MBB->addSuccessor(TrueBB); 729 MBB->addSuccessor(EndBB); 730 731 if (!NZCVKilled) { 732 // NZCV is live-through TrueBB. 733 TrueBB->addLiveIn(AArch64::NZCV); 734 EndBB->addLiveIn(AArch64::NZCV); 735 } 736 737 // IfTrue: 738 // str qIFTRUE, [sp] 739 BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR)) 740 .addReg(IfTrueReg) 741 .addFrameIndex(ScratchFI) 742 .addImm(0); 743 744 // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the 745 // blocks. 746 TrueBB->addSuccessor(EndBB); 747 748 // Done: 749 // ldr qDEST, [sp] 750 // [... rest of incoming MBB ...] 751 MachineInstr *StartOfEnd = EndBB->begin(); 752 BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg) 753 .addFrameIndex(ScratchFI) 754 .addImm(0); 755 756 MI->eraseFromParent(); 757 return EndBB; 758} 759 760MachineBasicBlock * 761AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 762 MachineBasicBlock *MBB) const { 763 switch (MI->getOpcode()) { 764 default: llvm_unreachable("Unhandled instruction with custom inserter"); 765 case AArch64::F128CSEL: 766 return EmitF128CSEL(MI, MBB); 767 case AArch64::ATOMIC_LOAD_ADD_I8: 768 return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl); 769 case AArch64::ATOMIC_LOAD_ADD_I16: 770 return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl); 771 case AArch64::ATOMIC_LOAD_ADD_I32: 772 return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl); 773 case AArch64::ATOMIC_LOAD_ADD_I64: 774 return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl); 775 776 case AArch64::ATOMIC_LOAD_SUB_I8: 777 return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl); 778 case AArch64::ATOMIC_LOAD_SUB_I16: 779 return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl); 780 case AArch64::ATOMIC_LOAD_SUB_I32: 781 return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl); 782 case AArch64::ATOMIC_LOAD_SUB_I64: 783 return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl); 784 785 case AArch64::ATOMIC_LOAD_AND_I8: 786 return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl); 787 case AArch64::ATOMIC_LOAD_AND_I16: 788 return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl); 789 case AArch64::ATOMIC_LOAD_AND_I32: 790 return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl); 791 case AArch64::ATOMIC_LOAD_AND_I64: 792 return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl); 793 794 case AArch64::ATOMIC_LOAD_OR_I8: 795 return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl); 796 case AArch64::ATOMIC_LOAD_OR_I16: 797 return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl); 798 case AArch64::ATOMIC_LOAD_OR_I32: 799 return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl); 800 case AArch64::ATOMIC_LOAD_OR_I64: 801 return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl); 802 803 case AArch64::ATOMIC_LOAD_XOR_I8: 804 return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl); 805 case AArch64::ATOMIC_LOAD_XOR_I16: 806 return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl); 807 case AArch64::ATOMIC_LOAD_XOR_I32: 808 return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl); 809 case AArch64::ATOMIC_LOAD_XOR_I64: 810 return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl); 811 812 case AArch64::ATOMIC_LOAD_NAND_I8: 813 return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl); 814 case AArch64::ATOMIC_LOAD_NAND_I16: 815 return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl); 816 case AArch64::ATOMIC_LOAD_NAND_I32: 817 return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl); 818 case AArch64::ATOMIC_LOAD_NAND_I64: 819 return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl); 820 821 case AArch64::ATOMIC_LOAD_MIN_I8: 822 return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT); 823 case AArch64::ATOMIC_LOAD_MIN_I16: 824 return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT); 825 case AArch64::ATOMIC_LOAD_MIN_I32: 826 return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT); 827 case AArch64::ATOMIC_LOAD_MIN_I64: 828 return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT); 829 830 case AArch64::ATOMIC_LOAD_MAX_I8: 831 return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT); 832 case AArch64::ATOMIC_LOAD_MAX_I16: 833 return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT); 834 case AArch64::ATOMIC_LOAD_MAX_I32: 835 return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT); 836 case AArch64::ATOMIC_LOAD_MAX_I64: 837 return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT); 838 839 case AArch64::ATOMIC_LOAD_UMIN_I8: 840 return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI); 841 case AArch64::ATOMIC_LOAD_UMIN_I16: 842 return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI); 843 case AArch64::ATOMIC_LOAD_UMIN_I32: 844 return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI); 845 case AArch64::ATOMIC_LOAD_UMIN_I64: 846 return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI); 847 848 case AArch64::ATOMIC_LOAD_UMAX_I8: 849 return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO); 850 case AArch64::ATOMIC_LOAD_UMAX_I16: 851 return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO); 852 case AArch64::ATOMIC_LOAD_UMAX_I32: 853 return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO); 854 case AArch64::ATOMIC_LOAD_UMAX_I64: 855 return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO); 856 857 case AArch64::ATOMIC_SWAP_I8: 858 return emitAtomicBinary(MI, MBB, 1, 0); 859 case AArch64::ATOMIC_SWAP_I16: 860 return emitAtomicBinary(MI, MBB, 2, 0); 861 case AArch64::ATOMIC_SWAP_I32: 862 return emitAtomicBinary(MI, MBB, 4, 0); 863 case AArch64::ATOMIC_SWAP_I64: 864 return emitAtomicBinary(MI, MBB, 8, 0); 865 866 case AArch64::ATOMIC_CMP_SWAP_I8: 867 return emitAtomicCmpSwap(MI, MBB, 1); 868 case AArch64::ATOMIC_CMP_SWAP_I16: 869 return emitAtomicCmpSwap(MI, MBB, 2); 870 case AArch64::ATOMIC_CMP_SWAP_I32: 871 return emitAtomicCmpSwap(MI, MBB, 4); 872 case AArch64::ATOMIC_CMP_SWAP_I64: 873 return emitAtomicCmpSwap(MI, MBB, 8); 874 } 875} 876 877 878const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { 879 switch (Opcode) { 880 case AArch64ISD::BR_CC: return "AArch64ISD::BR_CC"; 881 case AArch64ISD::Call: return "AArch64ISD::Call"; 882 case AArch64ISD::FPMOV: return "AArch64ISD::FPMOV"; 883 case AArch64ISD::GOTLoad: return "AArch64ISD::GOTLoad"; 884 case AArch64ISD::BFI: return "AArch64ISD::BFI"; 885 case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; 886 case AArch64ISD::Ret: return "AArch64ISD::Ret"; 887 case AArch64ISD::SBFX: return "AArch64ISD::SBFX"; 888 case AArch64ISD::SELECT_CC: return "AArch64ISD::SELECT_CC"; 889 case AArch64ISD::SETCC: return "AArch64ISD::SETCC"; 890 case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; 891 case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; 892 case AArch64ISD::TLSDESCCALL: return "AArch64ISD::TLSDESCCALL"; 893 case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; 894 case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall"; 895 896 case AArch64ISD::NEON_BSL: 897 return "AArch64ISD::NEON_BSL"; 898 case AArch64ISD::NEON_MOVIMM: 899 return "AArch64ISD::NEON_MOVIMM"; 900 case AArch64ISD::NEON_MVNIMM: 901 return "AArch64ISD::NEON_MVNIMM"; 902 case AArch64ISD::NEON_FMOVIMM: 903 return "AArch64ISD::NEON_FMOVIMM"; 904 case AArch64ISD::NEON_CMP: 905 return "AArch64ISD::NEON_CMP"; 906 case AArch64ISD::NEON_CMPZ: 907 return "AArch64ISD::NEON_CMPZ"; 908 case AArch64ISD::NEON_TST: 909 return "AArch64ISD::NEON_TST"; 910 case AArch64ISD::NEON_QSHLs: 911 return "AArch64ISD::NEON_QSHLs"; 912 case AArch64ISD::NEON_QSHLu: 913 return "AArch64ISD::NEON_QSHLu"; 914 case AArch64ISD::NEON_VDUP: 915 return "AArch64ISD::NEON_VDUP"; 916 case AArch64ISD::NEON_VDUPLANE: 917 return "AArch64ISD::NEON_VDUPLANE"; 918 case AArch64ISD::NEON_REV16: 919 return "AArch64ISD::NEON_REV16"; 920 case AArch64ISD::NEON_REV32: 921 return "AArch64ISD::NEON_REV32"; 922 case AArch64ISD::NEON_REV64: 923 return "AArch64ISD::NEON_REV64"; 924 case AArch64ISD::NEON_LD1_UPD: 925 return "AArch64ISD::NEON_LD1_UPD"; 926 case AArch64ISD::NEON_LD2_UPD: 927 return "AArch64ISD::NEON_LD2_UPD"; 928 case AArch64ISD::NEON_LD3_UPD: 929 return "AArch64ISD::NEON_LD3_UPD"; 930 case AArch64ISD::NEON_LD4_UPD: 931 return "AArch64ISD::NEON_LD4_UPD"; 932 case AArch64ISD::NEON_ST1_UPD: 933 return "AArch64ISD::NEON_ST1_UPD"; 934 case AArch64ISD::NEON_ST2_UPD: 935 return "AArch64ISD::NEON_ST2_UPD"; 936 case AArch64ISD::NEON_ST3_UPD: 937 return "AArch64ISD::NEON_ST3_UPD"; 938 case AArch64ISD::NEON_ST4_UPD: 939 return "AArch64ISD::NEON_ST4_UPD"; 940 case AArch64ISD::NEON_LD1x2_UPD: 941 return "AArch64ISD::NEON_LD1x2_UPD"; 942 case AArch64ISD::NEON_LD1x3_UPD: 943 return "AArch64ISD::NEON_LD1x3_UPD"; 944 case AArch64ISD::NEON_LD1x4_UPD: 945 return "AArch64ISD::NEON_LD1x4_UPD"; 946 case AArch64ISD::NEON_ST1x2_UPD: 947 return "AArch64ISD::NEON_ST1x2_UPD"; 948 case AArch64ISD::NEON_ST1x3_UPD: 949 return "AArch64ISD::NEON_ST1x3_UPD"; 950 case AArch64ISD::NEON_ST1x4_UPD: 951 return "AArch64ISD::NEON_ST1x4_UPD"; 952 case AArch64ISD::NEON_LD2DUP: 953 return "AArch64ISD::NEON_LD2DUP"; 954 case AArch64ISD::NEON_LD3DUP: 955 return "AArch64ISD::NEON_LD3DUP"; 956 case AArch64ISD::NEON_LD4DUP: 957 return "AArch64ISD::NEON_LD4DUP"; 958 case AArch64ISD::NEON_LD2DUP_UPD: 959 return "AArch64ISD::NEON_LD2DUP_UPD"; 960 case AArch64ISD::NEON_LD3DUP_UPD: 961 return "AArch64ISD::NEON_LD3DUP_UPD"; 962 case AArch64ISD::NEON_LD4DUP_UPD: 963 return "AArch64ISD::NEON_LD4DUP_UPD"; 964 case AArch64ISD::NEON_LD2LN_UPD: 965 return "AArch64ISD::NEON_LD2LN_UPD"; 966 case AArch64ISD::NEON_LD3LN_UPD: 967 return "AArch64ISD::NEON_LD3LN_UPD"; 968 case AArch64ISD::NEON_LD4LN_UPD: 969 return "AArch64ISD::NEON_LD4LN_UPD"; 970 case AArch64ISD::NEON_ST2LN_UPD: 971 return "AArch64ISD::NEON_ST2LN_UPD"; 972 case AArch64ISD::NEON_ST3LN_UPD: 973 return "AArch64ISD::NEON_ST3LN_UPD"; 974 case AArch64ISD::NEON_ST4LN_UPD: 975 return "AArch64ISD::NEON_ST4LN_UPD"; 976 case AArch64ISD::NEON_VEXTRACT: 977 return "AArch64ISD::NEON_VEXTRACT"; 978 default: 979 return NULL; 980 } 981} 982 983static const uint16_t AArch64FPRArgRegs[] = { 984 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 985 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7 986}; 987static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs); 988 989static const uint16_t AArch64ArgRegs[] = { 990 AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, 991 AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 992}; 993static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs); 994 995static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT, 996 CCValAssign::LocInfo LocInfo, 997 ISD::ArgFlagsTy ArgFlags, CCState &State) { 998 // Mark all remaining general purpose registers as allocated. We don't 999 // backtrack: if (for example) an i128 gets put on the stack, no subsequent 1000 // i64 will go in registers (C.11). 1001 for (unsigned i = 0; i < NumArgRegs; ++i) 1002 State.AllocateReg(AArch64ArgRegs[i]); 1003 1004 return false; 1005} 1006 1007#include "AArch64GenCallingConv.inc" 1008 1009CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1010 1011 switch(CC) { 1012 default: llvm_unreachable("Unsupported calling convention"); 1013 case CallingConv::Fast: 1014 case CallingConv::C: 1015 return CC_A64_APCS; 1016 } 1017} 1018 1019void 1020AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, 1021 SDLoc DL, SDValue &Chain) const { 1022 MachineFunction &MF = DAG.getMachineFunction(); 1023 MachineFrameInfo *MFI = MF.getFrameInfo(); 1024 AArch64MachineFunctionInfo *FuncInfo 1025 = MF.getInfo<AArch64MachineFunctionInfo>(); 1026 1027 SmallVector<SDValue, 8> MemOps; 1028 1029 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs, 1030 NumArgRegs); 1031 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs, 1032 NumFPRArgRegs); 1033 1034 unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR); 1035 int GPRIdx = 0; 1036 if (GPRSaveSize != 0) { 1037 GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); 1038 1039 SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); 1040 1041 for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) { 1042 unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass); 1043 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 1044 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, 1045 MachinePointerInfo::getStack(i * 8), 1046 false, false, 0); 1047 MemOps.push_back(Store); 1048 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, 1049 DAG.getConstant(8, getPointerTy())); 1050 } 1051 } 1052 1053 if (getSubtarget()->hasFPARMv8()) { 1054 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 1055 int FPRIdx = 0; 1056 // According to the AArch64 Procedure Call Standard, section B.1/B.3, we 1057 // can omit a register save area if we know we'll never use registers of 1058 // that class. 1059 if (FPRSaveSize != 0) { 1060 FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); 1061 1062 SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); 1063 1064 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 1065 unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i], 1066 &AArch64::FPR128RegClass); 1067 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 1068 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, 1069 MachinePointerInfo::getStack(i * 16), 1070 false, false, 0); 1071 MemOps.push_back(Store); 1072 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, 1073 DAG.getConstant(16, getPointerTy())); 1074 } 1075 } 1076 FuncInfo->setVariadicFPRIdx(FPRIdx); 1077 FuncInfo->setVariadicFPRSize(FPRSaveSize); 1078 } 1079 1080 int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true); 1081 1082 FuncInfo->setVariadicStackIdx(StackIdx); 1083 FuncInfo->setVariadicGPRIdx(GPRIdx); 1084 FuncInfo->setVariadicGPRSize(GPRSaveSize); 1085 1086 if (!MemOps.empty()) { 1087 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], 1088 MemOps.size()); 1089 } 1090} 1091 1092 1093SDValue 1094AArch64TargetLowering::LowerFormalArguments(SDValue Chain, 1095 CallingConv::ID CallConv, bool isVarArg, 1096 const SmallVectorImpl<ISD::InputArg> &Ins, 1097 SDLoc dl, SelectionDAG &DAG, 1098 SmallVectorImpl<SDValue> &InVals) const { 1099 MachineFunction &MF = DAG.getMachineFunction(); 1100 AArch64MachineFunctionInfo *FuncInfo 1101 = MF.getInfo<AArch64MachineFunctionInfo>(); 1102 MachineFrameInfo *MFI = MF.getFrameInfo(); 1103 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 1104 1105 SmallVector<CCValAssign, 16> ArgLocs; 1106 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1107 getTargetMachine(), ArgLocs, *DAG.getContext()); 1108 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1109 1110 SmallVector<SDValue, 16> ArgValues; 1111 1112 SDValue ArgValue; 1113 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1114 CCValAssign &VA = ArgLocs[i]; 1115 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1116 1117 if (Flags.isByVal()) { 1118 // Byval is used for small structs and HFAs in the PCS, but the system 1119 // should work in a non-compliant manner for larger structs. 1120 EVT PtrTy = getPointerTy(); 1121 int Size = Flags.getByValSize(); 1122 unsigned NumRegs = (Size + 7) / 8; 1123 1124 unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs, 1125 VA.getLocMemOffset(), 1126 false); 1127 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); 1128 InVals.push_back(FrameIdxN); 1129 1130 continue; 1131 } else if (VA.isRegLoc()) { 1132 MVT RegVT = VA.getLocVT(); 1133 const TargetRegisterClass *RC = getRegClassFor(RegVT); 1134 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1135 1136 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1137 } else { // VA.isRegLoc() 1138 assert(VA.isMemLoc()); 1139 1140 int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 1141 VA.getLocMemOffset(), true); 1142 1143 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1144 ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, 1145 MachinePointerInfo::getFixedStack(FI), 1146 false, false, false, 0); 1147 1148 1149 } 1150 1151 switch (VA.getLocInfo()) { 1152 default: llvm_unreachable("Unknown loc info!"); 1153 case CCValAssign::Full: break; 1154 case CCValAssign::BCvt: 1155 ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue); 1156 break; 1157 case CCValAssign::SExt: 1158 case CCValAssign::ZExt: 1159 case CCValAssign::AExt: { 1160 unsigned DestSize = VA.getValVT().getSizeInBits(); 1161 unsigned DestSubReg; 1162 1163 switch (DestSize) { 1164 case 8: DestSubReg = AArch64::sub_8; break; 1165 case 16: DestSubReg = AArch64::sub_16; break; 1166 case 32: DestSubReg = AArch64::sub_32; break; 1167 case 64: DestSubReg = AArch64::sub_64; break; 1168 default: llvm_unreachable("Unexpected argument promotion"); 1169 } 1170 1171 ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, 1172 VA.getValVT(), ArgValue, 1173 DAG.getTargetConstant(DestSubReg, MVT::i32)), 1174 0); 1175 break; 1176 } 1177 } 1178 1179 InVals.push_back(ArgValue); 1180 } 1181 1182 if (isVarArg) 1183 SaveVarArgRegisters(CCInfo, DAG, dl, Chain); 1184 1185 unsigned StackArgSize = CCInfo.getNextStackOffset(); 1186 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 1187 // This is a non-standard ABI so by fiat I say we're allowed to make full 1188 // use of the stack area to be popped, which must be aligned to 16 bytes in 1189 // any case: 1190 StackArgSize = RoundUpToAlignment(StackArgSize, 16); 1191 1192 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 1193 // a multiple of 16. 1194 FuncInfo->setArgumentStackToRestore(StackArgSize); 1195 1196 // This realignment carries over to the available bytes below. Our own 1197 // callers will guarantee the space is free by giving an aligned value to 1198 // CALLSEQ_START. 1199 } 1200 // Even if we're not expected to free up the space, it's useful to know how 1201 // much is there while considering tail calls (because we can reuse it). 1202 FuncInfo->setBytesInStackArgArea(StackArgSize); 1203 1204 return Chain; 1205} 1206 1207SDValue 1208AArch64TargetLowering::LowerReturn(SDValue Chain, 1209 CallingConv::ID CallConv, bool isVarArg, 1210 const SmallVectorImpl<ISD::OutputArg> &Outs, 1211 const SmallVectorImpl<SDValue> &OutVals, 1212 SDLoc dl, SelectionDAG &DAG) const { 1213 // CCValAssign - represent the assignment of the return value to a location. 1214 SmallVector<CCValAssign, 16> RVLocs; 1215 1216 // CCState - Info about the registers and stack slots. 1217 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1218 getTargetMachine(), RVLocs, *DAG.getContext()); 1219 1220 // Analyze outgoing return values. 1221 CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv)); 1222 1223 SDValue Flag; 1224 SmallVector<SDValue, 4> RetOps(1, Chain); 1225 1226 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 1227 // PCS: "If the type, T, of the result of a function is such that 1228 // void func(T arg) would require that arg be passed as a value in a 1229 // register (or set of registers) according to the rules in 5.4, then the 1230 // result is returned in the same registers as would be used for such an 1231 // argument. 1232 // 1233 // Otherwise, the caller shall reserve a block of memory of sufficient 1234 // size and alignment to hold the result. The address of the memory block 1235 // shall be passed as an additional argument to the function in x8." 1236 // 1237 // This is implemented in two places. The register-return values are dealt 1238 // with here, more complex returns are passed as an sret parameter, which 1239 // means we don't have to worry about it during actual return. 1240 CCValAssign &VA = RVLocs[i]; 1241 assert(VA.isRegLoc() && "Only register-returns should be created by PCS"); 1242 1243 1244 SDValue Arg = OutVals[i]; 1245 1246 // There's no convenient note in the ABI about this as there is for normal 1247 // arguments, but it says return values are passed in the same registers as 1248 // an argument would be. I believe that includes the comments about 1249 // unspecified higher bits, putting the burden of widening on the *caller* 1250 // for return values. 1251 switch (VA.getLocInfo()) { 1252 default: llvm_unreachable("Unknown loc info"); 1253 case CCValAssign::Full: break; 1254 case CCValAssign::SExt: 1255 case CCValAssign::ZExt: 1256 case CCValAssign::AExt: 1257 // Floating-point values should only be extended when they're going into 1258 // memory, which can't happen here so an integer extend is acceptable. 1259 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 1260 break; 1261 case CCValAssign::BCvt: 1262 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1263 break; 1264 } 1265 1266 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 1267 Flag = Chain.getValue(1); 1268 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 1269 } 1270 1271 RetOps[0] = Chain; // Update chain. 1272 1273 // Add the flag if we have it. 1274 if (Flag.getNode()) 1275 RetOps.push_back(Flag); 1276 1277 return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other, 1278 &RetOps[0], RetOps.size()); 1279} 1280 1281SDValue 1282AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 1283 SmallVectorImpl<SDValue> &InVals) const { 1284 SelectionDAG &DAG = CLI.DAG; 1285 SDLoc &dl = CLI.DL; 1286 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1287 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1288 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1289 SDValue Chain = CLI.Chain; 1290 SDValue Callee = CLI.Callee; 1291 bool &IsTailCall = CLI.IsTailCall; 1292 CallingConv::ID CallConv = CLI.CallConv; 1293 bool IsVarArg = CLI.IsVarArg; 1294 1295 MachineFunction &MF = DAG.getMachineFunction(); 1296 AArch64MachineFunctionInfo *FuncInfo 1297 = MF.getInfo<AArch64MachineFunctionInfo>(); 1298 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 1299 bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet(); 1300 bool IsSibCall = false; 1301 1302 if (IsTailCall) { 1303 IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1304 IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1305 Outs, OutVals, Ins, DAG); 1306 1307 // A sibling call is one where we're under the usual C ABI and not planning 1308 // to change that but can still do a tail call: 1309 if (!TailCallOpt && IsTailCall) 1310 IsSibCall = true; 1311 } 1312 1313 SmallVector<CCValAssign, 16> ArgLocs; 1314 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), 1315 getTargetMachine(), ArgLocs, *DAG.getContext()); 1316 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1317 1318 // On AArch64 (and all other architectures I'm aware of) the most this has to 1319 // do is adjust the stack pointer. 1320 unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16); 1321 if (IsSibCall) { 1322 // Since we're not changing the ABI to make this a tail call, the memory 1323 // operands are already available in the caller's incoming argument space. 1324 NumBytes = 0; 1325 } 1326 1327 // FPDiff is the byte offset of the call's argument area from the callee's. 1328 // Stores to callee stack arguments will be placed in FixedStackSlots offset 1329 // by this amount for a tail call. In a sibling call it must be 0 because the 1330 // caller will deallocate the entire stack and the callee still expects its 1331 // arguments to begin at SP+0. Completely unused for non-tail calls. 1332 int FPDiff = 0; 1333 1334 if (IsTailCall && !IsSibCall) { 1335 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 1336 1337 // FPDiff will be negative if this tail call requires more space than we 1338 // would automatically have in our incoming argument space. Positive if we 1339 // can actually shrink the stack. 1340 FPDiff = NumReusableBytes - NumBytes; 1341 1342 // The stack pointer must be 16-byte aligned at all times it's used for a 1343 // memory operation, which in practice means at *all* times and in 1344 // particular across call boundaries. Therefore our own arguments started at 1345 // a 16-byte aligned SP and the delta applied for the tail call should 1346 // satisfy the same constraint. 1347 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 1348 } 1349 1350 if (!IsSibCall) 1351 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), 1352 dl); 1353 1354 SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP, 1355 getPointerTy()); 1356 1357 SmallVector<SDValue, 8> MemOpChains; 1358 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1359 1360 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1361 CCValAssign &VA = ArgLocs[i]; 1362 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1363 SDValue Arg = OutVals[i]; 1364 1365 // Callee does the actual widening, so all extensions just use an implicit 1366 // definition of the rest of the Loc. Aesthetically, this would be nicer as 1367 // an ANY_EXTEND, but that isn't valid for floating-point types and this 1368 // alternative works on integer types too. 1369 switch (VA.getLocInfo()) { 1370 default: llvm_unreachable("Unknown loc info!"); 1371 case CCValAssign::Full: break; 1372 case CCValAssign::SExt: 1373 case CCValAssign::ZExt: 1374 case CCValAssign::AExt: { 1375 unsigned SrcSize = VA.getValVT().getSizeInBits(); 1376 unsigned SrcSubReg; 1377 1378 switch (SrcSize) { 1379 case 8: SrcSubReg = AArch64::sub_8; break; 1380 case 16: SrcSubReg = AArch64::sub_16; break; 1381 case 32: SrcSubReg = AArch64::sub_32; break; 1382 case 64: SrcSubReg = AArch64::sub_64; break; 1383 default: llvm_unreachable("Unexpected argument promotion"); 1384 } 1385 1386 Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, 1387 VA.getLocVT(), 1388 DAG.getUNDEF(VA.getLocVT()), 1389 Arg, 1390 DAG.getTargetConstant(SrcSubReg, MVT::i32)), 1391 0); 1392 1393 break; 1394 } 1395 case CCValAssign::BCvt: 1396 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1397 break; 1398 } 1399 1400 if (VA.isRegLoc()) { 1401 // A normal register (sub-) argument. For now we just note it down because 1402 // we want to copy things into registers as late as possible to avoid 1403 // register-pressure (and possibly worse). 1404 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1405 continue; 1406 } 1407 1408 assert(VA.isMemLoc() && "unexpected argument location"); 1409 1410 SDValue DstAddr; 1411 MachinePointerInfo DstInfo; 1412 if (IsTailCall) { 1413 uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() : 1414 VA.getLocVT().getSizeInBits(); 1415 OpSize = (OpSize + 7) / 8; 1416 int32_t Offset = VA.getLocMemOffset() + FPDiff; 1417 int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 1418 1419 DstAddr = DAG.getFrameIndex(FI, getPointerTy()); 1420 DstInfo = MachinePointerInfo::getFixedStack(FI); 1421 1422 // Make sure any stack arguments overlapping with where we're storing are 1423 // loaded before this eventual operation. Otherwise they'll be clobbered. 1424 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 1425 } else { 1426 SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset()); 1427 1428 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1429 DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset()); 1430 } 1431 1432 if (Flags.isByVal()) { 1433 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64); 1434 SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode, 1435 Flags.getByValAlign(), 1436 /*isVolatile = */ false, 1437 /*alwaysInline = */ false, 1438 DstInfo, MachinePointerInfo(0)); 1439 MemOpChains.push_back(Cpy); 1440 } else { 1441 // Normal stack argument, put it where it's needed. 1442 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo, 1443 false, false, 0); 1444 MemOpChains.push_back(Store); 1445 } 1446 } 1447 1448 // The loads and stores generated above shouldn't clash with each 1449 // other. Combining them with this TokenFactor notes that fact for the rest of 1450 // the backend. 1451 if (!MemOpChains.empty()) 1452 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1453 &MemOpChains[0], MemOpChains.size()); 1454 1455 // Most of the rest of the instructions need to be glued together; we don't 1456 // want assignments to actual registers used by a call to be rearranged by a 1457 // well-meaning scheduler. 1458 SDValue InFlag; 1459 1460 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1461 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1462 RegsToPass[i].second, InFlag); 1463 InFlag = Chain.getValue(1); 1464 } 1465 1466 // The linker is responsible for inserting veneers when necessary to put a 1467 // function call destination in range, so we don't need to bother with a 1468 // wrapper here. 1469 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1470 const GlobalValue *GV = G->getGlobal(); 1471 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); 1472 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1473 const char *Sym = S->getSymbol(); 1474 Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy()); 1475 } 1476 1477 // We don't usually want to end the call-sequence here because we would tidy 1478 // the frame up *after* the call, however in the ABI-changing tail-call case 1479 // we've carefully laid out the parameters so that when sp is reset they'll be 1480 // in the correct location. 1481 if (IsTailCall && !IsSibCall) { 1482 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 1483 DAG.getIntPtrConstant(0, true), InFlag, dl); 1484 InFlag = Chain.getValue(1); 1485 } 1486 1487 // We produce the following DAG scheme for the actual call instruction: 1488 // (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag? 1489 // 1490 // Most arguments aren't going to be used and just keep the values live as 1491 // far as LLVM is concerned. It's expected to be selected as simply "bl 1492 // callee" (for a direct, non-tail call). 1493 std::vector<SDValue> Ops; 1494 Ops.push_back(Chain); 1495 Ops.push_back(Callee); 1496 1497 if (IsTailCall) { 1498 // Each tail call may have to adjust the stack by a different amount, so 1499 // this information must travel along with the operation for eventual 1500 // consumption by emitEpilogue. 1501 Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); 1502 } 1503 1504 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 1505 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 1506 RegsToPass[i].second.getValueType())); 1507 1508 1509 // Add a register mask operand representing the call-preserved registers. This 1510 // is used later in codegen to constrain register-allocation. 1511 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 1512 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 1513 assert(Mask && "Missing call preserved mask for calling convention"); 1514 Ops.push_back(DAG.getRegisterMask(Mask)); 1515 1516 // If we needed glue, put it in as the last argument. 1517 if (InFlag.getNode()) 1518 Ops.push_back(InFlag); 1519 1520 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 1521 1522 if (IsTailCall) { 1523 return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); 1524 } 1525 1526 Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size()); 1527 InFlag = Chain.getValue(1); 1528 1529 // Now we can reclaim the stack, just as well do it before working out where 1530 // our return value is. 1531 if (!IsSibCall) { 1532 uint64_t CalleePopBytes 1533 = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0; 1534 1535 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 1536 DAG.getIntPtrConstant(CalleePopBytes, true), 1537 InFlag, dl); 1538 InFlag = Chain.getValue(1); 1539 } 1540 1541 return LowerCallResult(Chain, InFlag, CallConv, 1542 IsVarArg, Ins, dl, DAG, InVals); 1543} 1544 1545SDValue 1546AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1547 CallingConv::ID CallConv, bool IsVarArg, 1548 const SmallVectorImpl<ISD::InputArg> &Ins, 1549 SDLoc dl, SelectionDAG &DAG, 1550 SmallVectorImpl<SDValue> &InVals) const { 1551 // Assign locations to each value returned by this call. 1552 SmallVector<CCValAssign, 16> RVLocs; 1553 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), 1554 getTargetMachine(), RVLocs, *DAG.getContext()); 1555 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv)); 1556 1557 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1558 CCValAssign VA = RVLocs[i]; 1559 1560 // Return values that are too big to fit into registers should use an sret 1561 // pointer, so this can be a lot simpler than the main argument code. 1562 assert(VA.isRegLoc() && "Memory locations not expected for call return"); 1563 1564 SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 1565 InFlag); 1566 Chain = Val.getValue(1); 1567 InFlag = Val.getValue(2); 1568 1569 switch (VA.getLocInfo()) { 1570 default: llvm_unreachable("Unknown loc info!"); 1571 case CCValAssign::Full: break; 1572 case CCValAssign::BCvt: 1573 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 1574 break; 1575 case CCValAssign::ZExt: 1576 case CCValAssign::SExt: 1577 case CCValAssign::AExt: 1578 // Floating-point arguments only get extended/truncated if they're going 1579 // in memory, so using the integer operation is acceptable here. 1580 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 1581 break; 1582 } 1583 1584 InVals.push_back(Val); 1585 } 1586 1587 return Chain; 1588} 1589 1590bool 1591AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 1592 CallingConv::ID CalleeCC, 1593 bool IsVarArg, 1594 bool IsCalleeStructRet, 1595 bool IsCallerStructRet, 1596 const SmallVectorImpl<ISD::OutputArg> &Outs, 1597 const SmallVectorImpl<SDValue> &OutVals, 1598 const SmallVectorImpl<ISD::InputArg> &Ins, 1599 SelectionDAG& DAG) const { 1600 1601 // For CallingConv::C this function knows whether the ABI needs 1602 // changing. That's not true for other conventions so they will have to opt in 1603 // manually. 1604 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1605 return false; 1606 1607 const MachineFunction &MF = DAG.getMachineFunction(); 1608 const Function *CallerF = MF.getFunction(); 1609 CallingConv::ID CallerCC = CallerF->getCallingConv(); 1610 bool CCMatch = CallerCC == CalleeCC; 1611 1612 // Byval parameters hand the function a pointer directly into the stack area 1613 // we want to reuse during a tail call. Working around this *is* possible (see 1614 // X86) but less efficient and uglier in LowerCall. 1615 for (Function::const_arg_iterator i = CallerF->arg_begin(), 1616 e = CallerF->arg_end(); i != e; ++i) 1617 if (i->hasByValAttr()) 1618 return false; 1619 1620 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 1621 if (IsTailCallConvention(CalleeCC) && CCMatch) 1622 return true; 1623 return false; 1624 } 1625 1626 // Now we search for cases where we can use a tail call without changing the 1627 // ABI. Sibcall is used in some places (particularly gcc) to refer to this 1628 // concept. 1629 1630 // I want anyone implementing a new calling convention to think long and hard 1631 // about this assert. 1632 assert((!IsVarArg || CalleeCC == CallingConv::C) 1633 && "Unexpected variadic calling convention"); 1634 1635 if (IsVarArg && !Outs.empty()) { 1636 // At least two cases here: if caller is fastcc then we can't have any 1637 // memory arguments (we'd be expected to clean up the stack afterwards). If 1638 // caller is C then we could potentially use its argument area. 1639 1640 // FIXME: for now we take the most conservative of these in both cases: 1641 // disallow all variadic memory operands. 1642 SmallVector<CCValAssign, 16> ArgLocs; 1643 CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), 1644 getTargetMachine(), ArgLocs, *DAG.getContext()); 1645 1646 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 1647 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 1648 if (!ArgLocs[i].isRegLoc()) 1649 return false; 1650 } 1651 1652 // If the calling conventions do not match, then we'd better make sure the 1653 // results are returned in the same way as what the caller expects. 1654 if (!CCMatch) { 1655 SmallVector<CCValAssign, 16> RVLocs1; 1656 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 1657 getTargetMachine(), RVLocs1, *DAG.getContext()); 1658 CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC)); 1659 1660 SmallVector<CCValAssign, 16> RVLocs2; 1661 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 1662 getTargetMachine(), RVLocs2, *DAG.getContext()); 1663 CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC)); 1664 1665 if (RVLocs1.size() != RVLocs2.size()) 1666 return false; 1667 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 1668 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 1669 return false; 1670 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 1671 return false; 1672 if (RVLocs1[i].isRegLoc()) { 1673 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 1674 return false; 1675 } else { 1676 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 1677 return false; 1678 } 1679 } 1680 } 1681 1682 // Nothing more to check if the callee is taking no arguments 1683 if (Outs.empty()) 1684 return true; 1685 1686 SmallVector<CCValAssign, 16> ArgLocs; 1687 CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), 1688 getTargetMachine(), ArgLocs, *DAG.getContext()); 1689 1690 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 1691 1692 const AArch64MachineFunctionInfo *FuncInfo 1693 = MF.getInfo<AArch64MachineFunctionInfo>(); 1694 1695 // If the stack arguments for this call would fit into our own save area then 1696 // the call can be made tail. 1697 return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); 1698} 1699 1700bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 1701 bool TailCallOpt) const { 1702 return CallCC == CallingConv::Fast && TailCallOpt; 1703} 1704 1705bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { 1706 return CallCC == CallingConv::Fast; 1707} 1708 1709SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 1710 SelectionDAG &DAG, 1711 MachineFrameInfo *MFI, 1712 int ClobberedFI) const { 1713 SmallVector<SDValue, 8> ArgChains; 1714 int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); 1715 int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; 1716 1717 // Include the original chain at the beginning of the list. When this is 1718 // used by target LowerCall hooks, this helps legalize find the 1719 // CALLSEQ_BEGIN node. 1720 ArgChains.push_back(Chain); 1721 1722 // Add a chain value for each stack argument corresponding 1723 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 1724 UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) 1725 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) 1726 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 1727 if (FI->getIndex() < 0) { 1728 int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); 1729 int64_t InLastByte = InFirstByte; 1730 InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; 1731 1732 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 1733 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 1734 ArgChains.push_back(SDValue(L, 1)); 1735 } 1736 1737 // Build a tokenfactor for all the chains. 1738 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, 1739 &ArgChains[0], ArgChains.size()); 1740} 1741 1742static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) { 1743 switch (CC) { 1744 case ISD::SETEQ: return A64CC::EQ; 1745 case ISD::SETGT: return A64CC::GT; 1746 case ISD::SETGE: return A64CC::GE; 1747 case ISD::SETLT: return A64CC::LT; 1748 case ISD::SETLE: return A64CC::LE; 1749 case ISD::SETNE: return A64CC::NE; 1750 case ISD::SETUGT: return A64CC::HI; 1751 case ISD::SETUGE: return A64CC::HS; 1752 case ISD::SETULT: return A64CC::LO; 1753 case ISD::SETULE: return A64CC::LS; 1754 default: llvm_unreachable("Unexpected condition code"); 1755 } 1756} 1757 1758bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const { 1759 // icmp is implemented using adds/subs immediate, which take an unsigned 1760 // 12-bit immediate, optionally shifted left by 12 bits. 1761 1762 // Symmetric by using adds/subs 1763 if (Val < 0) 1764 Val = -Val; 1765 1766 return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0; 1767} 1768 1769SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS, 1770 ISD::CondCode CC, SDValue &A64cc, 1771 SelectionDAG &DAG, SDLoc &dl) const { 1772 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1773 int64_t C = 0; 1774 EVT VT = RHSC->getValueType(0); 1775 bool knownInvalid = false; 1776 1777 // I'm not convinced the rest of LLVM handles these edge cases properly, but 1778 // we can at least get it right. 1779 if (isSignedIntSetCC(CC)) { 1780 C = RHSC->getSExtValue(); 1781 } else if (RHSC->getZExtValue() > INT64_MAX) { 1782 // A 64-bit constant not representable by a signed 64-bit integer is far 1783 // too big to fit into a SUBS immediate anyway. 1784 knownInvalid = true; 1785 } else { 1786 C = RHSC->getZExtValue(); 1787 } 1788 1789 if (!knownInvalid && !isLegalICmpImmediate(C)) { 1790 // Constant does not fit, try adjusting it by one? 1791 switch (CC) { 1792 default: break; 1793 case ISD::SETLT: 1794 case ISD::SETGE: 1795 if (isLegalICmpImmediate(C-1)) { 1796 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1797 RHS = DAG.getConstant(C-1, VT); 1798 } 1799 break; 1800 case ISD::SETULT: 1801 case ISD::SETUGE: 1802 if (isLegalICmpImmediate(C-1)) { 1803 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1804 RHS = DAG.getConstant(C-1, VT); 1805 } 1806 break; 1807 case ISD::SETLE: 1808 case ISD::SETGT: 1809 if (isLegalICmpImmediate(C+1)) { 1810 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 1811 RHS = DAG.getConstant(C+1, VT); 1812 } 1813 break; 1814 case ISD::SETULE: 1815 case ISD::SETUGT: 1816 if (isLegalICmpImmediate(C+1)) { 1817 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 1818 RHS = DAG.getConstant(C+1, VT); 1819 } 1820 break; 1821 } 1822 } 1823 } 1824 1825 A64CC::CondCodes CondCode = IntCCToA64CC(CC); 1826 A64cc = DAG.getConstant(CondCode, MVT::i32); 1827 return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 1828 DAG.getCondCode(CC)); 1829} 1830 1831static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC, 1832 A64CC::CondCodes &Alternative) { 1833 A64CC::CondCodes CondCode = A64CC::Invalid; 1834 Alternative = A64CC::Invalid; 1835 1836 switch (CC) { 1837 default: llvm_unreachable("Unknown FP condition!"); 1838 case ISD::SETEQ: 1839 case ISD::SETOEQ: CondCode = A64CC::EQ; break; 1840 case ISD::SETGT: 1841 case ISD::SETOGT: CondCode = A64CC::GT; break; 1842 case ISD::SETGE: 1843 case ISD::SETOGE: CondCode = A64CC::GE; break; 1844 case ISD::SETOLT: CondCode = A64CC::MI; break; 1845 case ISD::SETOLE: CondCode = A64CC::LS; break; 1846 case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break; 1847 case ISD::SETO: CondCode = A64CC::VC; break; 1848 case ISD::SETUO: CondCode = A64CC::VS; break; 1849 case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break; 1850 case ISD::SETUGT: CondCode = A64CC::HI; break; 1851 case ISD::SETUGE: CondCode = A64CC::PL; break; 1852 case ISD::SETLT: 1853 case ISD::SETULT: CondCode = A64CC::LT; break; 1854 case ISD::SETLE: 1855 case ISD::SETULE: CondCode = A64CC::LE; break; 1856 case ISD::SETNE: 1857 case ISD::SETUNE: CondCode = A64CC::NE; break; 1858 } 1859 return CondCode; 1860} 1861 1862SDValue 1863AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 1864 SDLoc DL(Op); 1865 EVT PtrVT = getPointerTy(); 1866 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 1867 1868 switch(getTargetMachine().getCodeModel()) { 1869 case CodeModel::Small: 1870 // The most efficient code is PC-relative anyway for the small memory model, 1871 // so we don't need to worry about relocation model. 1872 return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 1873 DAG.getTargetBlockAddress(BA, PtrVT, 0, 1874 AArch64II::MO_NO_FLAG), 1875 DAG.getTargetBlockAddress(BA, PtrVT, 0, 1876 AArch64II::MO_LO12), 1877 DAG.getConstant(/*Alignment=*/ 4, MVT::i32)); 1878 case CodeModel::Large: 1879 return DAG.getNode( 1880 AArch64ISD::WrapperLarge, DL, PtrVT, 1881 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3), 1882 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC), 1883 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC), 1884 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); 1885 default: 1886 llvm_unreachable("Only small and large code models supported now"); 1887 } 1888} 1889 1890 1891// (BRCOND chain, val, dest) 1892SDValue 1893AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 1894 SDLoc dl(Op); 1895 SDValue Chain = Op.getOperand(0); 1896 SDValue TheBit = Op.getOperand(1); 1897 SDValue DestBB = Op.getOperand(2); 1898 1899 // AArch64 BooleanContents is the default UndefinedBooleanContent, which means 1900 // that as the consumer we are responsible for ignoring rubbish in higher 1901 // bits. 1902 TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, 1903 DAG.getConstant(1, MVT::i32)); 1904 1905 SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, 1906 DAG.getConstant(0, TheBit.getValueType()), 1907 DAG.getCondCode(ISD::SETNE)); 1908 1909 return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain, 1910 A64CMP, DAG.getConstant(A64CC::NE, MVT::i32), 1911 DestBB); 1912} 1913 1914// (BR_CC chain, condcode, lhs, rhs, dest) 1915SDValue 1916AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 1917 SDLoc dl(Op); 1918 SDValue Chain = Op.getOperand(0); 1919 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 1920 SDValue LHS = Op.getOperand(2); 1921 SDValue RHS = Op.getOperand(3); 1922 SDValue DestBB = Op.getOperand(4); 1923 1924 if (LHS.getValueType() == MVT::f128) { 1925 // f128 comparisons are lowered to runtime calls by a routine which sets 1926 // LHS, RHS and CC appropriately for the rest of this function to continue. 1927 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 1928 1929 // If softenSetCCOperands returned a scalar, we need to compare the result 1930 // against zero to select between true and false values. 1931 if (RHS.getNode() == 0) { 1932 RHS = DAG.getConstant(0, LHS.getValueType()); 1933 CC = ISD::SETNE; 1934 } 1935 } 1936 1937 if (LHS.getValueType().isInteger()) { 1938 SDValue A64cc; 1939 1940 // Integers are handled in a separate function because the combinations of 1941 // immediates and tests can get hairy and we may want to fiddle things. 1942 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 1943 1944 return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1945 Chain, CmpOp, A64cc, DestBB); 1946 } 1947 1948 // Note that some LLVM floating-point CondCodes can't be lowered to a single 1949 // conditional branch, hence FPCCToA64CC can set a second test, where either 1950 // passing is sufficient. 1951 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 1952 CondCode = FPCCToA64CC(CC, Alternative); 1953 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 1954 SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 1955 DAG.getCondCode(CC)); 1956 SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1957 Chain, SetCC, A64cc, DestBB); 1958 1959 if (Alternative != A64CC::Invalid) { 1960 A64cc = DAG.getConstant(Alternative, MVT::i32); 1961 A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1962 A64BR_CC, SetCC, A64cc, DestBB); 1963 1964 } 1965 1966 return A64BR_CC; 1967} 1968 1969SDValue 1970AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG, 1971 RTLIB::Libcall Call) const { 1972 ArgListTy Args; 1973 ArgListEntry Entry; 1974 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { 1975 EVT ArgVT = Op.getOperand(i).getValueType(); 1976 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 1977 Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy; 1978 Entry.isSExt = false; 1979 Entry.isZExt = false; 1980 Args.push_back(Entry); 1981 } 1982 SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy()); 1983 1984 Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext()); 1985 1986 // By default, the input chain to this libcall is the entry node of the 1987 // function. If the libcall is going to be emitted as a tail call then 1988 // isUsedByReturnOnly will change it to the right chain if the return 1989 // node which is being folded has a non-entry input chain. 1990 SDValue InChain = DAG.getEntryNode(); 1991 1992 // isTailCall may be true since the callee does not reference caller stack 1993 // frame. Check if it's in the right position. 1994 SDValue TCChain = InChain; 1995 bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain); 1996 if (isTailCall) 1997 InChain = TCChain; 1998 1999 TargetLowering:: 2000 CallLoweringInfo CLI(InChain, RetTy, false, false, false, false, 2001 0, getLibcallCallingConv(Call), isTailCall, 2002 /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, 2003 Callee, Args, DAG, SDLoc(Op)); 2004 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 2005 2006 if (!CallInfo.second.getNode()) 2007 // It's a tailcall, return the chain (which is the DAG root). 2008 return DAG.getRoot(); 2009 2010 return CallInfo.first; 2011} 2012 2013SDValue 2014AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 2015 if (Op.getOperand(0).getValueType() != MVT::f128) { 2016 // It's legal except when f128 is involved 2017 return Op; 2018 } 2019 2020 RTLIB::Libcall LC; 2021 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 2022 2023 SDValue SrcVal = Op.getOperand(0); 2024 return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, 2025 /*isSigned*/ false, SDLoc(Op)).first; 2026} 2027 2028SDValue 2029AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 2030 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 2031 2032 RTLIB::Libcall LC; 2033 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 2034 2035 return LowerF128ToCall(Op, DAG, LC); 2036} 2037 2038SDValue 2039AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, 2040 bool IsSigned) const { 2041 if (Op.getOperand(0).getValueType() != MVT::f128) { 2042 // It's legal except when f128 is involved 2043 return Op; 2044 } 2045 2046 RTLIB::Libcall LC; 2047 if (IsSigned) 2048 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 2049 else 2050 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 2051 2052 return LowerF128ToCall(Op, DAG, LC); 2053} 2054 2055SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 2056 MachineFunction &MF = DAG.getMachineFunction(); 2057 MachineFrameInfo *MFI = MF.getFrameInfo(); 2058 MFI->setReturnAddressIsTaken(true); 2059 2060 EVT VT = Op.getValueType(); 2061 SDLoc dl(Op); 2062 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2063 if (Depth) { 2064 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 2065 SDValue Offset = DAG.getConstant(8, MVT::i64); 2066 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 2067 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 2068 MachinePointerInfo(), false, false, false, 0); 2069 } 2070 2071 // Return X30, which contains the return address. Mark it an implicit live-in. 2072 unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64)); 2073 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64); 2074} 2075 2076 2077SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) 2078 const { 2079 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 2080 MFI->setFrameAddressIsTaken(true); 2081 2082 EVT VT = Op.getValueType(); 2083 SDLoc dl(Op); 2084 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 2085 unsigned FrameReg = AArch64::X29; 2086 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 2087 while (Depth--) 2088 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 2089 MachinePointerInfo(), 2090 false, false, false, 0); 2091 return FrameAddr; 2092} 2093 2094SDValue 2095AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op, 2096 SelectionDAG &DAG) const { 2097 assert(getTargetMachine().getCodeModel() == CodeModel::Large); 2098 assert(getTargetMachine().getRelocationModel() == Reloc::Static); 2099 2100 EVT PtrVT = getPointerTy(); 2101 SDLoc dl(Op); 2102 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 2103 const GlobalValue *GV = GN->getGlobal(); 2104 2105 SDValue GlobalAddr = DAG.getNode( 2106 AArch64ISD::WrapperLarge, dl, PtrVT, 2107 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3), 2108 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC), 2109 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC), 2110 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); 2111 2112 if (GN->getOffset() != 0) 2113 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, 2114 DAG.getConstant(GN->getOffset(), PtrVT)); 2115 2116 return GlobalAddr; 2117} 2118 2119SDValue 2120AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op, 2121 SelectionDAG &DAG) const { 2122 assert(getTargetMachine().getCodeModel() == CodeModel::Small); 2123 2124 EVT PtrVT = getPointerTy(); 2125 SDLoc dl(Op); 2126 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 2127 const GlobalValue *GV = GN->getGlobal(); 2128 unsigned Alignment = GV->getAlignment(); 2129 Reloc::Model RelocM = getTargetMachine().getRelocationModel(); 2130 if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) { 2131 // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate 2132 // to zero when they remain undefined. In PIC mode the GOT can take care of 2133 // this, but in absolute mode we use a constant pool load. 2134 SDValue PoolAddr; 2135 PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 2136 DAG.getTargetConstantPool(GV, PtrVT, 0, 0, 2137 AArch64II::MO_NO_FLAG), 2138 DAG.getTargetConstantPool(GV, PtrVT, 0, 0, 2139 AArch64II::MO_LO12), 2140 DAG.getConstant(8, MVT::i32)); 2141 SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr, 2142 MachinePointerInfo::getConstantPool(), 2143 /*isVolatile=*/ false, 2144 /*isNonTemporal=*/ true, 2145 /*isInvariant=*/ true, 8); 2146 if (GN->getOffset() != 0) 2147 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, 2148 DAG.getConstant(GN->getOffset(), PtrVT)); 2149 2150 return GlobalAddr; 2151 } 2152 2153 if (Alignment == 0) { 2154 const PointerType *GVPtrTy = cast<PointerType>(GV->getType()); 2155 if (GVPtrTy->getElementType()->isSized()) { 2156 Alignment 2157 = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType()); 2158 } else { 2159 // Be conservative if we can't guess, not that it really matters: 2160 // functions and labels aren't valid for loads, and the methods used to 2161 // actually calculate an address work with any alignment. 2162 Alignment = 1; 2163 } 2164 } 2165 2166 unsigned char HiFixup, LoFixup; 2167 bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM); 2168 2169 if (UseGOT) { 2170 HiFixup = AArch64II::MO_GOT; 2171 LoFixup = AArch64II::MO_GOT_LO12; 2172 Alignment = 8; 2173 } else { 2174 HiFixup = AArch64II::MO_NO_FLAG; 2175 LoFixup = AArch64II::MO_LO12; 2176 } 2177 2178 // AArch64's small model demands the following sequence: 2179 // ADRP x0, somewhere 2180 // ADD x0, x0, #:lo12:somewhere ; (or LDR directly). 2181 SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 2182 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2183 HiFixup), 2184 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 2185 LoFixup), 2186 DAG.getConstant(Alignment, MVT::i32)); 2187 2188 if (UseGOT) { 2189 GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(), 2190 GlobalRef); 2191 } 2192 2193 if (GN->getOffset() != 0) 2194 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef, 2195 DAG.getConstant(GN->getOffset(), PtrVT)); 2196 2197 return GlobalRef; 2198} 2199 2200SDValue 2201AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op, 2202 SelectionDAG &DAG) const { 2203 // TableGen doesn't have easy access to the CodeModel or RelocationModel, so 2204 // we make those distinctions here. 2205 2206 switch (getTargetMachine().getCodeModel()) { 2207 case CodeModel::Small: 2208 return LowerGlobalAddressELFSmall(Op, DAG); 2209 case CodeModel::Large: 2210 return LowerGlobalAddressELFLarge(Op, DAG); 2211 default: 2212 llvm_unreachable("Only small and large code models supported now"); 2213 } 2214} 2215 2216SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, 2217 SDValue DescAddr, 2218 SDLoc DL, 2219 SelectionDAG &DAG) const { 2220 EVT PtrVT = getPointerTy(); 2221 2222 // The function we need to call is simply the first entry in the GOT for this 2223 // descriptor, load it in preparation. 2224 SDValue Func, Chain; 2225 Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), 2226 DescAddr); 2227 2228 // The function takes only one argument: the address of the descriptor itself 2229 // in X0. 2230 SDValue Glue; 2231 Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); 2232 Glue = Chain.getValue(1); 2233 2234 // Finally, there's a special calling-convention which means that the lookup 2235 // must preserve all registers (except X0, obviously). 2236 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 2237 const AArch64RegisterInfo *A64RI 2238 = static_cast<const AArch64RegisterInfo *>(TRI); 2239 const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask(); 2240 2241 // We're now ready to populate the argument list, as with a normal call: 2242 std::vector<SDValue> Ops; 2243 Ops.push_back(Chain); 2244 Ops.push_back(Func); 2245 Ops.push_back(SymAddr); 2246 Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); 2247 Ops.push_back(DAG.getRegisterMask(Mask)); 2248 Ops.push_back(Glue); 2249 2250 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2251 Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0], 2252 Ops.size()); 2253 Glue = Chain.getValue(1); 2254 2255 // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it 2256 // back to the generic handling code. 2257 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 2258} 2259 2260SDValue 2261AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 2262 SelectionDAG &DAG) const { 2263 assert(getSubtarget()->isTargetELF() && 2264 "TLS not implemented for non-ELF targets"); 2265 assert(getTargetMachine().getCodeModel() == CodeModel::Small 2266 && "TLS only supported in small memory model"); 2267 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 2268 2269 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 2270 2271 SDValue TPOff; 2272 EVT PtrVT = getPointerTy(); 2273 SDLoc DL(Op); 2274 const GlobalValue *GV = GA->getGlobal(); 2275 2276 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 2277 2278 if (Model == TLSModel::InitialExec) { 2279 TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2280 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2281 AArch64II::MO_GOTTPREL), 2282 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2283 AArch64II::MO_GOTTPREL_LO12), 2284 DAG.getConstant(8, MVT::i32)); 2285 TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), 2286 TPOff); 2287 } else if (Model == TLSModel::LocalExec) { 2288 SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2289 AArch64II::MO_TPREL_G1); 2290 SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2291 AArch64II::MO_TPREL_G0_NC); 2292 2293 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, 2294 DAG.getTargetConstant(1, MVT::i32)), 0); 2295 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, 2296 TPOff, LoVar, 2297 DAG.getTargetConstant(0, MVT::i32)), 0); 2298 } else if (Model == TLSModel::GeneralDynamic) { 2299 // Accesses used in this sequence go via the TLS descriptor which lives in 2300 // the GOT. Prepare an address we can use to handle this. 2301 SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2302 AArch64II::MO_TLSDESC); 2303 SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2304 AArch64II::MO_TLSDESC_LO12); 2305 SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2306 HiDesc, LoDesc, 2307 DAG.getConstant(8, MVT::i32)); 2308 SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0); 2309 2310 TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); 2311 } else if (Model == TLSModel::LocalDynamic) { 2312 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS 2313 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 2314 // the beginning of the module's TLS region, followed by a DTPREL offset 2315 // calculation. 2316 2317 // These accesses will need deduplicating if there's more than one. 2318 AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction() 2319 .getInfo<AArch64MachineFunctionInfo>(); 2320 MFI->incNumLocalDynamicTLSAccesses(); 2321 2322 2323 // Get the location of _TLS_MODULE_BASE_: 2324 SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 2325 AArch64II::MO_TLSDESC); 2326 SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 2327 AArch64II::MO_TLSDESC_LO12); 2328 SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2329 HiDesc, LoDesc, 2330 DAG.getConstant(8, MVT::i32)); 2331 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT); 2332 2333 ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); 2334 2335 // Get the variable's offset from _TLS_MODULE_BASE_ 2336 SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2337 AArch64II::MO_DTPREL_G1); 2338 SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2339 AArch64II::MO_DTPREL_G0_NC); 2340 2341 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, 2342 DAG.getTargetConstant(0, MVT::i32)), 0); 2343 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, 2344 TPOff, LoVar, 2345 DAG.getTargetConstant(0, MVT::i32)), 0); 2346 } else 2347 llvm_unreachable("Unsupported TLS access model"); 2348 2349 2350 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 2351} 2352 2353SDValue 2354AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, 2355 bool IsSigned) const { 2356 if (Op.getValueType() != MVT::f128) { 2357 // Legal for everything except f128. 2358 return Op; 2359 } 2360 2361 RTLIB::Libcall LC; 2362 if (IsSigned) 2363 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2364 else 2365 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2366 2367 return LowerF128ToCall(Op, DAG, LC); 2368} 2369 2370 2371SDValue 2372AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 2373 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 2374 SDLoc dl(JT); 2375 EVT PtrVT = getPointerTy(); 2376 2377 // When compiling PIC, jump tables get put in the code section so a static 2378 // relocation-style is acceptable for both cases. 2379 switch (getTargetMachine().getCodeModel()) { 2380 case CodeModel::Small: 2381 return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 2382 DAG.getTargetJumpTable(JT->getIndex(), PtrVT), 2383 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 2384 AArch64II::MO_LO12), 2385 DAG.getConstant(1, MVT::i32)); 2386 case CodeModel::Large: 2387 return DAG.getNode( 2388 AArch64ISD::WrapperLarge, dl, PtrVT, 2389 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3), 2390 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC), 2391 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC), 2392 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC)); 2393 default: 2394 llvm_unreachable("Only small and large code models supported now"); 2395 } 2396} 2397 2398// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) 2399SDValue 2400AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 2401 SDLoc dl(Op); 2402 SDValue LHS = Op.getOperand(0); 2403 SDValue RHS = Op.getOperand(1); 2404 SDValue IfTrue = Op.getOperand(2); 2405 SDValue IfFalse = Op.getOperand(3); 2406 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 2407 2408 if (LHS.getValueType() == MVT::f128) { 2409 // f128 comparisons are lowered to libcalls, but slot in nicely here 2410 // afterwards. 2411 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 2412 2413 // If softenSetCCOperands returned a scalar, we need to compare the result 2414 // against zero to select between true and false values. 2415 if (RHS.getNode() == 0) { 2416 RHS = DAG.getConstant(0, LHS.getValueType()); 2417 CC = ISD::SETNE; 2418 } 2419 } 2420 2421 if (LHS.getValueType().isInteger()) { 2422 SDValue A64cc; 2423 2424 // Integers are handled in a separate function because the combinations of 2425 // immediates and tests can get hairy and we may want to fiddle things. 2426 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 2427 2428 return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2429 CmpOp, IfTrue, IfFalse, A64cc); 2430 } 2431 2432 // Note that some LLVM floating-point CondCodes can't be lowered to a single 2433 // conditional branch, hence FPCCToA64CC can set a second test, where either 2434 // passing is sufficient. 2435 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 2436 CondCode = FPCCToA64CC(CC, Alternative); 2437 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 2438 SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 2439 DAG.getCondCode(CC)); 2440 SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, 2441 Op.getValueType(), 2442 SetCC, IfTrue, IfFalse, A64cc); 2443 2444 if (Alternative != A64CC::Invalid) { 2445 A64cc = DAG.getConstant(Alternative, MVT::i32); 2446 A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2447 SetCC, IfTrue, A64SELECT_CC, A64cc); 2448 2449 } 2450 2451 return A64SELECT_CC; 2452} 2453 2454// (SELECT testbit, iftrue, iffalse) 2455SDValue 2456AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 2457 SDLoc dl(Op); 2458 SDValue TheBit = Op.getOperand(0); 2459 SDValue IfTrue = Op.getOperand(1); 2460 SDValue IfFalse = Op.getOperand(2); 2461 2462 // AArch64 BooleanContents is the default UndefinedBooleanContent, which means 2463 // that as the consumer we are responsible for ignoring rubbish in higher 2464 // bits. 2465 TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, 2466 DAG.getConstant(1, MVT::i32)); 2467 SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, 2468 DAG.getConstant(0, TheBit.getValueType()), 2469 DAG.getCondCode(ISD::SETNE)); 2470 2471 return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2472 A64CMP, IfTrue, IfFalse, 2473 DAG.getConstant(A64CC::NE, MVT::i32)); 2474} 2475 2476static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) { 2477 SDLoc DL(Op); 2478 SDValue LHS = Op.getOperand(0); 2479 SDValue RHS = Op.getOperand(1); 2480 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 2481 EVT VT = Op.getValueType(); 2482 bool Invert = false; 2483 SDValue Op0, Op1; 2484 unsigned Opcode; 2485 2486 if (LHS.getValueType().isInteger()) { 2487 2488 // Attempt to use Vector Integer Compare Mask Test instruction. 2489 // TST = icmp ne (and (op0, op1), zero). 2490 if (CC == ISD::SETNE) { 2491 if (((LHS.getOpcode() == ISD::AND) && 2492 ISD::isBuildVectorAllZeros(RHS.getNode())) || 2493 ((RHS.getOpcode() == ISD::AND) && 2494 ISD::isBuildVectorAllZeros(LHS.getNode()))) { 2495 2496 SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS; 2497 SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0)); 2498 SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1)); 2499 return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS); 2500 } 2501 } 2502 2503 // Attempt to use Vector Integer Compare Mask against Zero instr (Signed). 2504 // Note: Compare against Zero does not support unsigned predicates. 2505 if ((ISD::isBuildVectorAllZeros(RHS.getNode()) || 2506 ISD::isBuildVectorAllZeros(LHS.getNode())) && 2507 !isUnsignedIntSetCC(CC)) { 2508 2509 // If LHS is the zero value, swap operands and CondCode. 2510 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 2511 CC = getSetCCSwappedOperands(CC); 2512 Op0 = RHS; 2513 } else 2514 Op0 = LHS; 2515 2516 // Ensure valid CondCode for Compare Mask against Zero instruction: 2517 // EQ, GE, GT, LE, LT. 2518 if (ISD::SETNE == CC) { 2519 Invert = true; 2520 CC = ISD::SETEQ; 2521 } 2522 2523 // Using constant type to differentiate integer and FP compares with zero. 2524 Op1 = DAG.getConstant(0, MVT::i32); 2525 Opcode = AArch64ISD::NEON_CMPZ; 2526 2527 } else { 2528 // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned). 2529 // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT. 2530 bool Swap = false; 2531 switch (CC) { 2532 default: 2533 llvm_unreachable("Illegal integer comparison."); 2534 case ISD::SETEQ: 2535 case ISD::SETGT: 2536 case ISD::SETGE: 2537 case ISD::SETUGT: 2538 case ISD::SETUGE: 2539 break; 2540 case ISD::SETNE: 2541 Invert = true; 2542 CC = ISD::SETEQ; 2543 break; 2544 case ISD::SETULT: 2545 case ISD::SETULE: 2546 case ISD::SETLT: 2547 case ISD::SETLE: 2548 Swap = true; 2549 CC = getSetCCSwappedOperands(CC); 2550 } 2551 2552 if (Swap) 2553 std::swap(LHS, RHS); 2554 2555 Opcode = AArch64ISD::NEON_CMP; 2556 Op0 = LHS; 2557 Op1 = RHS; 2558 } 2559 2560 // Generate Compare Mask instr or Compare Mask against Zero instr. 2561 SDValue NeonCmp = 2562 DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); 2563 2564 if (Invert) 2565 NeonCmp = DAG.getNOT(DL, NeonCmp, VT); 2566 2567 return NeonCmp; 2568 } 2569 2570 // Now handle Floating Point cases. 2571 // Attempt to use Vector Floating Point Compare Mask against Zero instruction. 2572 if (ISD::isBuildVectorAllZeros(RHS.getNode()) || 2573 ISD::isBuildVectorAllZeros(LHS.getNode())) { 2574 2575 // If LHS is the zero value, swap operands and CondCode. 2576 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 2577 CC = getSetCCSwappedOperands(CC); 2578 Op0 = RHS; 2579 } else 2580 Op0 = LHS; 2581 2582 // Using constant type to differentiate integer and FP compares with zero. 2583 Op1 = DAG.getConstantFP(0, MVT::f32); 2584 Opcode = AArch64ISD::NEON_CMPZ; 2585 } else { 2586 // Attempt to use Vector Floating Point Compare Mask instruction. 2587 Op0 = LHS; 2588 Op1 = RHS; 2589 Opcode = AArch64ISD::NEON_CMP; 2590 } 2591 2592 SDValue NeonCmpAlt; 2593 // Some register compares have to be implemented with swapped CC and operands, 2594 // e.g.: OLT implemented as OGT with swapped operands. 2595 bool SwapIfRegArgs = false; 2596 2597 // Ensure valid CondCode for FP Compare Mask against Zero instruction: 2598 // EQ, GE, GT, LE, LT. 2599 // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT. 2600 switch (CC) { 2601 default: 2602 llvm_unreachable("Illegal FP comparison"); 2603 case ISD::SETUNE: 2604 case ISD::SETNE: 2605 Invert = true; // Fallthrough 2606 case ISD::SETOEQ: 2607 case ISD::SETEQ: 2608 CC = ISD::SETEQ; 2609 break; 2610 case ISD::SETOLT: 2611 case ISD::SETLT: 2612 CC = ISD::SETLT; 2613 SwapIfRegArgs = true; 2614 break; 2615 case ISD::SETOGT: 2616 case ISD::SETGT: 2617 CC = ISD::SETGT; 2618 break; 2619 case ISD::SETOLE: 2620 case ISD::SETLE: 2621 CC = ISD::SETLE; 2622 SwapIfRegArgs = true; 2623 break; 2624 case ISD::SETOGE: 2625 case ISD::SETGE: 2626 CC = ISD::SETGE; 2627 break; 2628 case ISD::SETUGE: 2629 Invert = true; 2630 CC = ISD::SETLT; 2631 SwapIfRegArgs = true; 2632 break; 2633 case ISD::SETULE: 2634 Invert = true; 2635 CC = ISD::SETGT; 2636 break; 2637 case ISD::SETUGT: 2638 Invert = true; 2639 CC = ISD::SETLE; 2640 SwapIfRegArgs = true; 2641 break; 2642 case ISD::SETULT: 2643 Invert = true; 2644 CC = ISD::SETGE; 2645 break; 2646 case ISD::SETUEQ: 2647 Invert = true; // Fallthrough 2648 case ISD::SETONE: 2649 // Expand this to (OGT |OLT). 2650 NeonCmpAlt = 2651 DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT)); 2652 CC = ISD::SETLT; 2653 SwapIfRegArgs = true; 2654 break; 2655 case ISD::SETUO: 2656 Invert = true; // Fallthrough 2657 case ISD::SETO: 2658 // Expand this to (OGE | OLT). 2659 NeonCmpAlt = 2660 DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE)); 2661 CC = ISD::SETLT; 2662 SwapIfRegArgs = true; 2663 break; 2664 } 2665 2666 if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) { 2667 CC = getSetCCSwappedOperands(CC); 2668 std::swap(Op0, Op1); 2669 } 2670 2671 // Generate FP Compare Mask instr or FP Compare Mask against Zero instr 2672 SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); 2673 2674 if (NeonCmpAlt.getNode()) 2675 NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt); 2676 2677 if (Invert) 2678 NeonCmp = DAG.getNOT(DL, NeonCmp, VT); 2679 2680 return NeonCmp; 2681} 2682 2683// (SETCC lhs, rhs, condcode) 2684SDValue 2685AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 2686 SDLoc dl(Op); 2687 SDValue LHS = Op.getOperand(0); 2688 SDValue RHS = Op.getOperand(1); 2689 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 2690 EVT VT = Op.getValueType(); 2691 2692 if (VT.isVector()) 2693 return LowerVectorSETCC(Op, DAG); 2694 2695 if (LHS.getValueType() == MVT::f128) { 2696 // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS 2697 // for the rest of the function (some i32 or i64 values). 2698 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 2699 2700 // If softenSetCCOperands returned a scalar, use it. 2701 if (RHS.getNode() == 0) { 2702 assert(LHS.getValueType() == Op.getValueType() && 2703 "Unexpected setcc expansion!"); 2704 return LHS; 2705 } 2706 } 2707 2708 if (LHS.getValueType().isInteger()) { 2709 SDValue A64cc; 2710 2711 // Integers are handled in a separate function because the combinations of 2712 // immediates and tests can get hairy and we may want to fiddle things. 2713 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 2714 2715 return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, 2716 CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT), 2717 A64cc); 2718 } 2719 2720 // Note that some LLVM floating-point CondCodes can't be lowered to a single 2721 // conditional branch, hence FPCCToA64CC can set a second test, where either 2722 // passing is sufficient. 2723 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 2724 CondCode = FPCCToA64CC(CC, Alternative); 2725 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 2726 SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 2727 DAG.getCondCode(CC)); 2728 SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, 2729 CmpOp, DAG.getConstant(1, VT), 2730 DAG.getConstant(0, VT), A64cc); 2731 2732 if (Alternative != A64CC::Invalid) { 2733 A64cc = DAG.getConstant(Alternative, MVT::i32); 2734 A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, 2735 DAG.getConstant(1, VT), A64SELECT_CC, A64cc); 2736 } 2737 2738 return A64SELECT_CC; 2739} 2740 2741SDValue 2742AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 2743 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 2744 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 2745 2746 // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes 2747 // rather than just 8. 2748 return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), 2749 Op.getOperand(1), Op.getOperand(2), 2750 DAG.getConstant(32, MVT::i32), 8, false, false, 2751 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); 2752} 2753 2754SDValue 2755AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2756 // The layout of the va_list struct is specified in the AArch64 Procedure Call 2757 // Standard, section B.3. 2758 MachineFunction &MF = DAG.getMachineFunction(); 2759 AArch64MachineFunctionInfo *FuncInfo 2760 = MF.getInfo<AArch64MachineFunctionInfo>(); 2761 SDLoc DL(Op); 2762 2763 SDValue Chain = Op.getOperand(0); 2764 SDValue VAList = Op.getOperand(1); 2765 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2766 SmallVector<SDValue, 4> MemOps; 2767 2768 // void *__stack at offset 0 2769 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(), 2770 getPointerTy()); 2771 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 2772 MachinePointerInfo(SV), false, false, 0)); 2773 2774 // void *__gr_top at offset 8 2775 int GPRSize = FuncInfo->getVariadicGPRSize(); 2776 if (GPRSize > 0) { 2777 SDValue GRTop, GRTopAddr; 2778 2779 GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2780 DAG.getConstant(8, getPointerTy())); 2781 2782 GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy()); 2783 GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, 2784 DAG.getConstant(GPRSize, getPointerTy())); 2785 2786 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 2787 MachinePointerInfo(SV, 8), 2788 false, false, 0)); 2789 } 2790 2791 // void *__vr_top at offset 16 2792 int FPRSize = FuncInfo->getVariadicFPRSize(); 2793 if (FPRSize > 0) { 2794 SDValue VRTop, VRTopAddr; 2795 VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2796 DAG.getConstant(16, getPointerTy())); 2797 2798 VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy()); 2799 VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, 2800 DAG.getConstant(FPRSize, getPointerTy())); 2801 2802 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 2803 MachinePointerInfo(SV, 16), 2804 false, false, 0)); 2805 } 2806 2807 // int __gr_offs at offset 24 2808 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2809 DAG.getConstant(24, getPointerTy())); 2810 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), 2811 GROffsAddr, MachinePointerInfo(SV, 24), 2812 false, false, 0)); 2813 2814 // int __vr_offs at offset 28 2815 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2816 DAG.getConstant(28, getPointerTy())); 2817 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), 2818 VROffsAddr, MachinePointerInfo(SV, 28), 2819 false, false, 0)); 2820 2821 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], 2822 MemOps.size()); 2823} 2824 2825SDValue 2826AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2827 switch (Op.getOpcode()) { 2828 default: llvm_unreachable("Don't know how to custom lower this!"); 2829 case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128); 2830 case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128); 2831 case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128); 2832 case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128); 2833 case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true); 2834 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false); 2835 case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true); 2836 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false); 2837 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 2838 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 2839 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 2840 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 2841 2842 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 2843 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 2844 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 2845 case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG); 2846 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 2847 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 2848 case ISD::SELECT: return LowerSELECT(Op, DAG); 2849 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 2850 case ISD::SETCC: return LowerSETCC(Op, DAG); 2851 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 2852 case ISD::VASTART: return LowerVASTART(Op, DAG); 2853 case ISD::BUILD_VECTOR: 2854 return LowerBUILD_VECTOR(Op, DAG, getSubtarget()); 2855 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 2856 } 2857 2858 return SDValue(); 2859} 2860 2861/// Check if the specified splat value corresponds to a valid vector constant 2862/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI). If 2863/// so, return the encoded 8-bit immediate and the OpCmode instruction fields 2864/// values. 2865static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 2866 unsigned SplatBitSize, SelectionDAG &DAG, 2867 bool is128Bits, NeonModImmType type, EVT &VT, 2868 unsigned &Imm, unsigned &OpCmode) { 2869 switch (SplatBitSize) { 2870 default: 2871 llvm_unreachable("unexpected size for isNeonModifiedImm"); 2872 case 8: { 2873 if (type != Neon_Mov_Imm) 2874 return false; 2875 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 2876 // Neon movi per byte: Op=0, Cmode=1110. 2877 OpCmode = 0xe; 2878 Imm = SplatBits; 2879 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 2880 break; 2881 } 2882 case 16: { 2883 // Neon move inst per halfword 2884 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 2885 if ((SplatBits & ~0xff) == 0) { 2886 // Value = 0x00nn is 0x00nn LSL 0 2887 // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000 2888 // bic: Op=1, Cmode=1001; orr: Op=0, Cmode=1001 2889 // Op=x, Cmode=100y 2890 Imm = SplatBits; 2891 OpCmode = 0x8; 2892 break; 2893 } 2894 if ((SplatBits & ~0xff00) == 0) { 2895 // Value = 0xnn00 is 0x00nn LSL 8 2896 // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010 2897 // bic: Op=1, Cmode=1011; orr: Op=0, Cmode=1011 2898 // Op=x, Cmode=101x 2899 Imm = SplatBits >> 8; 2900 OpCmode = 0xa; 2901 break; 2902 } 2903 // can't handle any other 2904 return false; 2905 } 2906 2907 case 32: { 2908 // First the LSL variants (MSL is unusable by some interested instructions). 2909 2910 // Neon move instr per word, shift zeros 2911 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 2912 if ((SplatBits & ~0xff) == 0) { 2913 // Value = 0x000000nn is 0x000000nn LSL 0 2914 // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000 2915 // bic: Op=1, Cmode= 0001; orr: Op=0, Cmode= 0001 2916 // Op=x, Cmode=000x 2917 Imm = SplatBits; 2918 OpCmode = 0; 2919 break; 2920 } 2921 if ((SplatBits & ~0xff00) == 0) { 2922 // Value = 0x0000nn00 is 0x000000nn LSL 8 2923 // movi: Op=0, Cmode= 0010; mvni: Op=1, Cmode= 0010 2924 // bic: Op=1, Cmode= 0011; orr : Op=0, Cmode= 0011 2925 // Op=x, Cmode=001x 2926 Imm = SplatBits >> 8; 2927 OpCmode = 0x2; 2928 break; 2929 } 2930 if ((SplatBits & ~0xff0000) == 0) { 2931 // Value = 0x00nn0000 is 0x000000nn LSL 16 2932 // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100 2933 // bic: Op=1, Cmode= 0101; orr: Op=0, Cmode= 0101 2934 // Op=x, Cmode=010x 2935 Imm = SplatBits >> 16; 2936 OpCmode = 0x4; 2937 break; 2938 } 2939 if ((SplatBits & ~0xff000000) == 0) { 2940 // Value = 0xnn000000 is 0x000000nn LSL 24 2941 // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110 2942 // bic: Op=1, Cmode= 0111; orr: Op=0, Cmode= 0111 2943 // Op=x, Cmode=011x 2944 Imm = SplatBits >> 24; 2945 OpCmode = 0x6; 2946 break; 2947 } 2948 2949 // Now the MSL immediates. 2950 2951 // Neon move instr per word, shift ones 2952 if ((SplatBits & ~0xffff) == 0 && 2953 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 2954 // Value = 0x0000nnff is 0x000000nn MSL 8 2955 // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100 2956 // Op=x, Cmode=1100 2957 Imm = SplatBits >> 8; 2958 OpCmode = 0xc; 2959 break; 2960 } 2961 if ((SplatBits & ~0xffffff) == 0 && 2962 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 2963 // Value = 0x00nnffff is 0x000000nn MSL 16 2964 // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101 2965 // Op=x, Cmode=1101 2966 Imm = SplatBits >> 16; 2967 OpCmode = 0xd; 2968 break; 2969 } 2970 // can't handle any other 2971 return false; 2972 } 2973 2974 case 64: { 2975 if (type != Neon_Mov_Imm) 2976 return false; 2977 // Neon move instr bytemask, where each byte is either 0x00 or 0xff. 2978 // movi Op=1, Cmode=1110. 2979 OpCmode = 0x1e; 2980 uint64_t BitMask = 0xff; 2981 uint64_t Val = 0; 2982 unsigned ImmMask = 1; 2983 Imm = 0; 2984 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 2985 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 2986 Val |= BitMask; 2987 Imm |= ImmMask; 2988 } else if ((SplatBits & BitMask) != 0) { 2989 return false; 2990 } 2991 BitMask <<= 8; 2992 ImmMask <<= 1; 2993 } 2994 SplatBits = Val; 2995 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 2996 break; 2997 } 2998 } 2999 3000 return true; 3001} 3002 3003static SDValue PerformANDCombine(SDNode *N, 3004 TargetLowering::DAGCombinerInfo &DCI) { 3005 3006 SelectionDAG &DAG = DCI.DAG; 3007 SDLoc DL(N); 3008 EVT VT = N->getValueType(0); 3009 3010 // We're looking for an SRA/SHL pair which form an SBFX. 3011 3012 if (VT != MVT::i32 && VT != MVT::i64) 3013 return SDValue(); 3014 3015 if (!isa<ConstantSDNode>(N->getOperand(1))) 3016 return SDValue(); 3017 3018 uint64_t TruncMask = N->getConstantOperandVal(1); 3019 if (!isMask_64(TruncMask)) 3020 return SDValue(); 3021 3022 uint64_t Width = CountPopulation_64(TruncMask); 3023 SDValue Shift = N->getOperand(0); 3024 3025 if (Shift.getOpcode() != ISD::SRL) 3026 return SDValue(); 3027 3028 if (!isa<ConstantSDNode>(Shift->getOperand(1))) 3029 return SDValue(); 3030 uint64_t LSB = Shift->getConstantOperandVal(1); 3031 3032 if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) 3033 return SDValue(); 3034 3035 return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0), 3036 DAG.getConstant(LSB, MVT::i64), 3037 DAG.getConstant(LSB + Width - 1, MVT::i64)); 3038} 3039 3040/// For a true bitfield insert, the bits getting into that contiguous mask 3041/// should come from the low part of an existing value: they must be formed from 3042/// a compatible SHL operation (unless they're already low). This function 3043/// checks that condition and returns the least-significant bit that's 3044/// intended. If the operation not a field preparation, -1 is returned. 3045static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT, 3046 SDValue &MaskedVal, uint64_t Mask) { 3047 if (!isShiftedMask_64(Mask)) 3048 return -1; 3049 3050 // Now we need to alter MaskedVal so that it is an appropriate input for a BFI 3051 // instruction. BFI will do a left-shift by LSB before applying the mask we've 3052 // spotted, so in general we should pre-emptively "undo" that by making sure 3053 // the incoming bits have had a right-shift applied to them. 3054 // 3055 // This right shift, however, will combine with existing left/right shifts. In 3056 // the simplest case of a completely straight bitfield operation, it will be 3057 // expected to completely cancel out with an existing SHL. More complicated 3058 // cases (e.g. bitfield to bitfield copy) may still need a real shift before 3059 // the BFI. 3060 3061 uint64_t LSB = countTrailingZeros(Mask); 3062 int64_t ShiftRightRequired = LSB; 3063 if (MaskedVal.getOpcode() == ISD::SHL && 3064 isa<ConstantSDNode>(MaskedVal.getOperand(1))) { 3065 ShiftRightRequired -= MaskedVal.getConstantOperandVal(1); 3066 MaskedVal = MaskedVal.getOperand(0); 3067 } else if (MaskedVal.getOpcode() == ISD::SRL && 3068 isa<ConstantSDNode>(MaskedVal.getOperand(1))) { 3069 ShiftRightRequired += MaskedVal.getConstantOperandVal(1); 3070 MaskedVal = MaskedVal.getOperand(0); 3071 } 3072 3073 if (ShiftRightRequired > 0) 3074 MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal, 3075 DAG.getConstant(ShiftRightRequired, MVT::i64)); 3076 else if (ShiftRightRequired < 0) { 3077 // We could actually end up with a residual left shift, for example with 3078 // "struc.bitfield = val << 1". 3079 MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal, 3080 DAG.getConstant(-ShiftRightRequired, MVT::i64)); 3081 } 3082 3083 return LSB; 3084} 3085 3086/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by 3087/// a mask and an extension. Returns true if a BFI was found and provides 3088/// information on its surroundings. 3089static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask, 3090 bool &Extended) { 3091 Extended = false; 3092 if (N.getOpcode() == ISD::ZERO_EXTEND) { 3093 Extended = true; 3094 N = N.getOperand(0); 3095 } 3096 3097 if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) { 3098 Mask = N->getConstantOperandVal(1); 3099 N = N.getOperand(0); 3100 } else { 3101 // Mask is the whole width. 3102 Mask = -1ULL >> (64 - N.getValueType().getSizeInBits()); 3103 } 3104 3105 if (N.getOpcode() == AArch64ISD::BFI) { 3106 BFI = N; 3107 return true; 3108 } 3109 3110 return false; 3111} 3112 3113/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which 3114/// is roughly equivalent to (and (BFI ...), mask). This form is used because it 3115/// can often be further combined with a larger mask. Ultimately, we want mask 3116/// to be 2^32-1 or 2^64-1 so the AND can be skipped. 3117static SDValue tryCombineToBFI(SDNode *N, 3118 TargetLowering::DAGCombinerInfo &DCI, 3119 const AArch64Subtarget *Subtarget) { 3120 SelectionDAG &DAG = DCI.DAG; 3121 SDLoc DL(N); 3122 EVT VT = N->getValueType(0); 3123 3124 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 3125 3126 // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or 3127 // abandon the effort. 3128 SDValue LHS = N->getOperand(0); 3129 if (LHS.getOpcode() != ISD::AND) 3130 return SDValue(); 3131 3132 uint64_t LHSMask; 3133 if (isa<ConstantSDNode>(LHS.getOperand(1))) 3134 LHSMask = LHS->getConstantOperandVal(1); 3135 else 3136 return SDValue(); 3137 3138 // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask 3139 // is or abandon the effort. 3140 SDValue RHS = N->getOperand(1); 3141 if (RHS.getOpcode() != ISD::AND) 3142 return SDValue(); 3143 3144 uint64_t RHSMask; 3145 if (isa<ConstantSDNode>(RHS.getOperand(1))) 3146 RHSMask = RHS->getConstantOperandVal(1); 3147 else 3148 return SDValue(); 3149 3150 // Can't do anything if the masks are incompatible. 3151 if (LHSMask & RHSMask) 3152 return SDValue(); 3153 3154 // Now we need one of the masks to be a contiguous field. Without loss of 3155 // generality that should be the RHS one. 3156 SDValue Bitfield = LHS.getOperand(0); 3157 if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) { 3158 // We know that LHS is a candidate new value, and RHS isn't already a better 3159 // one. 3160 std::swap(LHS, RHS); 3161 std::swap(LHSMask, RHSMask); 3162 } 3163 3164 // We've done our best to put the right operands in the right places, all we 3165 // can do now is check whether a BFI exists. 3166 Bitfield = RHS.getOperand(0); 3167 int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask); 3168 if (LSB == -1) 3169 return SDValue(); 3170 3171 uint32_t Width = CountPopulation_64(RHSMask); 3172 assert(Width && "Expected non-zero bitfield width"); 3173 3174 SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, 3175 LHS.getOperand(0), Bitfield, 3176 DAG.getConstant(LSB, MVT::i64), 3177 DAG.getConstant(Width, MVT::i64)); 3178 3179 // Mask is trivial 3180 if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits()))) 3181 return BFI; 3182 3183 return DAG.getNode(ISD::AND, DL, VT, BFI, 3184 DAG.getConstant(LHSMask | RHSMask, VT)); 3185} 3186 3187/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its 3188/// original input. This is surprisingly common because SROA splits things up 3189/// into i8 chunks, so the originally detected MaskedBFI may actually only act 3190/// on the low (say) byte of a word. This is then orred into the rest of the 3191/// word afterwards. 3192/// 3193/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)). 3194/// 3195/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the 3196/// MaskedBFI. We can also deal with a certain amount of extend/truncate being 3197/// involved. 3198static SDValue tryCombineToLargerBFI(SDNode *N, 3199 TargetLowering::DAGCombinerInfo &DCI, 3200 const AArch64Subtarget *Subtarget) { 3201 SelectionDAG &DAG = DCI.DAG; 3202 SDLoc DL(N); 3203 EVT VT = N->getValueType(0); 3204 3205 // First job is to hunt for a MaskedBFI on either the left or right. Swap 3206 // operands if it's actually on the right. 3207 SDValue BFI; 3208 SDValue PossExtraMask; 3209 uint64_t ExistingMask = 0; 3210 bool Extended = false; 3211 if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended)) 3212 PossExtraMask = N->getOperand(1); 3213 else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended)) 3214 PossExtraMask = N->getOperand(0); 3215 else 3216 return SDValue(); 3217 3218 // We can only combine a BFI with another compatible mask. 3219 if (PossExtraMask.getOpcode() != ISD::AND || 3220 !isa<ConstantSDNode>(PossExtraMask.getOperand(1))) 3221 return SDValue(); 3222 3223 uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1); 3224 3225 // Masks must be compatible. 3226 if (ExtraMask & ExistingMask) 3227 return SDValue(); 3228 3229 SDValue OldBFIVal = BFI.getOperand(0); 3230 SDValue NewBFIVal = BFI.getOperand(1); 3231 if (Extended) { 3232 // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be 3233 // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments 3234 // need to be made compatible. 3235 assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32 3236 && "Invalid types for BFI"); 3237 OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal); 3238 NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal); 3239 } 3240 3241 // We need the MaskedBFI to be combined with a mask of the *same* value. 3242 if (PossExtraMask.getOperand(0) != OldBFIVal) 3243 return SDValue(); 3244 3245 BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, 3246 OldBFIVal, NewBFIVal, 3247 BFI.getOperand(2), BFI.getOperand(3)); 3248 3249 // If the masking is trivial, we don't need to create it. 3250 if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits()))) 3251 return BFI; 3252 3253 return DAG.getNode(ISD::AND, DL, VT, BFI, 3254 DAG.getConstant(ExtraMask | ExistingMask, VT)); 3255} 3256 3257/// An EXTR instruction is made up of two shifts, ORed together. This helper 3258/// searches for and classifies those shifts. 3259static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 3260 bool &FromHi) { 3261 if (N.getOpcode() == ISD::SHL) 3262 FromHi = false; 3263 else if (N.getOpcode() == ISD::SRL) 3264 FromHi = true; 3265 else 3266 return false; 3267 3268 if (!isa<ConstantSDNode>(N.getOperand(1))) 3269 return false; 3270 3271 ShiftAmount = N->getConstantOperandVal(1); 3272 Src = N->getOperand(0); 3273 return true; 3274} 3275 3276/// EXTR instruction extracts a contiguous chunk of bits from two existing 3277/// registers viewed as a high/low pair. This function looks for the pattern: 3278/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an 3279/// EXTR. Can't quite be done in TableGen because the two immediates aren't 3280/// independent. 3281static SDValue tryCombineToEXTR(SDNode *N, 3282 TargetLowering::DAGCombinerInfo &DCI) { 3283 SelectionDAG &DAG = DCI.DAG; 3284 SDLoc DL(N); 3285 EVT VT = N->getValueType(0); 3286 3287 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 3288 3289 if (VT != MVT::i32 && VT != MVT::i64) 3290 return SDValue(); 3291 3292 SDValue LHS; 3293 uint32_t ShiftLHS = 0; 3294 bool LHSFromHi = 0; 3295 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 3296 return SDValue(); 3297 3298 SDValue RHS; 3299 uint32_t ShiftRHS = 0; 3300 bool RHSFromHi = 0; 3301 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 3302 return SDValue(); 3303 3304 // If they're both trying to come from the high part of the register, they're 3305 // not really an EXTR. 3306 if (LHSFromHi == RHSFromHi) 3307 return SDValue(); 3308 3309 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 3310 return SDValue(); 3311 3312 if (LHSFromHi) { 3313 std::swap(LHS, RHS); 3314 std::swap(ShiftLHS, ShiftRHS); 3315 } 3316 3317 return DAG.getNode(AArch64ISD::EXTR, DL, VT, 3318 LHS, RHS, 3319 DAG.getConstant(ShiftRHS, MVT::i64)); 3320} 3321 3322/// Target-specific dag combine xforms for ISD::OR 3323static SDValue PerformORCombine(SDNode *N, 3324 TargetLowering::DAGCombinerInfo &DCI, 3325 const AArch64Subtarget *Subtarget) { 3326 3327 SelectionDAG &DAG = DCI.DAG; 3328 SDLoc DL(N); 3329 EVT VT = N->getValueType(0); 3330 3331 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 3332 return SDValue(); 3333 3334 // Attempt to recognise bitfield-insert operations. 3335 SDValue Res = tryCombineToBFI(N, DCI, Subtarget); 3336 if (Res.getNode()) 3337 return Res; 3338 3339 // Attempt to combine an existing MaskedBFI operation into one with a larger 3340 // mask. 3341 Res = tryCombineToLargerBFI(N, DCI, Subtarget); 3342 if (Res.getNode()) 3343 return Res; 3344 3345 Res = tryCombineToEXTR(N, DCI); 3346 if (Res.getNode()) 3347 return Res; 3348 3349 if (!Subtarget->hasNEON()) 3350 return SDValue(); 3351 3352 // Attempt to use vector immediate-form BSL 3353 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 3354 3355 SDValue N0 = N->getOperand(0); 3356 if (N0.getOpcode() != ISD::AND) 3357 return SDValue(); 3358 3359 SDValue N1 = N->getOperand(1); 3360 if (N1.getOpcode() != ISD::AND) 3361 return SDValue(); 3362 3363 if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 3364 APInt SplatUndef; 3365 unsigned SplatBitSize; 3366 bool HasAnyUndefs; 3367 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 3368 APInt SplatBits0; 3369 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 3370 HasAnyUndefs) && 3371 !HasAnyUndefs) { 3372 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 3373 APInt SplatBits1; 3374 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 3375 HasAnyUndefs) && 3376 !HasAnyUndefs && SplatBits0 == ~SplatBits1) { 3377 // Canonicalize the vector type to make instruction selection simpler. 3378 EVT CanonicalVT = VT.is128BitVector() ? MVT::v16i8 : MVT::v8i8; 3379 SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT, 3380 N0->getOperand(1), N0->getOperand(0), 3381 N1->getOperand(0)); 3382 return DAG.getNode(ISD::BITCAST, DL, VT, Result); 3383 } 3384 } 3385 } 3386 3387 return SDValue(); 3388} 3389 3390/// Target-specific dag combine xforms for ISD::SRA 3391static SDValue PerformSRACombine(SDNode *N, 3392 TargetLowering::DAGCombinerInfo &DCI) { 3393 3394 SelectionDAG &DAG = DCI.DAG; 3395 SDLoc DL(N); 3396 EVT VT = N->getValueType(0); 3397 3398 // We're looking for an SRA/SHL pair which form an SBFX. 3399 3400 if (VT != MVT::i32 && VT != MVT::i64) 3401 return SDValue(); 3402 3403 if (!isa<ConstantSDNode>(N->getOperand(1))) 3404 return SDValue(); 3405 3406 uint64_t ExtraSignBits = N->getConstantOperandVal(1); 3407 SDValue Shift = N->getOperand(0); 3408 3409 if (Shift.getOpcode() != ISD::SHL) 3410 return SDValue(); 3411 3412 if (!isa<ConstantSDNode>(Shift->getOperand(1))) 3413 return SDValue(); 3414 3415 uint64_t BitsOnLeft = Shift->getConstantOperandVal(1); 3416 uint64_t Width = VT.getSizeInBits() - ExtraSignBits; 3417 uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft; 3418 3419 if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) 3420 return SDValue(); 3421 3422 return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0), 3423 DAG.getConstant(LSB, MVT::i64), 3424 DAG.getConstant(LSB + Width - 1, MVT::i64)); 3425} 3426 3427/// Check if this is a valid build_vector for the immediate operand of 3428/// a vector shift operation, where all the elements of the build_vector 3429/// must have the same constant integer value. 3430static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 3431 // Ignore bit_converts. 3432 while (Op.getOpcode() == ISD::BITCAST) 3433 Op = Op.getOperand(0); 3434 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 3435 APInt SplatBits, SplatUndef; 3436 unsigned SplatBitSize; 3437 bool HasAnyUndefs; 3438 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, 3439 HasAnyUndefs, ElementBits) || 3440 SplatBitSize > ElementBits) 3441 return false; 3442 Cnt = SplatBits.getSExtValue(); 3443 return true; 3444} 3445 3446/// Check if this is a valid build_vector for the immediate operand of 3447/// a vector shift left operation. That value must be in the range: 3448/// 0 <= Value < ElementBits 3449static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) { 3450 assert(VT.isVector() && "vector shift count is not a vector type"); 3451 unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); 3452 if (!getVShiftImm(Op, ElementBits, Cnt)) 3453 return false; 3454 return (Cnt >= 0 && Cnt < ElementBits); 3455} 3456 3457/// Check if this is a valid build_vector for the immediate operand of a 3458/// vector shift right operation. The value must be in the range: 3459/// 1 <= Value <= ElementBits 3460static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) { 3461 assert(VT.isVector() && "vector shift count is not a vector type"); 3462 unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); 3463 if (!getVShiftImm(Op, ElementBits, Cnt)) 3464 return false; 3465 return (Cnt >= 1 && Cnt <= ElementBits); 3466} 3467 3468/// Checks for immediate versions of vector shifts and lowers them. 3469static SDValue PerformShiftCombine(SDNode *N, 3470 TargetLowering::DAGCombinerInfo &DCI, 3471 const AArch64Subtarget *ST) { 3472 SelectionDAG &DAG = DCI.DAG; 3473 EVT VT = N->getValueType(0); 3474 if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64)) 3475 return PerformSRACombine(N, DCI); 3476 3477 // Nothing to be done for scalar shifts. 3478 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 3479 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 3480 return SDValue(); 3481 3482 assert(ST->hasNEON() && "unexpected vector shift"); 3483 int64_t Cnt; 3484 3485 switch (N->getOpcode()) { 3486 default: 3487 llvm_unreachable("unexpected shift opcode"); 3488 3489 case ISD::SHL: 3490 if (isVShiftLImm(N->getOperand(1), VT, Cnt)) { 3491 SDValue RHS = 3492 DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT, 3493 DAG.getConstant(Cnt, MVT::i32)); 3494 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS); 3495 } 3496 break; 3497 3498 case ISD::SRA: 3499 case ISD::SRL: 3500 if (isVShiftRImm(N->getOperand(1), VT, Cnt)) { 3501 SDValue RHS = 3502 DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT, 3503 DAG.getConstant(Cnt, MVT::i32)); 3504 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS); 3505 } 3506 break; 3507 } 3508 3509 return SDValue(); 3510} 3511 3512/// ARM-specific DAG combining for intrinsics. 3513static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { 3514 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 3515 3516 switch (IntNo) { 3517 default: 3518 // Don't do anything for most intrinsics. 3519 break; 3520 3521 case Intrinsic::arm_neon_vqshifts: 3522 case Intrinsic::arm_neon_vqshiftu: 3523 EVT VT = N->getOperand(1).getValueType(); 3524 int64_t Cnt; 3525 if (!isVShiftLImm(N->getOperand(2), VT, Cnt)) 3526 break; 3527 unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts) 3528 ? AArch64ISD::NEON_QSHLs 3529 : AArch64ISD::NEON_QSHLu; 3530 return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0), 3531 N->getOperand(1), DAG.getConstant(Cnt, MVT::i32)); 3532 } 3533 3534 return SDValue(); 3535} 3536 3537/// Target-specific DAG combine function for NEON load/store intrinsics 3538/// to merge base address updates. 3539static SDValue CombineBaseUpdate(SDNode *N, 3540 TargetLowering::DAGCombinerInfo &DCI) { 3541 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 3542 return SDValue(); 3543 3544 SelectionDAG &DAG = DCI.DAG; 3545 bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 3546 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 3547 unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); 3548 SDValue Addr = N->getOperand(AddrOpIdx); 3549 3550 // Search for a use of the address operand that is an increment. 3551 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 3552 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 3553 SDNode *User = *UI; 3554 if (User->getOpcode() != ISD::ADD || 3555 UI.getUse().getResNo() != Addr.getResNo()) 3556 continue; 3557 3558 // Check that the add is independent of the load/store. Otherwise, folding 3559 // it would create a cycle. 3560 if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) 3561 continue; 3562 3563 // Find the new opcode for the updating load/store. 3564 bool isLoad = true; 3565 bool isLaneOp = false; 3566 unsigned NewOpc = 0; 3567 unsigned NumVecs = 0; 3568 if (isIntrinsic) { 3569 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 3570 switch (IntNo) { 3571 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 3572 case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD; 3573 NumVecs = 1; break; 3574 case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD; 3575 NumVecs = 2; break; 3576 case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD; 3577 NumVecs = 3; break; 3578 case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD; 3579 NumVecs = 4; break; 3580 case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD; 3581 NumVecs = 1; isLoad = false; break; 3582 case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD; 3583 NumVecs = 2; isLoad = false; break; 3584 case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD; 3585 NumVecs = 3; isLoad = false; break; 3586 case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD; 3587 NumVecs = 4; isLoad = false; break; 3588 case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD; 3589 NumVecs = 2; break; 3590 case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD; 3591 NumVecs = 3; break; 3592 case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD; 3593 NumVecs = 4; break; 3594 case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD; 3595 NumVecs = 2; isLoad = false; break; 3596 case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD; 3597 NumVecs = 3; isLoad = false; break; 3598 case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD; 3599 NumVecs = 4; isLoad = false; break; 3600 case Intrinsic::arm_neon_vld2lane: NewOpc = AArch64ISD::NEON_LD2LN_UPD; 3601 NumVecs = 2; isLaneOp = true; break; 3602 case Intrinsic::arm_neon_vld3lane: NewOpc = AArch64ISD::NEON_LD3LN_UPD; 3603 NumVecs = 3; isLaneOp = true; break; 3604 case Intrinsic::arm_neon_vld4lane: NewOpc = AArch64ISD::NEON_LD4LN_UPD; 3605 NumVecs = 4; isLaneOp = true; break; 3606 case Intrinsic::arm_neon_vst2lane: NewOpc = AArch64ISD::NEON_ST2LN_UPD; 3607 NumVecs = 2; isLoad = false; isLaneOp = true; break; 3608 case Intrinsic::arm_neon_vst3lane: NewOpc = AArch64ISD::NEON_ST3LN_UPD; 3609 NumVecs = 3; isLoad = false; isLaneOp = true; break; 3610 case Intrinsic::arm_neon_vst4lane: NewOpc = AArch64ISD::NEON_ST4LN_UPD; 3611 NumVecs = 4; isLoad = false; isLaneOp = true; break; 3612 } 3613 } else { 3614 isLaneOp = true; 3615 switch (N->getOpcode()) { 3616 default: llvm_unreachable("unexpected opcode for Neon base update"); 3617 case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD; 3618 NumVecs = 2; break; 3619 case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD; 3620 NumVecs = 3; break; 3621 case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD; 3622 NumVecs = 4; break; 3623 } 3624 } 3625 3626 // Find the size of memory referenced by the load/store. 3627 EVT VecTy; 3628 if (isLoad) 3629 VecTy = N->getValueType(0); 3630 else 3631 VecTy = N->getOperand(AddrOpIdx + 1).getValueType(); 3632 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 3633 if (isLaneOp) 3634 NumBytes /= VecTy.getVectorNumElements(); 3635 3636 // If the increment is a constant, it must match the memory ref size. 3637 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 3638 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 3639 uint32_t IncVal = CInc->getZExtValue(); 3640 if (IncVal != NumBytes) 3641 continue; 3642 Inc = DAG.getTargetConstant(IncVal, MVT::i32); 3643 } 3644 3645 // Create the new updating load/store node. 3646 EVT Tys[6]; 3647 unsigned NumResultVecs = (isLoad ? NumVecs : 0); 3648 unsigned n; 3649 for (n = 0; n < NumResultVecs; ++n) 3650 Tys[n] = VecTy; 3651 Tys[n++] = MVT::i64; 3652 Tys[n] = MVT::Other; 3653 SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2); 3654 SmallVector<SDValue, 8> Ops; 3655 Ops.push_back(N->getOperand(0)); // incoming chain 3656 Ops.push_back(N->getOperand(AddrOpIdx)); 3657 Ops.push_back(Inc); 3658 for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { 3659 Ops.push_back(N->getOperand(i)); 3660 } 3661 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 3662 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, 3663 Ops.data(), Ops.size(), 3664 MemInt->getMemoryVT(), 3665 MemInt->getMemOperand()); 3666 3667 // Update the uses. 3668 std::vector<SDValue> NewResults; 3669 for (unsigned i = 0; i < NumResultVecs; ++i) { 3670 NewResults.push_back(SDValue(UpdN.getNode(), i)); 3671 } 3672 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain 3673 DCI.CombineTo(N, NewResults); 3674 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 3675 3676 break; 3677 } 3678 return SDValue(); 3679} 3680 3681/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) 3682/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs. 3683/// If so, combine them to a vldN-dup operation and return true. 3684static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 3685 SelectionDAG &DAG = DCI.DAG; 3686 EVT VT = N->getValueType(0); 3687 3688 // Check if the VDUPLANE operand is a vldN-dup intrinsic. 3689 SDNode *VLD = N->getOperand(0).getNode(); 3690 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) 3691 return SDValue(); 3692 unsigned NumVecs = 0; 3693 unsigned NewOpc = 0; 3694 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue(); 3695 if (IntNo == Intrinsic::arm_neon_vld2lane) { 3696 NumVecs = 2; 3697 NewOpc = AArch64ISD::NEON_LD2DUP; 3698 } else if (IntNo == Intrinsic::arm_neon_vld3lane) { 3699 NumVecs = 3; 3700 NewOpc = AArch64ISD::NEON_LD3DUP; 3701 } else if (IntNo == Intrinsic::arm_neon_vld4lane) { 3702 NumVecs = 4; 3703 NewOpc = AArch64ISD::NEON_LD4DUP; 3704 } else { 3705 return SDValue(); 3706 } 3707 3708 // First check that all the vldN-lane uses are VDUPLANEs and that the lane 3709 // numbers match the load. 3710 unsigned VLDLaneNo = 3711 cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue(); 3712 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 3713 UI != UE; ++UI) { 3714 // Ignore uses of the chain result. 3715 if (UI.getUse().getResNo() == NumVecs) 3716 continue; 3717 SDNode *User = *UI; 3718 if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE || 3719 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue()) 3720 return SDValue(); 3721 } 3722 3723 // Create the vldN-dup node. 3724 EVT Tys[5]; 3725 unsigned n; 3726 for (n = 0; n < NumVecs; ++n) 3727 Tys[n] = VT; 3728 Tys[n] = MVT::Other; 3729 SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1); 3730 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; 3731 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD); 3732 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2, 3733 VLDMemInt->getMemoryVT(), 3734 VLDMemInt->getMemOperand()); 3735 3736 // Update the uses. 3737 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); 3738 UI != UE; ++UI) { 3739 unsigned ResNo = UI.getUse().getResNo(); 3740 // Ignore uses of the chain result. 3741 if (ResNo == NumVecs) 3742 continue; 3743 SDNode *User = *UI; 3744 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); 3745 } 3746 3747 // Now the vldN-lane intrinsic is dead except for its chain result. 3748 // Update uses of the chain. 3749 std::vector<SDValue> VLDDupResults; 3750 for (unsigned n = 0; n < NumVecs; ++n) 3751 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); 3752 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); 3753 DCI.CombineTo(VLD, VLDDupResults); 3754 3755 return SDValue(N, 0); 3756} 3757 3758SDValue 3759AArch64TargetLowering::PerformDAGCombine(SDNode *N, 3760 DAGCombinerInfo &DCI) const { 3761 switch (N->getOpcode()) { 3762 default: break; 3763 case ISD::AND: return PerformANDCombine(N, DCI); 3764 case ISD::OR: return PerformORCombine(N, DCI, getSubtarget()); 3765 case ISD::SHL: 3766 case ISD::SRA: 3767 case ISD::SRL: 3768 return PerformShiftCombine(N, DCI, getSubtarget()); 3769 case ISD::INTRINSIC_WO_CHAIN: 3770 return PerformIntrinsicCombine(N, DCI.DAG); 3771 case AArch64ISD::NEON_VDUPLANE: 3772 return CombineVLDDUP(N, DCI); 3773 case AArch64ISD::NEON_LD2DUP: 3774 case AArch64ISD::NEON_LD3DUP: 3775 case AArch64ISD::NEON_LD4DUP: 3776 return CombineBaseUpdate(N, DCI); 3777 case ISD::INTRINSIC_VOID: 3778 case ISD::INTRINSIC_W_CHAIN: 3779 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 3780 case Intrinsic::arm_neon_vld1: 3781 case Intrinsic::arm_neon_vld2: 3782 case Intrinsic::arm_neon_vld3: 3783 case Intrinsic::arm_neon_vld4: 3784 case Intrinsic::arm_neon_vst1: 3785 case Intrinsic::arm_neon_vst2: 3786 case Intrinsic::arm_neon_vst3: 3787 case Intrinsic::arm_neon_vst4: 3788 case Intrinsic::arm_neon_vld2lane: 3789 case Intrinsic::arm_neon_vld3lane: 3790 case Intrinsic::arm_neon_vld4lane: 3791 case Intrinsic::aarch64_neon_vld1x2: 3792 case Intrinsic::aarch64_neon_vld1x3: 3793 case Intrinsic::aarch64_neon_vld1x4: 3794 case Intrinsic::aarch64_neon_vst1x2: 3795 case Intrinsic::aarch64_neon_vst1x3: 3796 case Intrinsic::aarch64_neon_vst1x4: 3797 case Intrinsic::arm_neon_vst2lane: 3798 case Intrinsic::arm_neon_vst3lane: 3799 case Intrinsic::arm_neon_vst4lane: 3800 return CombineBaseUpdate(N, DCI); 3801 default: 3802 break; 3803 } 3804 } 3805 return SDValue(); 3806} 3807 3808bool 3809AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 3810 VT = VT.getScalarType(); 3811 3812 if (!VT.isSimple()) 3813 return false; 3814 3815 switch (VT.getSimpleVT().SimpleTy) { 3816 case MVT::f16: 3817 case MVT::f32: 3818 case MVT::f64: 3819 return true; 3820 case MVT::f128: 3821 return false; 3822 default: 3823 break; 3824 } 3825 3826 return false; 3827} 3828 3829// If this is a case we can't handle, return null and let the default 3830// expansion code take care of it. 3831SDValue 3832AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 3833 const AArch64Subtarget *ST) const { 3834 3835 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 3836 SDLoc DL(Op); 3837 EVT VT = Op.getValueType(); 3838 3839 APInt SplatBits, SplatUndef; 3840 unsigned SplatBitSize; 3841 bool HasAnyUndefs; 3842 3843 unsigned UseNeonMov = VT.getSizeInBits() >= 64; 3844 3845 // Note we favor lowering MOVI over MVNI. 3846 // This has implications on the definition of patterns in TableGen to select 3847 // BIC immediate instructions but not ORR immediate instructions. 3848 // If this lowering order is changed, TableGen patterns for BIC immediate and 3849 // ORR immediate instructions have to be updated. 3850 if (UseNeonMov && 3851 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 3852 if (SplatBitSize <= 64) { 3853 // First attempt to use vector immediate-form MOVI 3854 EVT NeonMovVT; 3855 unsigned Imm = 0; 3856 unsigned OpCmode = 0; 3857 3858 if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 3859 SplatBitSize, DAG, VT.is128BitVector(), 3860 Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) { 3861 SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); 3862 SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); 3863 3864 if (ImmVal.getNode() && OpCmodeVal.getNode()) { 3865 SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT, 3866 ImmVal, OpCmodeVal); 3867 return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); 3868 } 3869 } 3870 3871 // Then attempt to use vector immediate-form MVNI 3872 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 3873 if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, 3874 DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT, 3875 Imm, OpCmode)) { 3876 SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); 3877 SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); 3878 if (ImmVal.getNode() && OpCmodeVal.getNode()) { 3879 SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT, 3880 ImmVal, OpCmodeVal); 3881 return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); 3882 } 3883 } 3884 3885 // Attempt to use vector immediate-form FMOV 3886 if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) || 3887 (VT == MVT::v2f64 && SplatBitSize == 64)) { 3888 APFloat RealVal( 3889 SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble, 3890 SplatBits); 3891 uint32_t ImmVal; 3892 if (A64Imms::isFPImm(RealVal, ImmVal)) { 3893 SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32); 3894 return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val); 3895 } 3896 } 3897 } 3898 } 3899 3900 unsigned NumElts = VT.getVectorNumElements(); 3901 bool isOnlyLowElement = true; 3902 bool usesOnlyOneValue = true; 3903 bool hasDominantValue = false; 3904 bool isConstant = true; 3905 3906 // Map of the number of times a particular SDValue appears in the 3907 // element list. 3908 DenseMap<SDValue, unsigned> ValueCounts; 3909 SDValue Value; 3910 for (unsigned i = 0; i < NumElts; ++i) { 3911 SDValue V = Op.getOperand(i); 3912 if (V.getOpcode() == ISD::UNDEF) 3913 continue; 3914 if (i > 0) 3915 isOnlyLowElement = false; 3916 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 3917 isConstant = false; 3918 3919 ValueCounts.insert(std::make_pair(V, 0)); 3920 unsigned &Count = ValueCounts[V]; 3921 3922 // Is this value dominant? (takes up more than half of the lanes) 3923 if (++Count > (NumElts / 2)) { 3924 hasDominantValue = true; 3925 Value = V; 3926 } 3927 } 3928 if (ValueCounts.size() != 1) 3929 usesOnlyOneValue = false; 3930 if (!Value.getNode() && ValueCounts.size() > 0) 3931 Value = ValueCounts.begin()->first; 3932 3933 if (ValueCounts.size() == 0) 3934 return DAG.getUNDEF(VT); 3935 3936 // Loads are better lowered with insert_vector_elt. 3937 // Keep going if we are hitting this case. 3938 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 3939 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value); 3940 3941 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 3942 // Use VDUP for non-constant splats. 3943 if (hasDominantValue && EltSize <= 64) { 3944 if (!isConstant) { 3945 SDValue N; 3946 3947 // If we are DUPing a value that comes directly from a vector, we could 3948 // just use DUPLANE. We can only do this if the lane being extracted 3949 // is at a constant index, as the DUP from lane instructions only have 3950 // constant-index forms. 3951 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 3952 isa<ConstantSDNode>(Value->getOperand(1))) { 3953 N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, 3954 Value->getOperand(0), Value->getOperand(1)); 3955 } else 3956 N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); 3957 3958 if (!usesOnlyOneValue) { 3959 // The dominant value was splatted as 'N', but we now have to insert 3960 // all differing elements. 3961 for (unsigned I = 0; I < NumElts; ++I) { 3962 if (Op.getOperand(I) == Value) 3963 continue; 3964 SmallVector<SDValue, 3> Ops; 3965 Ops.push_back(N); 3966 Ops.push_back(Op.getOperand(I)); 3967 Ops.push_back(DAG.getConstant(I, MVT::i32)); 3968 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3); 3969 } 3970 } 3971 return N; 3972 } 3973 if (usesOnlyOneValue && isConstant) { 3974 return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); 3975 } 3976 } 3977 // If all elements are constants and the case above didn't get hit, fall back 3978 // to the default expansion, which will generate a load from the constant 3979 // pool. 3980 if (isConstant) 3981 return SDValue(); 3982 3983 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 3984 // know the default expansion would otherwise fall back on something even 3985 // worse. For a vector with one or two non-undef values, that's 3986 // scalar_to_vector for the elements followed by a shuffle (provided the 3987 // shuffle is valid for the target) and materialization element by element 3988 // on the stack followed by a load for everything else. 3989 if (!isConstant && !usesOnlyOneValue) { 3990 SDValue Vec = DAG.getUNDEF(VT); 3991 for (unsigned i = 0 ; i < NumElts; ++i) { 3992 SDValue V = Op.getOperand(i); 3993 if (V.getOpcode() == ISD::UNDEF) 3994 continue; 3995 SDValue LaneIdx = DAG.getConstant(i, MVT::i32); 3996 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx); 3997 } 3998 return Vec; 3999 } 4000 return SDValue(); 4001} 4002 4003/// isREVMask - Check if a vector shuffle corresponds to a REV 4004/// instruction with the specified blocksize. (The order of the elements 4005/// within each block of the vector is reversed.) 4006static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 4007 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && 4008 "Only possible block sizes for REV are: 16, 32, 64"); 4009 4010 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 4011 if (EltSz == 64) 4012 return false; 4013 4014 unsigned NumElts = VT.getVectorNumElements(); 4015 unsigned BlockElts = M[0] + 1; 4016 // If the first shuffle index is UNDEF, be optimistic. 4017 if (M[0] < 0) 4018 BlockElts = BlockSize / EltSz; 4019 4020 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 4021 return false; 4022 4023 for (unsigned i = 0; i < NumElts; ++i) { 4024 if (M[i] < 0) 4025 continue; // ignore UNDEF indices 4026 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) 4027 return false; 4028 } 4029 4030 return true; 4031} 4032 4033SDValue 4034AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 4035 SelectionDAG &DAG) const { 4036 SDValue V1 = Op.getOperand(0); 4037 SDValue V2 = Op.getOperand(1); 4038 SDLoc dl(Op); 4039 EVT VT = Op.getValueType(); 4040 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 4041 4042 // Convert shuffles that are directly supported on NEON to target-specific 4043 // DAG nodes, instead of keeping them as shuffles and matching them again 4044 // during code selection. This is more efficient and avoids the possibility 4045 // of inconsistencies between legalization and selection. 4046 ArrayRef<int> ShuffleMask = SVN->getMask(); 4047 4048 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 4049 if (EltSize > 64) 4050 return SDValue(); 4051 4052 if (isREVMask(ShuffleMask, VT, 64)) 4053 return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1); 4054 if (isREVMask(ShuffleMask, VT, 32)) 4055 return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1); 4056 if (isREVMask(ShuffleMask, VT, 16)) 4057 return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1); 4058 4059 // If the element of shuffle mask are all the same constant, we can 4060 // transform it into either NEON_VDUP or NEON_VDUPLANE 4061 if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { 4062 int Lane = SVN->getSplatIndex(); 4063 // If this is undef splat, generate it via "just" vdup, if possible. 4064 if (Lane == -1) Lane = 0; 4065 4066 // Test if V1 is a SCALAR_TO_VECTOR. 4067 if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4068 return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0)); 4069 } 4070 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR. 4071 if (V1.getOpcode() == ISD::BUILD_VECTOR) { 4072 bool IsScalarToVector = true; 4073 for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i) 4074 if (V1.getOperand(i).getOpcode() != ISD::UNDEF && 4075 i != (unsigned)Lane) { 4076 IsScalarToVector = false; 4077 break; 4078 } 4079 if (IsScalarToVector) 4080 return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, 4081 V1.getOperand(Lane)); 4082 } 4083 return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1, 4084 DAG.getConstant(Lane, MVT::i64)); 4085 } 4086 4087 int Length = ShuffleMask.size(); 4088 int V1EltNum = V1.getValueType().getVectorNumElements(); 4089 4090 // If the number of v1 elements is the same as the number of shuffle mask 4091 // element and the shuffle masks are sequential values, we can transform 4092 // it into NEON_VEXTRACT. 4093 if (V1EltNum == Length) { 4094 // Check if the shuffle mask is sequential. 4095 bool IsSequential = true; 4096 int CurMask = ShuffleMask[0]; 4097 for (int I = 0; I < Length; ++I) { 4098 if (ShuffleMask[I] != CurMask) { 4099 IsSequential = false; 4100 break; 4101 } 4102 CurMask++; 4103 } 4104 if (IsSequential) { 4105 assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect"); 4106 unsigned VecSize = EltSize * V1EltNum; 4107 unsigned Index = (EltSize/8) * ShuffleMask[0]; 4108 if (VecSize == 64 || VecSize == 128) 4109 return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2, 4110 DAG.getConstant(Index, MVT::i64)); 4111 } 4112 } 4113 4114 // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert 4115 // by element from V2 to V1 . 4116 // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a 4117 // better choice to be inserted than V1 as less insert needed, so we count 4118 // element to be inserted for both V1 and V2, and select less one as insert 4119 // target. 4120 4121 // Collect elements need to be inserted and their index. 4122 SmallVector<int, 8> NV1Elt; 4123 SmallVector<int, 8> N1Index; 4124 SmallVector<int, 8> NV2Elt; 4125 SmallVector<int, 8> N2Index; 4126 for (int I = 0; I != Length; ++I) { 4127 if (ShuffleMask[I] != I) { 4128 NV1Elt.push_back(ShuffleMask[I]); 4129 N1Index.push_back(I); 4130 } 4131 } 4132 for (int I = 0; I != Length; ++I) { 4133 if (ShuffleMask[I] != (I + V1EltNum)) { 4134 NV2Elt.push_back(ShuffleMask[I]); 4135 N2Index.push_back(I); 4136 } 4137 } 4138 4139 // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2 4140 // will be inserted. 4141 SDValue InsV = V1; 4142 SmallVector<int, 8> InsMasks = NV1Elt; 4143 SmallVector<int, 8> InsIndex = N1Index; 4144 if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) { 4145 if (NV1Elt.size() > NV2Elt.size()) { 4146 InsV = V2; 4147 InsMasks = NV2Elt; 4148 InsIndex = N2Index; 4149 } 4150 } else { 4151 InsV = DAG.getNode(ISD::UNDEF, dl, VT); 4152 } 4153 4154 for (int I = 0, E = InsMasks.size(); I != E; ++I) { 4155 SDValue ExtV = V1; 4156 int Mask = InsMasks[I]; 4157 if (Mask >= V1EltNum) { 4158 ExtV = V2; 4159 Mask -= V1EltNum; 4160 } 4161 // Any value type smaller than i32 is illegal in AArch64, and this lower 4162 // function is called after legalize pass, so we need to legalize 4163 // the result here. 4164 EVT EltVT; 4165 if (VT.getVectorElementType().isFloatingPoint()) 4166 EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32; 4167 else 4168 EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32; 4169 4170 ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV, 4171 DAG.getConstant(Mask, MVT::i64)); 4172 InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV, 4173 DAG.getConstant(InsIndex[I], MVT::i64)); 4174 } 4175 return InsV; 4176} 4177 4178AArch64TargetLowering::ConstraintType 4179AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { 4180 if (Constraint.size() == 1) { 4181 switch (Constraint[0]) { 4182 default: break; 4183 case 'w': // An FP/SIMD vector register 4184 return C_RegisterClass; 4185 case 'I': // Constant that can be used with an ADD instruction 4186 case 'J': // Constant that can be used with a SUB instruction 4187 case 'K': // Constant that can be used with a 32-bit logical instruction 4188 case 'L': // Constant that can be used with a 64-bit logical instruction 4189 case 'M': // Constant that can be used as a 32-bit MOV immediate 4190 case 'N': // Constant that can be used as a 64-bit MOV immediate 4191 case 'Y': // Floating point constant zero 4192 case 'Z': // Integer constant zero 4193 return C_Other; 4194 case 'Q': // A memory reference with base register and no offset 4195 return C_Memory; 4196 case 'S': // A symbolic address 4197 return C_Other; 4198 } 4199 } 4200 4201 // FIXME: Ump, Utf, Usa, Ush 4202 // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes, 4203 // whatever they may be 4204 // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be 4205 // Usa: An absolute symbolic address 4206 // Ush: The high part (bits 32:12) of a pc-relative symbolic address 4207 assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa" 4208 && Constraint != "Ush" && "Unimplemented constraints"); 4209 4210 return TargetLowering::getConstraintType(Constraint); 4211} 4212 4213TargetLowering::ConstraintWeight 4214AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info, 4215 const char *Constraint) const { 4216 4217 llvm_unreachable("Constraint weight unimplemented"); 4218} 4219 4220void 4221AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 4222 std::string &Constraint, 4223 std::vector<SDValue> &Ops, 4224 SelectionDAG &DAG) const { 4225 SDValue Result(0, 0); 4226 4227 // Only length 1 constraints are C_Other. 4228 if (Constraint.size() != 1) return; 4229 4230 // Only C_Other constraints get lowered like this. That means constants for us 4231 // so return early if there's no hope the constraint can be lowered. 4232 4233 switch(Constraint[0]) { 4234 default: break; 4235 case 'I': case 'J': case 'K': case 'L': 4236 case 'M': case 'N': case 'Z': { 4237 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 4238 if (!C) 4239 return; 4240 4241 uint64_t CVal = C->getZExtValue(); 4242 uint32_t Bits; 4243 4244 switch (Constraint[0]) { 4245 default: 4246 // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J' 4247 // is a peculiarly useless SUB constraint. 4248 llvm_unreachable("Unimplemented C_Other constraint"); 4249 case 'I': 4250 if (CVal <= 0xfff) 4251 break; 4252 return; 4253 case 'K': 4254 if (A64Imms::isLogicalImm(32, CVal, Bits)) 4255 break; 4256 return; 4257 case 'L': 4258 if (A64Imms::isLogicalImm(64, CVal, Bits)) 4259 break; 4260 return; 4261 case 'Z': 4262 if (CVal == 0) 4263 break; 4264 return; 4265 } 4266 4267 Result = DAG.getTargetConstant(CVal, Op.getValueType()); 4268 break; 4269 } 4270 case 'S': { 4271 // An absolute symbolic address or label reference. 4272 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { 4273 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), 4274 GA->getValueType(0)); 4275 } else if (const BlockAddressSDNode *BA 4276 = dyn_cast<BlockAddressSDNode>(Op)) { 4277 Result = DAG.getTargetBlockAddress(BA->getBlockAddress(), 4278 BA->getValueType(0)); 4279 } else if (const ExternalSymbolSDNode *ES 4280 = dyn_cast<ExternalSymbolSDNode>(Op)) { 4281 Result = DAG.getTargetExternalSymbol(ES->getSymbol(), 4282 ES->getValueType(0)); 4283 } else 4284 return; 4285 break; 4286 } 4287 case 'Y': 4288 if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) { 4289 if (CFP->isExactlyValue(0.0)) { 4290 Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0)); 4291 break; 4292 } 4293 } 4294 return; 4295 } 4296 4297 if (Result.getNode()) { 4298 Ops.push_back(Result); 4299 return; 4300 } 4301 4302 // It's an unknown constraint for us. Let generic code have a go. 4303 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 4304} 4305 4306std::pair<unsigned, const TargetRegisterClass*> 4307AArch64TargetLowering::getRegForInlineAsmConstraint( 4308 const std::string &Constraint, 4309 MVT VT) const { 4310 if (Constraint.size() == 1) { 4311 switch (Constraint[0]) { 4312 case 'r': 4313 if (VT.getSizeInBits() <= 32) 4314 return std::make_pair(0U, &AArch64::GPR32RegClass); 4315 else if (VT == MVT::i64) 4316 return std::make_pair(0U, &AArch64::GPR64RegClass); 4317 break; 4318 case 'w': 4319 if (VT == MVT::f16) 4320 return std::make_pair(0U, &AArch64::FPR16RegClass); 4321 else if (VT == MVT::f32) 4322 return std::make_pair(0U, &AArch64::FPR32RegClass); 4323 else if (VT.getSizeInBits() == 64) 4324 return std::make_pair(0U, &AArch64::FPR64RegClass); 4325 else if (VT.getSizeInBits() == 128) 4326 return std::make_pair(0U, &AArch64::FPR128RegClass); 4327 break; 4328 } 4329 } 4330 4331 // Use the default implementation in TargetLowering to convert the register 4332 // constraint into a member of a register class. 4333 return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 4334} 4335 4336/// Represent NEON load and store intrinsics as MemIntrinsicNodes. 4337/// The associated MachineMemOperands record the alignment specified 4338/// in the intrinsic calls. 4339bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 4340 const CallInst &I, 4341 unsigned Intrinsic) const { 4342 switch (Intrinsic) { 4343 case Intrinsic::arm_neon_vld1: 4344 case Intrinsic::arm_neon_vld2: 4345 case Intrinsic::arm_neon_vld3: 4346 case Intrinsic::arm_neon_vld4: 4347 case Intrinsic::aarch64_neon_vld1x2: 4348 case Intrinsic::aarch64_neon_vld1x3: 4349 case Intrinsic::aarch64_neon_vld1x4: 4350 case Intrinsic::arm_neon_vld2lane: 4351 case Intrinsic::arm_neon_vld3lane: 4352 case Intrinsic::arm_neon_vld4lane: { 4353 Info.opc = ISD::INTRINSIC_W_CHAIN; 4354 // Conservatively set memVT to the entire set of vectors loaded. 4355 uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; 4356 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 4357 Info.ptrVal = I.getArgOperand(0); 4358 Info.offset = 0; 4359 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 4360 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 4361 Info.vol = false; // volatile loads with NEON intrinsics not supported 4362 Info.readMem = true; 4363 Info.writeMem = false; 4364 return true; 4365 } 4366 case Intrinsic::arm_neon_vst1: 4367 case Intrinsic::arm_neon_vst2: 4368 case Intrinsic::arm_neon_vst3: 4369 case Intrinsic::arm_neon_vst4: 4370 case Intrinsic::aarch64_neon_vst1x2: 4371 case Intrinsic::aarch64_neon_vst1x3: 4372 case Intrinsic::aarch64_neon_vst1x4: 4373 case Intrinsic::arm_neon_vst2lane: 4374 case Intrinsic::arm_neon_vst3lane: 4375 case Intrinsic::arm_neon_vst4lane: { 4376 Info.opc = ISD::INTRINSIC_VOID; 4377 // Conservatively set memVT to the entire set of vectors stored. 4378 unsigned NumElts = 0; 4379 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 4380 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 4381 if (!ArgTy->isVectorTy()) 4382 break; 4383 NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; 4384 } 4385 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 4386 Info.ptrVal = I.getArgOperand(0); 4387 Info.offset = 0; 4388 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 4389 Info.align = cast<ConstantInt>(AlignArg)->getZExtValue(); 4390 Info.vol = false; // volatile stores with NEON intrinsics not supported 4391 Info.readMem = false; 4392 Info.writeMem = true; 4393 return true; 4394 } 4395 default: 4396 break; 4397 } 4398 4399 return false; 4400} 4401