//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that AArch64 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "aarch64-isel"
#include "AArch64.h"
#include "AArch64ISelLowering.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallingConv.h"

using namespace llvm;

static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();

  if (Subtarget->isTargetLinux())
    return new AArch64LinuxTargetObjectFile();
  if (Subtarget->isTargetELF())
    return new TargetLoweringObjectFileELF();
  llvm_unreachable("unknown subtarget type");
}

AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {

  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();

  // SIMD compares set the entire lane's bits to 1
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Scalar register <-> type mapping
  addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);

  if (Subtarget->hasNEON()) {
    // And the vectors
    addRegisterClass(MVT::v8i8, &AArch64::VPR64RegClass);
    addRegisterClass(MVT::v4i16, &AArch64::VPR64RegClass);
    addRegisterClass(MVT::v2i32, &AArch64::VPR64RegClass);
    addRegisterClass(MVT::v1i64, &AArch64::VPR64RegClass);
    addRegisterClass(MVT::v2f32, &AArch64::VPR64RegClass);
    addRegisterClass(MVT::v16i8, &AArch64::VPR128RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::VPR128RegClass);
    addRegisterClass(MVT::v4i32, &AArch64::VPR128RegClass);
    addRegisterClass(MVT::v2i64, &AArch64::VPR128RegClass);
    addRegisterClass(MVT::v4f32, &AArch64::VPR128RegClass);
    addRegisterClass(MVT::v2f64, &AArch64::VPR128RegClass);
  }

  computeRegisterProperties();

  // We combine OR nodes for bitfield and NEON BSL operations.
  setTargetDAGCombine(ISD::OR);

  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::SRA);

  // AArch64 does not have i1 loads, or much of anything for i1 really.
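  // The Promote action makes the legalizer widen these i1 extending loads
  // rather than asking the target to select them directly.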
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);

  setStackPointerRegisterToSaveRestore(AArch64::XSP);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);

  // We'll lower globals to wrappers for selection.
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);

  // A64 instructions have the comparison predicate attached to the user of the
  // result, but having a separate comparison is valuable for matching.
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::VAARG, MVT::Other, Expand);

  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);

  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);

  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  // Legal floating-point operations.
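  // Most of these map directly to single A64 instructions; in particular the
  // FRINT family covers ceil/floor/nearbyint/rint/trunc.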
  setOperationAction(ISD::FABS, MVT::f32, Legal);
  setOperationAction(ISD::FABS, MVT::f64, Legal);

  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
  setOperationAction(ISD::FCEIL, MVT::f64, Legal);

  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

  setOperationAction(ISD::FNEG, MVT::f32, Legal);
  setOperationAction(ISD::FNEG, MVT::f64, Legal);

  setOperationAction(ISD::FRINT, MVT::f32, Legal);
  setOperationAction(ISD::FRINT, MVT::f64, Legal);

  setOperationAction(ISD::FSQRT, MVT::f32, Legal);
  setOperationAction(ISD::FSQRT, MVT::f64, Legal);

  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);

  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f128, Legal);

  // Illegal floating-point operations.
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);

  setOperationAction(ISD::FEXP, MVT::f32, Expand);
  setOperationAction(ISD::FEXP, MVT::f64, Expand);

  setOperationAction(ISD::FEXP2, MVT::f32, Expand);
  setOperationAction(ISD::FEXP2, MVT::f64, Expand);

  setOperationAction(ISD::FLOG, MVT::f32, Expand);
  setOperationAction(ISD::FLOG, MVT::f64, Expand);

  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
  setOperationAction(ISD::FLOG2, MVT::f64, Expand);

  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
  setOperationAction(ISD::FLOG10, MVT::f64, Expand);

  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);

  setOperationAction(ISD::FPOWI, MVT::f32, Expand);
  setOperationAction(ISD::FPOWI, MVT::f64, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);

  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most
  // cases.
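  // The Custom arithmetic entries below are presumably lowered to the usual
  // f128 libcalls (e.g. __addtf3 for FADD) since there is no hardware support.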
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, Custom);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, Custom);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
  setOperationAction(ISD::FP_ROUND, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, Custom);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);

  // This prevents LLVM trying to compress double constants into a floating
  // constant-pool entry and trying to load from there. It's of doubtful benefit
  // for A64: we'd need LDR followed by FCVT, I believe.
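  // Marking these extending loads Expand keeps the load and any FP_EXTEND as
  // separate nodes rather than a combined extload.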
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);

  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);

  setExceptionPointerRegister(AArch64::X0);
  setExceptionSelectorRegister(AArch64::X1);

  if (Subtarget->hasNEON()) {
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);

    setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
    setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
  }
}

EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  // It's reasonably important that this value matches the "natural" legal
  // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
  // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
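  // For example, a scalar f64 setcc yields an i32 result, while a v2f64 setcc
  // yields v2i64 (one all-ones or all-zeros lane per element, as declared by
  // setBooleanVectorContents above).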
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
                                  unsigned &LdrOpc,
                                  unsigned &StrOpc) {
  static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword,
                                       AArch64::LDXR_word, AArch64::LDXR_dword};
  static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword,
                                      AArch64::LDAXR_word, AArch64::LDAXR_dword};
  static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword,
                                        AArch64::STXR_word, AArch64::STXR_dword};
  static const unsigned StoreRels[] = {AArch64::STLXR_byte, AArch64::STLXR_hword,
                                       AArch64::STLXR_word, AArch64::STLXR_dword};

  const unsigned *LoadOps, *StoreOps;
  if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
    LoadOps = LoadAcqs;
  else
    LoadOps = LoadBares;

  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
    StoreOps = StoreRels;
  else
    StoreOps = StoreBares;

  assert(isPowerOf2_32(Size) && Size <= 8 &&
         "unsupported size for atomic binary op!");

  LdrOpc = LoadOps[Log2_32(Size)];
  StrOpc = StoreOps[Log2_32(Size)];
}

MachineBasicBlock *
AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                        unsigned Size,
                                        unsigned BinOpcode) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator It = BB;
  ++It;

  unsigned dest = MI->getOperand(0).getReg();
  unsigned ptr = MI->getOperand(1).getReg();
  unsigned incr = MI->getOperand(2).getReg();
  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
  DebugLoc dl = MI->getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  unsigned ldrOpc, strOpc;
  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);

  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  const TargetRegisterClass *TRC
    = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
  unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);

  // thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // loopMBB:
  //   ldxr dest, ptr
  //   <binop> scratch, dest, incr
  //   stxr stxr_status, scratch, ptr
  //   cbnz stxr_status, loopMBB
  //   fallthrough --> exitMBB
  BB = loopMBB;
  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
  if (BinOpcode) {
    // All arithmetic operations we'll be creating are designed to take an extra
    // shift or extend operand, which we can conveniently set to zero.

    // Operand order needs to go the other way for NAND.
    if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl)
      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
        .addReg(incr).addReg(dest).addImm(0);
    else
      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
        .addReg(dest).addReg(incr).addImm(0);
  }

  // From the stxr, the register is GPR32; from the cmp it's GPR32wsp
  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);

  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
    .addReg(stxr_status).addMBB(loopMBB);

  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent(); // The instruction is gone now.

  return BB;
}

MachineBasicBlock *
AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
                                              MachineBasicBlock *BB,
                                              unsigned Size,
                                              unsigned CmpOp,
                                              A64CC::CondCodes Cond) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *MF = BB->getParent();
  MachineFunction::iterator It = BB;
  ++It;

  unsigned dest = MI->getOperand(0).getReg();
  unsigned ptr = MI->getOperand(1).getReg();
  unsigned incr = MI->getOperand(2).getReg();
  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());

  unsigned oldval = dest;
  DebugLoc dl = MI->getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  const TargetRegisterClass *TRC, *TRCsp;
  if (Size == 8) {
    TRC = &AArch64::GPR64RegClass;
    TRCsp = &AArch64::GPR64xspRegClass;
  } else {
    TRC = &AArch64::GPR32RegClass;
    TRCsp = &AArch64::GPR32wspRegClass;
  }

  unsigned ldrOpc, strOpc;
  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);

  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  unsigned scratch = MRI.createVirtualRegister(TRC);
  MRI.constrainRegClass(scratch, TRCsp);

  // thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // loopMBB:
  //   ldxr dest, ptr
  //   cmp incr, dest (, sign extend if necessary)
  //   csel scratch, dest, incr, cond
  //   stxr stxr_status, scratch, ptr
  //   cbnz stxr_status, loopMBB
  //   fallthrough --> exitMBB
  BB = loopMBB;
  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);

  // Build compare and cmov instructions.
  MRI.constrainRegClass(incr, TRCsp);
  BuildMI(BB, dl, TII->get(CmpOp))
    .addReg(incr).addReg(oldval).addImm(0);

  BuildMI(BB, dl, TII->get(Size == 8 ?
                           AArch64::CSELxxxc : AArch64::CSELwwwc),
          scratch)
    .addReg(oldval).addReg(incr).addImm(Cond);

  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);

  BuildMI(BB, dl, TII->get(strOpc), stxr_status)
    .addReg(scratch).addReg(ptr);
  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
    .addReg(stxr_status).addMBB(loopMBB);

  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent(); // The instruction is gone now.

  return BB;
}

MachineBasicBlock *
AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
                                         MachineBasicBlock *BB,
                                         unsigned Size) const {
  unsigned dest = MI->getOperand(0).getReg();
  unsigned ptr = MI->getOperand(1).getReg();
  unsigned oldval = MI->getOperand(2).getReg();
  unsigned newval = MI->getOperand(3).getReg();
  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc dl = MI->getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  const TargetRegisterClass *TRCsp;
  TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;

  unsigned ldrOpc, strOpc;
  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);

  MachineFunction *MF = BB->getParent();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It; // insert the new blocks after the current block

  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loop1MBB);
  MF->insert(It, loop2MBB);
  MF->insert(It, exitMBB);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  // thisMBB:
  //   ...
  //   fallthrough --> loop1MBB
  BB->addSuccessor(loop1MBB);

  // loop1MBB:
  //   ldxr dest, [ptr]
  //   cmp dest, oldval
  //   b.ne exitMBB
  BB = loop1MBB;
  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);

  unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
  MRI.constrainRegClass(dest, TRCsp);
  BuildMI(BB, dl, TII->get(CmpOp))
    .addReg(dest).addReg(oldval).addImm(0);
  BuildMI(BB, dl, TII->get(AArch64::Bcc))
    .addImm(A64CC::NE).addMBB(exitMBB);
  BB->addSuccessor(loop2MBB);
  BB->addSuccessor(exitMBB);

  // loop2MBB:
  //   stxr stxr_status, newval, [ptr]
  //   cbnz stxr_status, loop1MBB
  BB = loop2MBB;
  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);

  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
    .addReg(stxr_status).addMBB(loop1MBB);
  BB->addSuccessor(loop1MBB);
  BB->addSuccessor(exitMBB);

  // exitMBB:
  //   ...
  BB = exitMBB;

  MI->eraseFromParent(); // The instruction is gone now.

  return BB;
}

MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction using conditional branches
  // and loads, giving an instruction sequence like:
  //     str q0, [sp]
  //     b.ne IfTrue
  //     b Finish
  // IfTrue:
  //     str q1, [sp]
  // Finish:
  //     ldr q0, [sp]
  //
  // Using virtual registers would probably not be beneficial since COPY
  // instructions are expensive for f128 (there's no actual instruction to
  // implement them).
  //
  // An alternative would be to do an integer-CSEL on some address. E.g.:
  //     mov x0, sp
  //     add x1, sp, #16
  //     str q0, [x0]
  //     str q1, [x1]
  //     csel x0, x0, x1, ne
  //     ldr q0, [x0]
  //
  // It's unclear which approach is actually optimal.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  MachineFunction *MF = MBB->getParent();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction::iterator It = MBB;
  ++It;

  unsigned DestReg = MI->getOperand(0).getReg();
  unsigned IfTrueReg = MI->getOperand(1).getReg();
  unsigned IfFalseReg = MI->getOperand(2).getReg();
  unsigned CondCode = MI->getOperand(3).getImm();
  bool NZCVKilled = MI->getOperand(4).isKill();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, TrueBB);
  MF->insert(It, EndBB);

  // Transfer rest of current basic-block to EndBB
  EndBB->splice(EndBB->begin(), MBB,
                llvm::next(MachineBasicBlock::iterator(MI)),
                MBB->end());
  EndBB->transferSuccessorsAndUpdatePHIs(MBB);

  // We need somewhere to store the f128 value needed.
  int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16);

  // [... start of incoming MBB ...]
  //     str qIFFALSE, [sp]
  //     b.cc IfTrue
  //     b Done
  BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR))
    .addReg(IfFalseReg)
    .addFrameIndex(ScratchFI)
    .addImm(0);
  BuildMI(MBB, DL, TII->get(AArch64::Bcc))
    .addImm(CondCode)
    .addMBB(TrueBB);
  BuildMI(MBB, DL, TII->get(AArch64::Bimm))
    .addMBB(EndBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(EndBB);

  // IfTrue:
  //     str qIFTRUE, [sp]
  BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
    .addReg(IfTrueReg)
    .addFrameIndex(ScratchFI)
    .addImm(0);

  // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the
  // blocks.
  TrueBB->addSuccessor(EndBB);

  // Done:
  //     ldr qDEST, [sp]
  // [... rest of incoming MBB ...]
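  // NZCV was set before this pseudo and, unless the pseudo's use of it was a
  // kill, it may still be read by instructions now sitting in EndBB, so the
  // block's live-in list must say so.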
  if (!NZCVKilled)
    EndBB->addLiveIn(AArch64::NZCV);
  MachineInstr *StartOfEnd = EndBB->begin();
  BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
    .addFrameIndex(ScratchFI)
    .addImm(0);

  MI->eraseFromParent();
  return EndBB;
}

MachineBasicBlock *
AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                   MachineBasicBlock *MBB) const {
  switch (MI->getOpcode()) {
  default: llvm_unreachable("Unhandled instruction with custom inserter");
  case AArch64::F128CSEL:
    return EmitF128CSEL(MI, MBB);
  case AArch64::ATOMIC_LOAD_ADD_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl);
  case AArch64::ATOMIC_LOAD_ADD_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl);
  case AArch64::ATOMIC_LOAD_ADD_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl);
  case AArch64::ATOMIC_LOAD_ADD_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl);

  case AArch64::ATOMIC_LOAD_SUB_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl);
  case AArch64::ATOMIC_LOAD_SUB_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl);
  case AArch64::ATOMIC_LOAD_SUB_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl);
  case AArch64::ATOMIC_LOAD_SUB_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl);

  case AArch64::ATOMIC_LOAD_AND_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl);
  case AArch64::ATOMIC_LOAD_AND_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl);
  case AArch64::ATOMIC_LOAD_AND_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl);
  case AArch64::ATOMIC_LOAD_AND_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl);

  case AArch64::ATOMIC_LOAD_OR_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl);
  case AArch64::ATOMIC_LOAD_OR_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl);
  case AArch64::ATOMIC_LOAD_OR_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl);
  case AArch64::ATOMIC_LOAD_OR_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl);

  case AArch64::ATOMIC_LOAD_XOR_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl);
  case AArch64::ATOMIC_LOAD_XOR_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl);
  case AArch64::ATOMIC_LOAD_XOR_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl);
  case AArch64::ATOMIC_LOAD_XOR_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl);

  case AArch64::ATOMIC_LOAD_NAND_I8:
    return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl);
  case AArch64::ATOMIC_LOAD_NAND_I16:
    return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl);
  case AArch64::ATOMIC_LOAD_NAND_I32:
    return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl);
  case AArch64::ATOMIC_LOAD_NAND_I64:
    return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl);

  case AArch64::ATOMIC_LOAD_MIN_I8:
    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT);
  case AArch64::ATOMIC_LOAD_MIN_I16:
    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT);
  case AArch64::ATOMIC_LOAD_MIN_I32:
    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT);
  case AArch64::ATOMIC_LOAD_MIN_I64:
    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT);

  case AArch64::ATOMIC_LOAD_MAX_I8:
    return
        emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT);
  case AArch64::ATOMIC_LOAD_MAX_I16:
    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT);
  case AArch64::ATOMIC_LOAD_MAX_I32:
    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT);
  case AArch64::ATOMIC_LOAD_MAX_I64:
    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT);

  case AArch64::ATOMIC_LOAD_UMIN_I8:
    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI);
  case AArch64::ATOMIC_LOAD_UMIN_I16:
    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI);
  case AArch64::ATOMIC_LOAD_UMIN_I32:
    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI);
  case AArch64::ATOMIC_LOAD_UMIN_I64:
    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI);

  case AArch64::ATOMIC_LOAD_UMAX_I8:
    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO);
  case AArch64::ATOMIC_LOAD_UMAX_I16:
    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO);
  case AArch64::ATOMIC_LOAD_UMAX_I32:
    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO);
  case AArch64::ATOMIC_LOAD_UMAX_I64:
    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO);

  case AArch64::ATOMIC_SWAP_I8:
    return emitAtomicBinary(MI, MBB, 1, 0);
  case AArch64::ATOMIC_SWAP_I16:
    return emitAtomicBinary(MI, MBB, 2, 0);
  case AArch64::ATOMIC_SWAP_I32:
    return emitAtomicBinary(MI, MBB, 4, 0);
  case AArch64::ATOMIC_SWAP_I64:
    return emitAtomicBinary(MI, MBB, 8, 0);

  case AArch64::ATOMIC_CMP_SWAP_I8:
    return emitAtomicCmpSwap(MI, MBB, 1);
  case AArch64::ATOMIC_CMP_SWAP_I16:
    return emitAtomicCmpSwap(MI, MBB, 2);
  case AArch64::ATOMIC_CMP_SWAP_I32:
    return emitAtomicCmpSwap(MI, MBB, 4);
  case AArch64::ATOMIC_CMP_SWAP_I64:
    return emitAtomicCmpSwap(MI, MBB, 8);
  }
}

const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  case AArch64ISD::BR_CC:          return "AArch64ISD::BR_CC";
  case AArch64ISD::Call:           return "AArch64ISD::Call";
  case AArch64ISD::FPMOV:          return "AArch64ISD::FPMOV";
  case AArch64ISD::GOTLoad:        return "AArch64ISD::GOTLoad";
  case AArch64ISD::BFI:            return "AArch64ISD::BFI";
  case AArch64ISD::EXTR:           return "AArch64ISD::EXTR";
  case AArch64ISD::Ret:            return "AArch64ISD::Ret";
  case AArch64ISD::SBFX:           return "AArch64ISD::SBFX";
  case AArch64ISD::SELECT_CC:      return "AArch64ISD::SELECT_CC";
  case AArch64ISD::SETCC:          return "AArch64ISD::SETCC";
  case AArch64ISD::TC_RETURN:      return "AArch64ISD::TC_RETURN";
  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
  case AArch64ISD::TLSDESCCALL:    return "AArch64ISD::TLSDESCCALL";
  case AArch64ISD::WrapperLarge:   return "AArch64ISD::WrapperLarge";
  case AArch64ISD::WrapperSmall:   return "AArch64ISD::WrapperSmall";

  case AArch64ISD::NEON_BSL:
    return "AArch64ISD::NEON_BSL";
  case AArch64ISD::NEON_MOVIMM:
    return "AArch64ISD::NEON_MOVIMM";
  case AArch64ISD::NEON_MVNIMM:
    return "AArch64ISD::NEON_MVNIMM";
  case AArch64ISD::NEON_FMOVIMM:
    return "AArch64ISD::NEON_FMOVIMM";
  case AArch64ISD::NEON_CMP:
    return "AArch64ISD::NEON_CMP";
  case AArch64ISD::NEON_CMPZ:
    return "AArch64ISD::NEON_CMPZ";
  case AArch64ISD::NEON_TST:
    return "AArch64ISD::NEON_TST";
  default:
    return NULL;
  }
}

static const uint16_t AArch64FPRArgRegs[] = {
  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7
};
static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs);

static const uint16_t AArch64ArgRegs[] = {
  AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
  AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7
};
static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs);

static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                                 CCValAssign::LocInfo LocInfo,
                                 ISD::ArgFlagsTy ArgFlags, CCState &State) {
  // Mark all remaining general purpose registers as allocated. We don't
  // backtrack: if (for example) an i128 gets put on the stack, no subsequent
  // i64 will go in registers (C.11).
  for (unsigned i = 0; i < NumArgRegs; ++i)
    State.AllocateReg(AArch64ArgRegs[i]);

  return false;
}

#include "AArch64GenCallingConv.inc"

CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {

  switch(CC) {
  default: llvm_unreachable("Unsupported calling convention");
  case CallingConv::Fast:
  case CallingConv::C:
    return CC_A64_APCS;
  }
}

void
AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                           SDLoc DL, SDValue &Chain) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  AArch64MachineFunctionInfo *FuncInfo
    = MF.getInfo<AArch64MachineFunctionInfo>();

  SmallVector<SDValue, 8> MemOps;

  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs,
                                                         NumArgRegs);
  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs,
                                                         NumFPRArgRegs);

  unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR);
  int GPRIdx = 0;
  if (GPRSaveSize != 0) {
    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);

    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());

    for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) {
      unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
                                   MachinePointerInfo::getStack(i * 8),
                                   false, false, 0);
      MemOps.push_back(Store);
      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
                        DAG.getConstant(8, getPointerTy()));
    }
  }

  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
  int FPRIdx = 0;
  if (FPRSaveSize != 0) {
    FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);

    SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());

    for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
      unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
                                   &AArch64::FPR128RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
                                   MachinePointerInfo::getStack(i * 16),
                                   false, false, 0);
      MemOps.push_back(Store);
      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
                        DAG.getConstant(16, getPointerTy()));
    }
  }

  int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);

  FuncInfo->setVariadicStackIdx(StackIdx);
  FuncInfo->setVariadicGPRIdx(GPRIdx);
  FuncInfo->setVariadicGPRSize(GPRSaveSize);
  FuncInfo->setVariadicFPRIdx(FPRIdx);
  FuncInfo->setVariadicFPRSize(FPRSaveSize);

  if (!MemOps.empty()) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
                        MemOps.size());
  }
}

SDValue
AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
                                            CallingConv::ID CallConv, bool isVarArg,
                                            const SmallVectorImpl<ISD::InputArg> &Ins,
                                            SDLoc dl, SelectionDAG &DAG,
                                            SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64MachineFunctionInfo *FuncInfo
    = MF.getInfo<AArch64MachineFunctionInfo>();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));

  SmallVector<SDValue, 16> ArgValues;

  SDValue ArgValue;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    ISD::ArgFlagsTy Flags = Ins[i].Flags;

    if (Flags.isByVal()) {
      // Byval is used for small structs and HFAs in the PCS, but the system
      // should work in a non-compliant manner for larger structs.
      EVT PtrTy = getPointerTy();
      int Size = Flags.getByValSize();
      unsigned NumRegs = (Size + 7) / 8;

      unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
                                                 VA.getLocMemOffset(),
                                                 false);
      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
      InVals.push_back(FrameIdxN);

      continue;
    } else if (VA.isRegLoc()) {
      MVT RegVT = VA.getLocVT();
      const TargetRegisterClass *RC = getRegClassFor(RegVT);
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);

      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
    } else { // VA.isRegLoc()
      assert(VA.isMemLoc());

      int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                      VA.getLocMemOffset(), true);

      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
      ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
                             MachinePointerInfo::getFixedStack(FI),
                             false, false, false, 0);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
      break;
    case CCValAssign::SExt:
    case CCValAssign::ZExt:
    case CCValAssign::AExt: {
      unsigned DestSize = VA.getValVT().getSizeInBits();
      unsigned DestSubReg;

      switch (DestSize) {
      case 8: DestSubReg = AArch64::sub_8; break;
      case 16: DestSubReg = AArch64::sub_16; break;
      case 32: DestSubReg = AArch64::sub_32; break;
      case 64: DestSubReg = AArch64::sub_64; break;
      default: llvm_unreachable("Unexpected argument promotion");
      }

      ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
                                            VA.getValVT(), ArgValue,
                                            DAG.getTargetConstant(DestSubReg, MVT::i32)),
                         0);
      break;
    }
    }

    InVals.push_back(ArgValue);
  }

  if (isVarArg)
    SaveVarArgRegisters(CCInfo, DAG, dl, Chain);

  unsigned StackArgSize = CCInfo.getNextStackOffset();
  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
    // This is a non-standard ABI so by fiat I say we're allowed to make full
    // use of the stack area to be popped, which must be
    // aligned to 16 bytes in any case:
    StackArgSize = RoundUpToAlignment(StackArgSize, 16);

    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
    // a multiple of 16.
    FuncInfo->setArgumentStackToRestore(StackArgSize);

    // This realignment carries over to the available bytes below. Our own
    // callers will guarantee the space is free by giving an aligned value to
    // CALLSEQ_START.
  }
  // Even if we're not expected to free up the space, it's useful to know how
  // much is there while considering tail calls (because we can reuse it).
  FuncInfo->setBytesInStackArgArea(StackArgSize);

  return Chain;
}

SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   SDLoc dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), RVLocs, *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps(1, Chain);

  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    // PCS: "If the type, T, of the result of a function is such that
    // void func(T arg) would require that arg be passed as a value in a
    // register (or set of registers) according to the rules in 5.4, then the
    // result is returned in the same registers as would be used for such an
    // argument.
    //
    // Otherwise, the caller shall reserve a block of memory of sufficient
    // size and alignment to hold the result. The address of the memory block
    // shall be passed as an additional argument to the function in x8."
    //
    // This is implemented in two places. The register-return values are dealt
    // with here, more complex returns are passed as an sret parameter, which
    // means we don't have to worry about it during actual return.
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Only register-returns should be created by PCS");

    SDValue Arg = OutVals[i];

    // There's no convenient note in the ABI about this as there is for normal
    // arguments, but it says return values are passed in the same registers as
    // an argument would be. I believe that includes the comments about
    // unspecified higher bits, putting the burden of widening on the *caller*
    // for return values.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
    case CCValAssign::ZExt:
    case CCValAssign::AExt:
      // Floating-point values should only be extended when they're going into
      // memory, which can't happen here so an integer extend is acceptable.
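      // ANY_EXTEND leaves the widened bits unspecified, which matches the PCS
      // note above about unspecified higher bits in return registers.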
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
                     &RetOps[0], RetOps.size());
}

SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
                                 SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  AArch64MachineFunctionInfo *FuncInfo
    = MF.getInfo<AArch64MachineFunctionInfo>();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
  bool IsSibCall = false;

  if (IsTailCall) {
    IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
                    Outs, OutVals, Ins, DAG);

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));

  // On AArch64 (and all other architectures I'm aware of) the most this has to
  // do is adjust the stack pointer.
  unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16);
  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int FPDiff = 0;

  if (IsTailCall && !IsSibCall) {
    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

    // FPDiff will be negative if this tail call requires more space than we
    // would automatically have in our incoming argument space. Positive if we
    // can actually shrink the stack.
    FPDiff = NumReusableBytes - NumBytes;

    // The stack pointer must be 16-byte aligned at all times it's used for a
    // memory operation, which in practice means at *all* times and in
    // particular across call boundaries.
    // Therefore our own arguments started at a 16-byte aligned SP and the
    // delta applied for the tail call should satisfy the same constraint.
    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
  }

  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
                                 dl);

  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
                                        getPointerTy());

  SmallVector<SDValue, 8> MemOpChains;
  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    SDValue Arg = OutVals[i];

    // Callee does the actual widening, so all extensions just use an implicit
    // definition of the rest of the Loc. Aesthetically, this would be nicer as
    // an ANY_EXTEND, but that isn't valid for floating-point types and this
    // alternative works on integer types too.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
    case CCValAssign::ZExt:
    case CCValAssign::AExt: {
      unsigned SrcSize = VA.getValVT().getSizeInBits();
      unsigned SrcSubReg;

      switch (SrcSize) {
      case 8: SrcSubReg = AArch64::sub_8; break;
      case 16: SrcSubReg = AArch64::sub_16; break;
      case 32: SrcSubReg = AArch64::sub_32; break;
      case 64: SrcSubReg = AArch64::sub_64; break;
      default: llvm_unreachable("Unexpected argument promotion");
      }

      Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
                                       VA.getLocVT(),
                                       DAG.getUNDEF(VA.getLocVT()),
                                       Arg,
                                       DAG.getTargetConstant(SrcSubReg, MVT::i32)),
                    0);

      break;
    }
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.isRegLoc()) {
      // A normal register (sub-) argument. For now we just note it down because
      // we want to copy things into registers as late as possible to avoid
      // register-pressure (and possibly worse).
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      continue;
    }

    assert(VA.isMemLoc() && "unexpected argument location");

    SDValue DstAddr;
    MachinePointerInfo DstInfo;
    if (IsTailCall) {
      uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() :
                                          VA.getLocVT().getSizeInBits();
      OpSize = (OpSize + 7) / 8;
      int32_t Offset = VA.getLocMemOffset() + FPDiff;
      int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);

      DstAddr = DAG.getFrameIndex(FI, getPointerTy());
      DstInfo = MachinePointerInfo::getFixedStack(FI);

      // Make sure any stack arguments overlapping with where we're storing are
      // loaded before this eventual operation. Otherwise they'll be clobbered.
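      // (addTokenForArgument, defined further down in this file, returns a
      // TokenFactor of Chain and every load from a fixed stack slot that
      // overlaps the object at FI.)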
      Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
    } else {
      SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset());

      DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
      DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
    }

    if (Flags.isByVal()) {
      SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64);
      SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode,
                                  Flags.getByValAlign(),
                                  /*isVolatile = */ false,
                                  /*alwaysInline = */ false,
                                  DstInfo, MachinePointerInfo(0));
      MemOpChains.push_back(Cpy);
    } else {
      // Normal stack argument, put it where it's needed.
      SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo,
                                   false, false, 0);
      MemOpChains.push_back(Store);
    }
  }

  // The loads and stores generated above shouldn't clash with each
  // other. Combining them with this TokenFactor notes that fact for the rest of
  // the backend.
  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Most of the rest of the instructions need to be glued together; we don't
  // want assignments to actual registers used by a call to be rearranged by a
  // well-meaning scheduler.
  SDValue InFlag;

  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // The linker is responsible for inserting veneers when necessary to put a
  // function call destination in range, so we don't need to bother with a
  // wrapper here.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const char *Sym = S->getSymbol();
    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // We produce the following DAG scheme for the actual call instruction:
  //     (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag?)
  //
  // Most arguments aren't going to be used and just keep the values live as
  // far as LLVM is concerned. It's expected to be selected as simply "bl
  // callee" (for a direct, non-tail call).
  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
  }

  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers. This
  // is used later in codegen to constrain register-allocation.
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  // If we needed glue, put it in as the last argument.
  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  if (IsTailCall) {
    return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Now we can reclaim the stack, just as well do it before working out where
  // our return value is.
  if (!IsSibCall) {
    uint64_t CalleePopBytes
      = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0;

    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(CalleePopBytes, true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  return LowerCallResult(Chain, InFlag, CallConv,
                         IsVarArg, Ins, dl, DAG, InVals);
}

SDValue
AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                       CallingConv::ID CallConv, bool IsVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
                                       SDLoc dl, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv));

  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Return values that are too big to fit into registers should use an sret
    // pointer, so this can be a lot simpler than the main argument code.
    assert(VA.isRegLoc() && "Memory locations not expected for call return");

    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                                     InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
    case CCValAssign::SExt:
    case CCValAssign::AExt:
      // Floating-point arguments only get extended/truncated if they're going
      // in memory, so using the integer operation is acceptable here.
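      // The callee returned the value widened to LocVT; truncating recovers
      // the original ValVT.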
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

bool
AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                    CallingConv::ID CalleeCC,
                                    bool IsVarArg,
                                    bool IsCalleeStructRet,
                                    bool IsCallerStructRet,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SelectionDAG& DAG) const {

  // For CallingConv::C this function knows whether the ABI needs
  // changing. That's not true for other conventions so they will have to opt in
  // manually.
  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
    return false;

  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;

  // Byval parameters hand the function a pointer directly into the stack area
  // we want to reuse during a tail call. Working around this *is* possible (see
  // X86) but less efficient and uglier in LowerCall.
  for (Function::const_arg_iterator i = CallerF->arg_begin(),
         e = CallerF->arg_end(); i != e; ++i)
    if (i->hasByValAttr())
      return false;

  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Now we search for cases where we can use a tail call without changing the
  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
  // concept.

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  assert((!IsVarArg || CalleeCC == CallingConv::C)
         && "Unexpected variadic calling convention");

  if (IsVarArg && !Outs.empty()) {
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
                   getTargetMachine(), ArgLocs, *DAG.getContext());

    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
1507 if (!CCMatch) { 1508 SmallVector<CCValAssign, 16> RVLocs1; 1509 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 1510 getTargetMachine(), RVLocs1, *DAG.getContext()); 1511 CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC)); 1512 1513 SmallVector<CCValAssign, 16> RVLocs2; 1514 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 1515 getTargetMachine(), RVLocs2, *DAG.getContext()); 1516 CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC)); 1517 1518 if (RVLocs1.size() != RVLocs2.size()) 1519 return false; 1520 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 1521 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 1522 return false; 1523 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 1524 return false; 1525 if (RVLocs1[i].isRegLoc()) { 1526 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 1527 return false; 1528 } else { 1529 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 1530 return false; 1531 } 1532 } 1533 } 1534 1535 // Nothing more to check if the callee is taking no arguments 1536 if (Outs.empty()) 1537 return true; 1538 1539 SmallVector<CCValAssign, 16> ArgLocs; 1540 CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), 1541 getTargetMachine(), ArgLocs, *DAG.getContext()); 1542 1543 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 1544 1545 const AArch64MachineFunctionInfo *FuncInfo 1546 = MF.getInfo<AArch64MachineFunctionInfo>(); 1547 1548 // If the stack arguments for this call would fit into our own save area then 1549 // the call can be made tail. 1550 return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); 1551} 1552 1553bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 1554 bool TailCallOpt) const { 1555 return CallCC == CallingConv::Fast && TailCallOpt; 1556} 1557 1558bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { 1559 return CallCC == CallingConv::Fast; 1560} 1561 1562SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 1563 SelectionDAG &DAG, 1564 MachineFrameInfo *MFI, 1565 int ClobberedFI) const { 1566 SmallVector<SDValue, 8> ArgChains; 1567 int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); 1568 int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; 1569 1570 // Include the original chain at the beginning of the list. When this is 1571 // used by target LowerCall hooks, this helps legalize find the 1572 // CALLSEQ_BEGIN node. 1573 ArgChains.push_back(Chain); 1574 1575 // Add a chain value for each stack argument corresponding 1576 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 1577 UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) 1578 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) 1579 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 1580 if (FI->getIndex() < 0) { 1581 int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); 1582 int64_t InLastByte = InFirstByte; 1583 InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; 1584 1585 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 1586 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 1587 ArgChains.push_back(SDValue(L, 1)); 1588 } 1589 1590 // Build a tokenfactor for all the chains. 
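  // (Merging the chains orders every load from a stack slot we are about to
  // clobber before the stores that will overwrite it, without serialising
  // unrelated memory operations.)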
1591 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, 1592 &ArgChains[0], ArgChains.size()); 1593} 1594 1595static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) { 1596 switch (CC) { 1597 case ISD::SETEQ: return A64CC::EQ; 1598 case ISD::SETGT: return A64CC::GT; 1599 case ISD::SETGE: return A64CC::GE; 1600 case ISD::SETLT: return A64CC::LT; 1601 case ISD::SETLE: return A64CC::LE; 1602 case ISD::SETNE: return A64CC::NE; 1603 case ISD::SETUGT: return A64CC::HI; 1604 case ISD::SETUGE: return A64CC::HS; 1605 case ISD::SETULT: return A64CC::LO; 1606 case ISD::SETULE: return A64CC::LS; 1607 default: llvm_unreachable("Unexpected condition code"); 1608 } 1609} 1610 1611bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const { 1612 // icmp is implemented using adds/subs immediate, which take an unsigned 1613 // 12-bit immediate, optionally shifted left by 12 bits. 1614 1615 // Symmetric by using adds/subs 1616 if (Val < 0) 1617 Val = -Val; 1618 1619 return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0; 1620} 1621 1622SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS, 1623 ISD::CondCode CC, SDValue &A64cc, 1624 SelectionDAG &DAG, SDLoc &dl) const { 1625 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1626 int64_t C = 0; 1627 EVT VT = RHSC->getValueType(0); 1628 bool knownInvalid = false; 1629 1630 // I'm not convinced the rest of LLVM handles these edge cases properly, but 1631 // we can at least get it right. 1632 if (isSignedIntSetCC(CC)) { 1633 C = RHSC->getSExtValue(); 1634 } else if (RHSC->getZExtValue() > INT64_MAX) { 1635 // A 64-bit constant not representable by a signed 64-bit integer is far 1636 // too big to fit into a SUBS immediate anyway. 1637 knownInvalid = true; 1638 } else { 1639 C = RHSC->getZExtValue(); 1640 } 1641 1642 if (!knownInvalid && !isLegalICmpImmediate(C)) { 1643 // Constant does not fit, try adjusting it by one? 1644 switch (CC) { 1645 default: break; 1646 case ISD::SETLT: 1647 case ISD::SETGE: 1648 if (isLegalICmpImmediate(C-1)) { 1649 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1650 RHS = DAG.getConstant(C-1, VT); 1651 } 1652 break; 1653 case ISD::SETULT: 1654 case ISD::SETUGE: 1655 if (isLegalICmpImmediate(C-1)) { 1656 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1657 RHS = DAG.getConstant(C-1, VT); 1658 } 1659 break; 1660 case ISD::SETLE: 1661 case ISD::SETGT: 1662 if (isLegalICmpImmediate(C+1)) { 1663 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 1664 RHS = DAG.getConstant(C+1, VT); 1665 } 1666 break; 1667 case ISD::SETULE: 1668 case ISD::SETUGT: 1669 if (isLegalICmpImmediate(C+1)) { 1670 CC = (CC == ISD::SETULE) ? 
ISD::SETULT : ISD::SETUGE; 1671 RHS = DAG.getConstant(C+1, VT); 1672 } 1673 break; 1674 } 1675 } 1676 } 1677 1678 A64CC::CondCodes CondCode = IntCCToA64CC(CC); 1679 A64cc = DAG.getConstant(CondCode, MVT::i32); 1680 return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 1681 DAG.getCondCode(CC)); 1682} 1683 1684static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC, 1685 A64CC::CondCodes &Alternative) { 1686 A64CC::CondCodes CondCode = A64CC::Invalid; 1687 Alternative = A64CC::Invalid; 1688 1689 switch (CC) { 1690 default: llvm_unreachable("Unknown FP condition!"); 1691 case ISD::SETEQ: 1692 case ISD::SETOEQ: CondCode = A64CC::EQ; break; 1693 case ISD::SETGT: 1694 case ISD::SETOGT: CondCode = A64CC::GT; break; 1695 case ISD::SETGE: 1696 case ISD::SETOGE: CondCode = A64CC::GE; break; 1697 case ISD::SETOLT: CondCode = A64CC::MI; break; 1698 case ISD::SETOLE: CondCode = A64CC::LS; break; 1699 case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break; 1700 case ISD::SETO: CondCode = A64CC::VC; break; 1701 case ISD::SETUO: CondCode = A64CC::VS; break; 1702 case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break; 1703 case ISD::SETUGT: CondCode = A64CC::HI; break; 1704 case ISD::SETUGE: CondCode = A64CC::PL; break; 1705 case ISD::SETLT: 1706 case ISD::SETULT: CondCode = A64CC::LT; break; 1707 case ISD::SETLE: 1708 case ISD::SETULE: CondCode = A64CC::LE; break; 1709 case ISD::SETNE: 1710 case ISD::SETUNE: CondCode = A64CC::NE; break; 1711 } 1712 return CondCode; 1713} 1714 1715SDValue 1716AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 1717 SDLoc DL(Op); 1718 EVT PtrVT = getPointerTy(); 1719 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 1720 1721 switch(getTargetMachine().getCodeModel()) { 1722 case CodeModel::Small: 1723 // The most efficient code is PC-relative anyway for the small memory model, 1724 // so we don't need to worry about relocation model. 1725 return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 1726 DAG.getTargetBlockAddress(BA, PtrVT, 0, 1727 AArch64II::MO_NO_FLAG), 1728 DAG.getTargetBlockAddress(BA, PtrVT, 0, 1729 AArch64II::MO_LO12), 1730 DAG.getConstant(/*Alignment=*/ 4, MVT::i32)); 1731 case CodeModel::Large: 1732 return DAG.getNode( 1733 AArch64ISD::WrapperLarge, DL, PtrVT, 1734 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3), 1735 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC), 1736 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC), 1737 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); 1738 default: 1739 llvm_unreachable("Only small and large code models supported now"); 1740 } 1741} 1742 1743 1744// (BRCOND chain, val, dest) 1745SDValue 1746AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 1747 SDLoc dl(Op); 1748 SDValue Chain = Op.getOperand(0); 1749 SDValue TheBit = Op.getOperand(1); 1750 SDValue DestBB = Op.getOperand(2); 1751 1752 // AArch64 BooleanContents is the default UndefinedBooleanContent, which means 1753 // that as the consumer we are responsible for ignoring rubbish in higher 1754 // bits. 
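  // Mask off everything except bit 0 first: an i1 produced by, say, a
  // truncation may legitimately carry rubbish in bits [31:1], and only the low
  // bit decides the branch.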
1755 TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, 1756 DAG.getConstant(1, MVT::i32)); 1757 1758 SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, 1759 DAG.getConstant(0, TheBit.getValueType()), 1760 DAG.getCondCode(ISD::SETNE)); 1761 1762 return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain, 1763 A64CMP, DAG.getConstant(A64CC::NE, MVT::i32), 1764 DestBB); 1765} 1766 1767// (BR_CC chain, condcode, lhs, rhs, dest) 1768SDValue 1769AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 1770 SDLoc dl(Op); 1771 SDValue Chain = Op.getOperand(0); 1772 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 1773 SDValue LHS = Op.getOperand(2); 1774 SDValue RHS = Op.getOperand(3); 1775 SDValue DestBB = Op.getOperand(4); 1776 1777 if (LHS.getValueType() == MVT::f128) { 1778 // f128 comparisons are lowered to runtime calls by a routine which sets 1779 // LHS, RHS and CC appropriately for the rest of this function to continue. 1780 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 1781 1782 // If softenSetCCOperands returned a scalar, we need to compare the result 1783 // against zero to select between true and false values. 1784 if (RHS.getNode() == 0) { 1785 RHS = DAG.getConstant(0, LHS.getValueType()); 1786 CC = ISD::SETNE; 1787 } 1788 } 1789 1790 if (LHS.getValueType().isInteger()) { 1791 SDValue A64cc; 1792 1793 // Integers are handled in a separate function because the combinations of 1794 // immediates and tests can get hairy and we may want to fiddle things. 1795 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 1796 1797 return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1798 Chain, CmpOp, A64cc, DestBB); 1799 } 1800 1801 // Note that some LLVM floating-point CondCodes can't be lowered to a single 1802 // conditional branch, hence FPCCToA64CC can set a second test, where either 1803 // passing is sufficient. 1804 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 1805 CondCode = FPCCToA64CC(CC, Alternative); 1806 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 1807 SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 1808 DAG.getCondCode(CC)); 1809 SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1810 Chain, SetCC, A64cc, DestBB); 1811 1812 if (Alternative != A64CC::Invalid) { 1813 A64cc = DAG.getConstant(Alternative, MVT::i32); 1814 A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1815 A64BR_CC, SetCC, A64cc, DestBB); 1816 1817 } 1818 1819 return A64BR_CC; 1820} 1821 1822SDValue 1823AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG, 1824 RTLIB::Libcall Call) const { 1825 ArgListTy Args; 1826 ArgListEntry Entry; 1827 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { 1828 EVT ArgVT = Op.getOperand(i).getValueType(); 1829 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 1830 Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy; 1831 Entry.isSExt = false; 1832 Entry.isZExt = false; 1833 Args.push_back(Entry); 1834 } 1835 SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy()); 1836 1837 Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext()); 1838 1839 // By default, the input chain to this libcall is the entry node of the 1840 // function. If the libcall is going to be emitted as a tail call then 1841 // isUsedByReturnOnly will change it to the right chain if the return 1842 // node which is being folded has a non-entry input chain. 
1843 SDValue InChain = DAG.getEntryNode(); 1844 1845 // isTailCall may be true since the callee does not reference caller stack 1846 // frame. Check if it's in the right position. 1847 SDValue TCChain = InChain; 1848 bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain); 1849 if (isTailCall) 1850 InChain = TCChain; 1851 1852 TargetLowering:: 1853 CallLoweringInfo CLI(InChain, RetTy, false, false, false, false, 1854 0, getLibcallCallingConv(Call), isTailCall, 1855 /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, 1856 Callee, Args, DAG, SDLoc(Op)); 1857 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 1858 1859 if (!CallInfo.second.getNode()) 1860 // It's a tailcall, return the chain (which is the DAG root). 1861 return DAG.getRoot(); 1862 1863 return CallInfo.first; 1864} 1865 1866SDValue 1867AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 1868 if (Op.getOperand(0).getValueType() != MVT::f128) { 1869 // It's legal except when f128 is involved 1870 return Op; 1871 } 1872 1873 RTLIB::Libcall LC; 1874 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 1875 1876 SDValue SrcVal = Op.getOperand(0); 1877 return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, 1878 /*isSigned*/ false, SDLoc(Op)); 1879} 1880 1881SDValue 1882AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 1883 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 1884 1885 RTLIB::Libcall LC; 1886 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 1887 1888 return LowerF128ToCall(Op, DAG, LC); 1889} 1890 1891SDValue 1892AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, 1893 bool IsSigned) const { 1894 if (Op.getOperand(0).getValueType() != MVT::f128) { 1895 // It's legal except when f128 is involved 1896 return Op; 1897 } 1898 1899 RTLIB::Libcall LC; 1900 if (IsSigned) 1901 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1902 else 1903 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1904 1905 return LowerF128ToCall(Op, DAG, LC); 1906} 1907 1908SDValue 1909AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op, 1910 SelectionDAG &DAG) const { 1911 assert(getTargetMachine().getCodeModel() == CodeModel::Large); 1912 assert(getTargetMachine().getRelocationModel() == Reloc::Static); 1913 1914 EVT PtrVT = getPointerTy(); 1915 SDLoc dl(Op); 1916 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 1917 const GlobalValue *GV = GN->getGlobal(); 1918 1919 SDValue GlobalAddr = DAG.getNode( 1920 AArch64ISD::WrapperLarge, dl, PtrVT, 1921 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3), 1922 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC), 1923 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC), 1924 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); 1925 1926 if (GN->getOffset() != 0) 1927 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, 1928 DAG.getConstant(GN->getOffset(), PtrVT)); 1929 1930 return GlobalAddr; 1931} 1932 1933SDValue 1934AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op, 1935 SelectionDAG &DAG) const { 1936 assert(getTargetMachine().getCodeModel() == CodeModel::Small); 1937 1938 EVT PtrVT = getPointerTy(); 1939 SDLoc dl(Op); 1940 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 1941 const GlobalValue *GV = GN->getGlobal(); 1942 unsigned Alignment = GV->getAlignment(); 1943 Reloc::Model 
RelocM = getTargetMachine().getRelocationModel(); 1944 if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) { 1945 // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate 1946 // to zero when they remain undefined. In PIC mode the GOT can take care of 1947 // this, but in absolute mode we use a constant pool load. 1948 SDValue PoolAddr; 1949 PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 1950 DAG.getTargetConstantPool(GV, PtrVT, 0, 0, 1951 AArch64II::MO_NO_FLAG), 1952 DAG.getTargetConstantPool(GV, PtrVT, 0, 0, 1953 AArch64II::MO_LO12), 1954 DAG.getConstant(8, MVT::i32)); 1955 SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr, 1956 MachinePointerInfo::getConstantPool(), 1957 /*isVolatile=*/ false, 1958 /*isNonTemporal=*/ true, 1959 /*isInvariant=*/ true, 8); 1960 if (GN->getOffset() != 0) 1961 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, 1962 DAG.getConstant(GN->getOffset(), PtrVT)); 1963 1964 return GlobalAddr; 1965 } 1966 1967 if (Alignment == 0) { 1968 const PointerType *GVPtrTy = cast<PointerType>(GV->getType()); 1969 if (GVPtrTy->getElementType()->isSized()) { 1970 Alignment 1971 = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType()); 1972 } else { 1973 // Be conservative if we can't guess, not that it really matters: 1974 // functions and labels aren't valid for loads, and the methods used to 1975 // actually calculate an address work with any alignment. 1976 Alignment = 1; 1977 } 1978 } 1979 1980 unsigned char HiFixup, LoFixup; 1981 bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM); 1982 1983 if (UseGOT) { 1984 HiFixup = AArch64II::MO_GOT; 1985 LoFixup = AArch64II::MO_GOT_LO12; 1986 Alignment = 8; 1987 } else { 1988 HiFixup = AArch64II::MO_NO_FLAG; 1989 LoFixup = AArch64II::MO_LO12; 1990 } 1991 1992 // AArch64's small model demands the following sequence: 1993 // ADRP x0, somewhere 1994 // ADD x0, x0, #:lo12:somewhere ; (or LDR directly). 1995 SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 1996 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 1997 HiFixup), 1998 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 1999 LoFixup), 2000 DAG.getConstant(Alignment, MVT::i32)); 2001 2002 if (UseGOT) { 2003 GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(), 2004 GlobalRef); 2005 } 2006 2007 if (GN->getOffset() != 0) 2008 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef, 2009 DAG.getConstant(GN->getOffset(), PtrVT)); 2010 2011 return GlobalRef; 2012} 2013 2014SDValue 2015AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op, 2016 SelectionDAG &DAG) const { 2017 // TableGen doesn't have easy access to the CodeModel or RelocationModel, so 2018 // we make those distinctions here. 2019 2020 switch (getTargetMachine().getCodeModel()) { 2021 case CodeModel::Small: 2022 return LowerGlobalAddressELFSmall(Op, DAG); 2023 case CodeModel::Large: 2024 return LowerGlobalAddressELFLarge(Op, DAG); 2025 default: 2026 llvm_unreachable("Only small and large code models supported now"); 2027 } 2028} 2029 2030SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, 2031 SDValue DescAddr, 2032 SDLoc DL, 2033 SelectionDAG &DAG) const { 2034 EVT PtrVT = getPointerTy(); 2035 2036 // The function we need to call is simply the first entry in the GOT for this 2037 // descriptor, load it in preparation. 
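  // (In effect the descriptor is a pair of 64-bit GOT words: the resolver's
  // entry point, which is what we load here, and an argument word the resolver
  // interprets. The ABI fixes X0 as both the incoming descriptor address and
  // the returned TP-relative offset.)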
2038 SDValue Func, Chain; 2039 Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), 2040 DescAddr); 2041 2042 // The function takes only one argument: the address of the descriptor itself 2043 // in X0. 2044 SDValue Glue; 2045 Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); 2046 Glue = Chain.getValue(1); 2047 2048 // Finally, there's a special calling-convention which means that the lookup 2049 // must preserve all registers (except X0, obviously). 2050 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 2051 const AArch64RegisterInfo *A64RI 2052 = static_cast<const AArch64RegisterInfo *>(TRI); 2053 const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask(); 2054 2055 // We're now ready to populate the argument list, as with a normal call: 2056 std::vector<SDValue> Ops; 2057 Ops.push_back(Chain); 2058 Ops.push_back(Func); 2059 Ops.push_back(SymAddr); 2060 Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); 2061 Ops.push_back(DAG.getRegisterMask(Mask)); 2062 Ops.push_back(Glue); 2063 2064 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2065 Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0], 2066 Ops.size()); 2067 Glue = Chain.getValue(1); 2068 2069 // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it 2070 // back to the generic handling code. 2071 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 2072} 2073 2074SDValue 2075AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 2076 SelectionDAG &DAG) const { 2077 assert(getSubtarget()->isTargetELF() && 2078 "TLS not implemented for non-ELF targets"); 2079 assert(getTargetMachine().getCodeModel() == CodeModel::Small 2080 && "TLS only supported in small memory model"); 2081 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 2082 2083 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 2084 2085 SDValue TPOff; 2086 EVT PtrVT = getPointerTy(); 2087 SDLoc DL(Op); 2088 const GlobalValue *GV = GA->getGlobal(); 2089 2090 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 2091 2092 if (Model == TLSModel::InitialExec) { 2093 TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2094 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2095 AArch64II::MO_GOTTPREL), 2096 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2097 AArch64II::MO_GOTTPREL_LO12), 2098 DAG.getConstant(8, MVT::i32)); 2099 TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), 2100 TPOff); 2101 } else if (Model == TLSModel::LocalExec) { 2102 SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2103 AArch64II::MO_TPREL_G1); 2104 SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2105 AArch64II::MO_TPREL_G0_NC); 2106 2107 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, 2108 DAG.getTargetConstant(1, MVT::i32)), 0); 2109 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, 2110 TPOff, LoVar, 2111 DAG.getTargetConstant(0, MVT::i32)), 0); 2112 } else if (Model == TLSModel::GeneralDynamic) { 2113 // Accesses used in this sequence go via the TLS descriptor which lives in 2114 // the GOT. Prepare an address we can use to handle this. 
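    // The emitted code ends up looking roughly like:
    //     adrp  x0, :tlsdesc:var
    //     ldr   x1, [x0, #:tlsdesc_lo12:var]
    //     add   x0, x0, #:tlsdesc_lo12:var
    //     .tlsdesccall var
    //     blr   x1
    // after which X0 holds the offset of "var" from TPIDR_EL0.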
2115 SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2116 AArch64II::MO_TLSDESC); 2117 SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2118 AArch64II::MO_TLSDESC_LO12); 2119 SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2120 HiDesc, LoDesc, 2121 DAG.getConstant(8, MVT::i32)); 2122 SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0); 2123 2124 TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); 2125 } else if (Model == TLSModel::LocalDynamic) { 2126 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS 2127 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 2128 // the beginning of the module's TLS region, followed by a DTPREL offset 2129 // calculation. 2130 2131 // These accesses will need deduplicating if there's more than one. 2132 AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction() 2133 .getInfo<AArch64MachineFunctionInfo>(); 2134 MFI->incNumLocalDynamicTLSAccesses(); 2135 2136 2137 // Get the location of _TLS_MODULE_BASE_: 2138 SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 2139 AArch64II::MO_TLSDESC); 2140 SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 2141 AArch64II::MO_TLSDESC_LO12); 2142 SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2143 HiDesc, LoDesc, 2144 DAG.getConstant(8, MVT::i32)); 2145 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT); 2146 2147 ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); 2148 2149 // Get the variable's offset from _TLS_MODULE_BASE_ 2150 SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2151 AArch64II::MO_DTPREL_G1); 2152 SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2153 AArch64II::MO_DTPREL_G0_NC); 2154 2155 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, 2156 DAG.getTargetConstant(0, MVT::i32)), 0); 2157 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, 2158 TPOff, LoVar, 2159 DAG.getTargetConstant(0, MVT::i32)), 0); 2160 } else 2161 llvm_unreachable("Unsupported TLS access model"); 2162 2163 2164 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 2165} 2166 2167SDValue 2168AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, 2169 bool IsSigned) const { 2170 if (Op.getValueType() != MVT::f128) { 2171 // Legal for everything except f128. 2172 return Op; 2173 } 2174 2175 RTLIB::Libcall LC; 2176 if (IsSigned) 2177 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2178 else 2179 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2180 2181 return LowerF128ToCall(Op, DAG, LC); 2182} 2183 2184 2185SDValue 2186AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 2187 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 2188 SDLoc dl(JT); 2189 EVT PtrVT = getPointerTy(); 2190 2191 // When compiling PIC, jump tables get put in the code section so a static 2192 // relocation-style is acceptable for both cases. 
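  // For the small model this becomes the usual ADRP/ADD pair, roughly:
  //     adrp  xN, .LJTI0_0
  //     add   xN, xN, #:lo12:.LJTI0_0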
2193 switch (getTargetMachine().getCodeModel()) { 2194 case CodeModel::Small: 2195 return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 2196 DAG.getTargetJumpTable(JT->getIndex(), PtrVT), 2197 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 2198 AArch64II::MO_LO12), 2199 DAG.getConstant(1, MVT::i32)); 2200 case CodeModel::Large: 2201 return DAG.getNode( 2202 AArch64ISD::WrapperLarge, dl, PtrVT, 2203 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3), 2204 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC), 2205 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC), 2206 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC)); 2207 default: 2208 llvm_unreachable("Only small and large code models supported now"); 2209 } 2210} 2211 2212// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) 2213SDValue 2214AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 2215 SDLoc dl(Op); 2216 SDValue LHS = Op.getOperand(0); 2217 SDValue RHS = Op.getOperand(1); 2218 SDValue IfTrue = Op.getOperand(2); 2219 SDValue IfFalse = Op.getOperand(3); 2220 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 2221 2222 if (LHS.getValueType() == MVT::f128) { 2223 // f128 comparisons are lowered to libcalls, but slot in nicely here 2224 // afterwards. 2225 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 2226 2227 // If softenSetCCOperands returned a scalar, we need to compare the result 2228 // against zero to select between true and false values. 2229 if (RHS.getNode() == 0) { 2230 RHS = DAG.getConstant(0, LHS.getValueType()); 2231 CC = ISD::SETNE; 2232 } 2233 } 2234 2235 if (LHS.getValueType().isInteger()) { 2236 SDValue A64cc; 2237 2238 // Integers are handled in a separate function because the combinations of 2239 // immediates and tests can get hairy and we may want to fiddle things. 2240 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 2241 2242 return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2243 CmpOp, IfTrue, IfFalse, A64cc); 2244 } 2245 2246 // Note that some LLVM floating-point CondCodes can't be lowered to a single 2247 // conditional branch, hence FPCCToA64CC can set a second test, where either 2248 // passing is sufficient. 2249 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 2250 CondCode = FPCCToA64CC(CC, Alternative); 2251 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 2252 SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 2253 DAG.getCondCode(CC)); 2254 SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, 2255 Op.getValueType(), 2256 SetCC, IfTrue, IfFalse, A64cc); 2257 2258 if (Alternative != A64CC::Invalid) { 2259 A64cc = DAG.getConstant(Alternative, MVT::i32); 2260 A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2261 SetCC, IfTrue, A64SELECT_CC, A64cc); 2262 2263 } 2264 2265 return A64SELECT_CC; 2266} 2267 2268// (SELECT testbit, iftrue, iffalse) 2269SDValue 2270AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 2271 SDLoc dl(Op); 2272 SDValue TheBit = Op.getOperand(0); 2273 SDValue IfTrue = Op.getOperand(1); 2274 SDValue IfFalse = Op.getOperand(2); 2275 2276 // AArch64 BooleanContents is the default UndefinedBooleanContent, which means 2277 // that as the consumer we are responsible for ignoring rubbish in higher 2278 // bits. 
2279 TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, 2280 DAG.getConstant(1, MVT::i32)); 2281 SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, 2282 DAG.getConstant(0, TheBit.getValueType()), 2283 DAG.getCondCode(ISD::SETNE)); 2284 2285 return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2286 A64CMP, IfTrue, IfFalse, 2287 DAG.getConstant(A64CC::NE, MVT::i32)); 2288} 2289 2290static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) { 2291 SDLoc DL(Op); 2292 SDValue LHS = Op.getOperand(0); 2293 SDValue RHS = Op.getOperand(1); 2294 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 2295 EVT VT = Op.getValueType(); 2296 bool Invert = false; 2297 SDValue Op0, Op1; 2298 unsigned Opcode; 2299 2300 if (LHS.getValueType().isInteger()) { 2301 2302 // Attempt to use Vector Integer Compare Mask Test instruction. 2303 // TST = icmp ne (and (op0, op1), zero). 2304 if (CC == ISD::SETNE) { 2305 if (((LHS.getOpcode() == ISD::AND) && 2306 ISD::isBuildVectorAllZeros(RHS.getNode())) || 2307 ((RHS.getOpcode() == ISD::AND) && 2308 ISD::isBuildVectorAllZeros(LHS.getNode()))) { 2309 2310 SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS; 2311 SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0)); 2312 SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1)); 2313 return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS); 2314 } 2315 } 2316 2317 // Attempt to use Vector Integer Compare Mask against Zero instr (Signed). 2318 // Note: Compare against Zero does not support unsigned predicates. 2319 if ((ISD::isBuildVectorAllZeros(RHS.getNode()) || 2320 ISD::isBuildVectorAllZeros(LHS.getNode())) && 2321 !isUnsignedIntSetCC(CC)) { 2322 2323 // If LHS is the zero value, swap operands and CondCode. 2324 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 2325 CC = getSetCCSwappedOperands(CC); 2326 Op0 = RHS; 2327 } else 2328 Op0 = LHS; 2329 2330 // Ensure valid CondCode for Compare Mask against Zero instruction: 2331 // EQ, GE, GT, LE, LT. 2332 if (ISD::SETNE == CC) { 2333 Invert = true; 2334 CC = ISD::SETEQ; 2335 } 2336 2337 // Using constant type to differentiate integer and FP compares with zero. 2338 Op1 = DAG.getConstant(0, MVT::i32); 2339 Opcode = AArch64ISD::NEON_CMPZ; 2340 2341 } else { 2342 // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned). 2343 // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT. 2344 bool Swap = false; 2345 switch (CC) { 2346 default: 2347 llvm_unreachable("Illegal integer comparison."); 2348 case ISD::SETEQ: 2349 case ISD::SETGT: 2350 case ISD::SETGE: 2351 case ISD::SETUGT: 2352 case ISD::SETUGE: 2353 break; 2354 case ISD::SETNE: 2355 Invert = true; 2356 CC = ISD::SETEQ; 2357 break; 2358 case ISD::SETULT: 2359 case ISD::SETULE: 2360 case ISD::SETLT: 2361 case ISD::SETLE: 2362 Swap = true; 2363 CC = getSetCCSwappedOperands(CC); 2364 } 2365 2366 if (Swap) 2367 std::swap(LHS, RHS); 2368 2369 Opcode = AArch64ISD::NEON_CMP; 2370 Op0 = LHS; 2371 Op1 = RHS; 2372 } 2373 2374 // Generate Compare Mask instr or Compare Mask against Zero instr. 2375 SDValue NeonCmp = 2376 DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); 2377 2378 if (Invert) 2379 NeonCmp = DAG.getNOT(DL, NeonCmp, VT); 2380 2381 return NeonCmp; 2382 } 2383 2384 // Now handle Floating Point cases. 2385 // Attempt to use Vector Floating Point Compare Mask against Zero instruction. 
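  // e.g. (setcc olt %a, zeroinitializer) on v2f32 can become
  //     fcmlt v0.2s, v1.2s, #0.0
  // rather than materialising a zero vector for a register-register compare.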
2386 if (ISD::isBuildVectorAllZeros(RHS.getNode()) || 2387 ISD::isBuildVectorAllZeros(LHS.getNode())) { 2388 2389 // If LHS is the zero value, swap operands and CondCode. 2390 if (ISD::isBuildVectorAllZeros(LHS.getNode())) { 2391 CC = getSetCCSwappedOperands(CC); 2392 Op0 = RHS; 2393 } else 2394 Op0 = LHS; 2395 2396 // Using constant type to differentiate integer and FP compares with zero. 2397 Op1 = DAG.getConstantFP(0, MVT::f32); 2398 Opcode = AArch64ISD::NEON_CMPZ; 2399 } else { 2400 // Attempt to use Vector Floating Point Compare Mask instruction. 2401 Op0 = LHS; 2402 Op1 = RHS; 2403 Opcode = AArch64ISD::NEON_CMP; 2404 } 2405 2406 SDValue NeonCmpAlt; 2407 // Some register compares have to be implemented with swapped CC and operands, 2408 // e.g.: OLT implemented as OGT with swapped operands. 2409 bool SwapIfRegArgs = false; 2410 2411 // Ensure valid CondCode for FP Compare Mask against Zero instruction: 2412 // EQ, GE, GT, LE, LT. 2413 // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT. 2414 switch (CC) { 2415 default: 2416 llvm_unreachable("Illegal FP comparison"); 2417 case ISD::SETUNE: 2418 case ISD::SETNE: 2419 Invert = true; // Fallthrough 2420 case ISD::SETOEQ: 2421 case ISD::SETEQ: 2422 CC = ISD::SETEQ; 2423 break; 2424 case ISD::SETOLT: 2425 case ISD::SETLT: 2426 CC = ISD::SETLT; 2427 SwapIfRegArgs = true; 2428 break; 2429 case ISD::SETOGT: 2430 case ISD::SETGT: 2431 CC = ISD::SETGT; 2432 break; 2433 case ISD::SETOLE: 2434 case ISD::SETLE: 2435 CC = ISD::SETLE; 2436 SwapIfRegArgs = true; 2437 break; 2438 case ISD::SETOGE: 2439 case ISD::SETGE: 2440 CC = ISD::SETGE; 2441 break; 2442 case ISD::SETUGE: 2443 Invert = true; 2444 CC = ISD::SETLT; 2445 SwapIfRegArgs = true; 2446 break; 2447 case ISD::SETULE: 2448 Invert = true; 2449 CC = ISD::SETGT; 2450 break; 2451 case ISD::SETUGT: 2452 Invert = true; 2453 CC = ISD::SETLE; 2454 SwapIfRegArgs = true; 2455 break; 2456 case ISD::SETULT: 2457 Invert = true; 2458 CC = ISD::SETGE; 2459 break; 2460 case ISD::SETUEQ: 2461 Invert = true; // Fallthrough 2462 case ISD::SETONE: 2463 // Expand this to (OGT |OLT). 2464 NeonCmpAlt = 2465 DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT)); 2466 CC = ISD::SETLT; 2467 SwapIfRegArgs = true; 2468 break; 2469 case ISD::SETUO: 2470 Invert = true; // Fallthrough 2471 case ISD::SETO: 2472 // Expand this to (OGE | OLT). 
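    // (Two operands are ordered exactly when one of OGE and OLT holds, so
    // ORing the two compare masks produces the ordered mask; the SETUO case
    // above then simply inverts it.)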
      NeonCmpAlt =
          DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE));
      CC = ISD::SETLT;
      SwapIfRegArgs = true;
      break;
  }

  if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) {
    CC = getSetCCSwappedOperands(CC);
    std::swap(Op0, Op1);
  }

  // Generate FP Compare Mask instr or FP Compare Mask against Zero instr
  SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));

  if (NeonCmpAlt.getNode())
    NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt);

  if (Invert)
    NeonCmp = DAG.getNOT(DL, NeonCmp, VT);

  return NeonCmp;
}

// (SETCC lhs, rhs, condcode)
SDValue
AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  EVT VT = Op.getValueType();

  if (VT.isVector())
    return LowerVectorSETCC(Op, DAG);

  if (LHS.getValueType() == MVT::f128) {
    // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
    // for the rest of the function (some i32 or i64 values).
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, use it.
    if (RHS.getNode() == 0) {
      assert(LHS.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      return LHS;
    }
  }

  if (LHS.getValueType().isInteger()) {
    SDValue A64cc;

    // Integers are handled in a separate function because the combinations of
    // immediates and tests can get hairy and we may want to fiddle things.
    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);

    return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
                       CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT),
                       A64cc);
  }

  // Note that some LLVM floating-point CondCodes can't be lowered to a single
  // conditional branch, hence FPCCToA64CC can set a second test, where either
  // passing is sufficient.
  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
  CondCode = FPCCToA64CC(CC, Alternative);
  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
  SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
                              DAG.getCondCode(CC));
  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
                                     CmpOp, DAG.getConstant(1, VT),
                                     DAG.getConstant(0, VT), A64cc);

  if (Alternative != A64CC::Invalid) {
    A64cc = DAG.getConstant(Alternative, MVT::i32);
    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
                               DAG.getConstant(1, VT), A64SELECT_CC, A64cc);
  }

  return A64SELECT_CC;
}

SDValue
AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
  // rather than just 8.
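  // On AAPCS64 targets va_list is, in effect:
  //     struct va_list {
  //       void *__stack;   // offset 0
  //       void *__gr_top;  // offset 8
  //       void *__vr_top;  // offset 16
  //       int   __gr_offs; // offset 24
  //       int   __vr_offs; // offset 28
  //     };
  // hence the 32-byte memcpy below.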
2562 return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), 2563 Op.getOperand(1), Op.getOperand(2), 2564 DAG.getConstant(32, MVT::i32), 8, false, false, 2565 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); 2566} 2567 2568SDValue 2569AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2570 // The layout of the va_list struct is specified in the AArch64 Procedure Call 2571 // Standard, section B.3. 2572 MachineFunction &MF = DAG.getMachineFunction(); 2573 AArch64MachineFunctionInfo *FuncInfo 2574 = MF.getInfo<AArch64MachineFunctionInfo>(); 2575 SDLoc DL(Op); 2576 2577 SDValue Chain = Op.getOperand(0); 2578 SDValue VAList = Op.getOperand(1); 2579 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2580 SmallVector<SDValue, 4> MemOps; 2581 2582 // void *__stack at offset 0 2583 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(), 2584 getPointerTy()); 2585 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 2586 MachinePointerInfo(SV), false, false, 0)); 2587 2588 // void *__gr_top at offset 8 2589 int GPRSize = FuncInfo->getVariadicGPRSize(); 2590 if (GPRSize > 0) { 2591 SDValue GRTop, GRTopAddr; 2592 2593 GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2594 DAG.getConstant(8, getPointerTy())); 2595 2596 GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy()); 2597 GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, 2598 DAG.getConstant(GPRSize, getPointerTy())); 2599 2600 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 2601 MachinePointerInfo(SV, 8), 2602 false, false, 0)); 2603 } 2604 2605 // void *__vr_top at offset 16 2606 int FPRSize = FuncInfo->getVariadicFPRSize(); 2607 if (FPRSize > 0) { 2608 SDValue VRTop, VRTopAddr; 2609 VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2610 DAG.getConstant(16, getPointerTy())); 2611 2612 VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy()); 2613 VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, 2614 DAG.getConstant(FPRSize, getPointerTy())); 2615 2616 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 2617 MachinePointerInfo(SV, 16), 2618 false, false, 0)); 2619 } 2620 2621 // int __gr_offs at offset 24 2622 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2623 DAG.getConstant(24, getPointerTy())); 2624 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), 2625 GROffsAddr, MachinePointerInfo(SV, 24), 2626 false, false, 0)); 2627 2628 // int __vr_offs at offset 28 2629 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2630 DAG.getConstant(28, getPointerTy())); 2631 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), 2632 VROffsAddr, MachinePointerInfo(SV, 28), 2633 false, false, 0)); 2634 2635 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], 2636 MemOps.size()); 2637} 2638 2639SDValue 2640AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2641 switch (Op.getOpcode()) { 2642 default: llvm_unreachable("Don't know how to custom lower this!"); 2643 case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128); 2644 case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128); 2645 case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128); 2646 case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128); 2647 case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true); 2648 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false); 2649 case 
ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true); 2650 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false); 2651 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 2652 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 2653 2654 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 2655 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 2656 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 2657 case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG); 2658 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 2659 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 2660 case ISD::SELECT: return LowerSELECT(Op, DAG); 2661 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 2662 case ISD::SETCC: return LowerSETCC(Op, DAG); 2663 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 2664 case ISD::VASTART: return LowerVASTART(Op, DAG); 2665 case ISD::BUILD_VECTOR: 2666 return LowerBUILD_VECTOR(Op, DAG, getSubtarget()); 2667 } 2668 2669 return SDValue(); 2670} 2671 2672/// Check if the specified splat value corresponds to a valid vector constant 2673/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI). If 2674/// so, return the encoded 8-bit immediate and the OpCmode instruction fields 2675/// values. 2676static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 2677 unsigned SplatBitSize, SelectionDAG &DAG, 2678 bool is128Bits, NeonModImmType type, EVT &VT, 2679 unsigned &Imm, unsigned &OpCmode) { 2680 switch (SplatBitSize) { 2681 default: 2682 llvm_unreachable("unexpected size for isNeonModifiedImm"); 2683 case 8: { 2684 if (type != Neon_Mov_Imm) 2685 return false; 2686 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 2687 // Neon movi per byte: Op=0, Cmode=1110. 2688 OpCmode = 0xe; 2689 Imm = SplatBits; 2690 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 2691 break; 2692 } 2693 case 16: { 2694 // Neon move inst per halfword 2695 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 2696 if ((SplatBits & ~0xff) == 0) { 2697 // Value = 0x00nn is 0x00nn LSL 0 2698 // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000 2699 // bic: Op=1, Cmode=1001; orr: Op=0, Cmode=1001 2700 // Op=x, Cmode=100y 2701 Imm = SplatBits; 2702 OpCmode = 0x8; 2703 break; 2704 } 2705 if ((SplatBits & ~0xff00) == 0) { 2706 // Value = 0xnn00 is 0x00nn LSL 8 2707 // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010 2708 // bic: Op=1, Cmode=1011; orr: Op=0, Cmode=1011 2709 // Op=x, Cmode=101x 2710 Imm = SplatBits >> 8; 2711 OpCmode = 0xa; 2712 break; 2713 } 2714 // can't handle any other 2715 return false; 2716 } 2717 2718 case 32: { 2719 // First the LSL variants (MSL is unusable by some interested instructions). 2720 2721 // Neon move instr per word, shift zeros 2722 VT = is128Bits ? 
MVT::v4i32 : MVT::v2i32; 2723 if ((SplatBits & ~0xff) == 0) { 2724 // Value = 0x000000nn is 0x000000nn LSL 0 2725 // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000 2726 // bic: Op=1, Cmode= 0001; orr: Op=0, Cmode= 0001 2727 // Op=x, Cmode=000x 2728 Imm = SplatBits; 2729 OpCmode = 0; 2730 break; 2731 } 2732 if ((SplatBits & ~0xff00) == 0) { 2733 // Value = 0x0000nn00 is 0x000000nn LSL 8 2734 // movi: Op=0, Cmode= 0010; mvni: Op=1, Cmode= 0010 2735 // bic: Op=1, Cmode= 0011; orr : Op=0, Cmode= 0011 2736 // Op=x, Cmode=001x 2737 Imm = SplatBits >> 8; 2738 OpCmode = 0x2; 2739 break; 2740 } 2741 if ((SplatBits & ~0xff0000) == 0) { 2742 // Value = 0x00nn0000 is 0x000000nn LSL 16 2743 // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100 2744 // bic: Op=1, Cmode= 0101; orr: Op=0, Cmode= 0101 2745 // Op=x, Cmode=010x 2746 Imm = SplatBits >> 16; 2747 OpCmode = 0x4; 2748 break; 2749 } 2750 if ((SplatBits & ~0xff000000) == 0) { 2751 // Value = 0xnn000000 is 0x000000nn LSL 24 2752 // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110 2753 // bic: Op=1, Cmode= 0111; orr: Op=0, Cmode= 0111 2754 // Op=x, Cmode=011x 2755 Imm = SplatBits >> 24; 2756 OpCmode = 0x6; 2757 break; 2758 } 2759 2760 // Now the MSL immediates. 2761 2762 // Neon move instr per word, shift ones 2763 if ((SplatBits & ~0xffff) == 0 && 2764 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 2765 // Value = 0x0000nnff is 0x000000nn MSL 8 2766 // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100 2767 // Op=x, Cmode=1100 2768 Imm = SplatBits >> 8; 2769 OpCmode = 0xc; 2770 break; 2771 } 2772 if ((SplatBits & ~0xffffff) == 0 && 2773 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 2774 // Value = 0x00nnffff is 0x000000nn MSL 16 2775 // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101 2776 // Op=x, Cmode=1101 2777 Imm = SplatBits >> 16; 2778 OpCmode = 0xd; 2779 break; 2780 } 2781 // can't handle any other 2782 return false; 2783 } 2784 2785 case 64: { 2786 if (type != Neon_Mov_Imm) 2787 return false; 2788 // Neon move instr bytemask, where each byte is either 0x00 or 0xff. 2789 // movi Op=1, Cmode=1110. 2790 OpCmode = 0x1e; 2791 uint64_t BitMask = 0xff; 2792 uint64_t Val = 0; 2793 unsigned ImmMask = 1; 2794 Imm = 0; 2795 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 2796 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 2797 Val |= BitMask; 2798 Imm |= ImmMask; 2799 } else if ((SplatBits & BitMask) != 0) { 2800 return false; 2801 } 2802 BitMask <<= 8; 2803 ImmMask <<= 1; 2804 } 2805 SplatBits = Val; 2806 VT = is128Bits ? MVT::v2i64 : MVT::v1i64; 2807 break; 2808 } 2809 } 2810 2811 return true; 2812} 2813 2814static SDValue PerformANDCombine(SDNode *N, 2815 TargetLowering::DAGCombinerInfo &DCI) { 2816 2817 SelectionDAG &DAG = DCI.DAG; 2818 SDLoc DL(N); 2819 EVT VT = N->getValueType(0); 2820 2821 // We're looking for an SRA/SHL pair which form an SBFX. 
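  // (For this AND combine the pattern actually matched below is
  // (and (srl x, lsb), mask) with "mask" a contiguous run of ones; it is
  // turned into a UBFX of width popcount(mask) starting at bit lsb. The
  // SRA/SHL form is what PerformSRACombine handles.)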

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  if (!isa<ConstantSDNode>(N->getOperand(1)))
    return SDValue();

  uint64_t TruncMask = N->getConstantOperandVal(1);
  if (!isMask_64(TruncMask))
    return SDValue();

  uint64_t Width = CountPopulation_64(TruncMask);
  SDValue Shift = N->getOperand(0);

  if (Shift.getOpcode() != ISD::SRL)
    return SDValue();

  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
    return SDValue();
  uint64_t LSB = Shift->getConstantOperandVal(1);

  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
    return SDValue();

  return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0),
                     DAG.getConstant(LSB, MVT::i64),
                     DAG.getConstant(LSB + Width - 1, MVT::i64));
}

/// For a true bitfield insert, the bits getting into that contiguous mask
/// should come from the low part of an existing value: they must be formed
/// from a compatible SHL operation (unless they're already low). This function
/// checks that condition and returns the least-significant bit that's
/// intended. If the operation is not a field preparation, -1 is returned.
static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT,
                            SDValue &MaskedVal, uint64_t Mask) {
  if (!isShiftedMask_64(Mask))
    return -1;

  // Now we need to alter MaskedVal so that it is an appropriate input for a
  // BFI instruction. BFI will do a left-shift by LSB before applying the mask
  // we've spotted, so in general we should pre-emptively "undo" that by making
  // sure the incoming bits have had a right-shift applied to them.
  //
  // This right shift, however, will combine with existing left/right shifts.
  // In the simplest case of a completely straight bitfield operation, it will
  // be expected to completely cancel out with an existing SHL. More
  // complicated cases (e.g. bitfield to bitfield copy) may still need a real
  // shift before the BFI.

  uint64_t LSB = countTrailingZeros(Mask);
  int64_t ShiftRightRequired = LSB;
  if (MaskedVal.getOpcode() == ISD::SHL &&
      isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
    ShiftRightRequired -= MaskedVal.getConstantOperandVal(1);
    MaskedVal = MaskedVal.getOperand(0);
  } else if (MaskedVal.getOpcode() == ISD::SRL &&
             isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
    ShiftRightRequired += MaskedVal.getConstantOperandVal(1);
    MaskedVal = MaskedVal.getOperand(0);
  }

  if (ShiftRightRequired > 0)
    MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal,
                            DAG.getConstant(ShiftRightRequired, MVT::i64));
  else if (ShiftRightRequired < 0) {
    // We could actually end up with a residual left shift, for example with
    // "struc.bitfield = val << 1".
    MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal,
                            DAG.getConstant(-ShiftRightRequired, MVT::i64));
  }

  return LSB;
}

/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded
/// by a mask and an extension. Returns true if a BFI was found and provides
/// information on its surroundings.
2900static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask, 2901 bool &Extended) { 2902 Extended = false; 2903 if (N.getOpcode() == ISD::ZERO_EXTEND) { 2904 Extended = true; 2905 N = N.getOperand(0); 2906 } 2907 2908 if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) { 2909 Mask = N->getConstantOperandVal(1); 2910 N = N.getOperand(0); 2911 } else { 2912 // Mask is the whole width. 2913 Mask = -1ULL >> (64 - N.getValueType().getSizeInBits()); 2914 } 2915 2916 if (N.getOpcode() == AArch64ISD::BFI) { 2917 BFI = N; 2918 return true; 2919 } 2920 2921 return false; 2922} 2923 2924/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which 2925/// is roughly equivalent to (and (BFI ...), mask). This form is used because it 2926/// can often be further combined with a larger mask. Ultimately, we want mask 2927/// to be 2^32-1 or 2^64-1 so the AND can be skipped. 2928static SDValue tryCombineToBFI(SDNode *N, 2929 TargetLowering::DAGCombinerInfo &DCI, 2930 const AArch64Subtarget *Subtarget) { 2931 SelectionDAG &DAG = DCI.DAG; 2932 SDLoc DL(N); 2933 EVT VT = N->getValueType(0); 2934 2935 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 2936 2937 // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or 2938 // abandon the effort. 2939 SDValue LHS = N->getOperand(0); 2940 if (LHS.getOpcode() != ISD::AND) 2941 return SDValue(); 2942 2943 uint64_t LHSMask; 2944 if (isa<ConstantSDNode>(LHS.getOperand(1))) 2945 LHSMask = LHS->getConstantOperandVal(1); 2946 else 2947 return SDValue(); 2948 2949 // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask 2950 // is or abandon the effort. 2951 SDValue RHS = N->getOperand(1); 2952 if (RHS.getOpcode() != ISD::AND) 2953 return SDValue(); 2954 2955 uint64_t RHSMask; 2956 if (isa<ConstantSDNode>(RHS.getOperand(1))) 2957 RHSMask = RHS->getConstantOperandVal(1); 2958 else 2959 return SDValue(); 2960 2961 // Can't do anything if the masks are incompatible. 2962 if (LHSMask & RHSMask) 2963 return SDValue(); 2964 2965 // Now we need one of the masks to be a contiguous field. Without loss of 2966 // generality that should be the RHS one. 2967 SDValue Bitfield = LHS.getOperand(0); 2968 if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) { 2969 // We know that LHS is a candidate new value, and RHS isn't already a better 2970 // one. 2971 std::swap(LHS, RHS); 2972 std::swap(LHSMask, RHSMask); 2973 } 2974 2975 // We've done our best to put the right operands in the right places, all we 2976 // can do now is check whether a BFI exists. 2977 Bitfield = RHS.getOperand(0); 2978 int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask); 2979 if (LSB == -1) 2980 return SDValue(); 2981 2982 uint32_t Width = CountPopulation_64(RHSMask); 2983 assert(Width && "Expected non-zero bitfield width"); 2984 2985 SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, 2986 LHS.getOperand(0), Bitfield, 2987 DAG.getConstant(LSB, MVT::i64), 2988 DAG.getConstant(Width, MVT::i64)); 2989 2990 // Mask is trivial 2991 if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits()))) 2992 return BFI; 2993 2994 return DAG.getNode(ISD::AND, DL, VT, BFI, 2995 DAG.getConstant(LHSMask | RHSMask, VT)); 2996} 2997 2998/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its 2999/// original input. This is surprisingly common because SROA splits things up 3000/// into i8 chunks, so the originally detected MaskedBFI may actually only act 3001/// on the low (say) byte of a word. 
This is then orred into the rest of the 3002/// word afterwards. 3003/// 3004/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)). 3005/// 3006/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the 3007/// MaskedBFI. We can also deal with a certain amount of extend/truncate being 3008/// involved. 3009static SDValue tryCombineToLargerBFI(SDNode *N, 3010 TargetLowering::DAGCombinerInfo &DCI, 3011 const AArch64Subtarget *Subtarget) { 3012 SelectionDAG &DAG = DCI.DAG; 3013 SDLoc DL(N); 3014 EVT VT = N->getValueType(0); 3015 3016 // First job is to hunt for a MaskedBFI on either the left or right. Swap 3017 // operands if it's actually on the right. 3018 SDValue BFI; 3019 SDValue PossExtraMask; 3020 uint64_t ExistingMask = 0; 3021 bool Extended = false; 3022 if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended)) 3023 PossExtraMask = N->getOperand(1); 3024 else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended)) 3025 PossExtraMask = N->getOperand(0); 3026 else 3027 return SDValue(); 3028 3029 // We can only combine a BFI with another compatible mask. 3030 if (PossExtraMask.getOpcode() != ISD::AND || 3031 !isa<ConstantSDNode>(PossExtraMask.getOperand(1))) 3032 return SDValue(); 3033 3034 uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1); 3035 3036 // Masks must be compatible. 3037 if (ExtraMask & ExistingMask) 3038 return SDValue(); 3039 3040 SDValue OldBFIVal = BFI.getOperand(0); 3041 SDValue NewBFIVal = BFI.getOperand(1); 3042 if (Extended) { 3043 // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be 3044 // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments 3045 // need to be made compatible. 3046 assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32 3047 && "Invalid types for BFI"); 3048 OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal); 3049 NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal); 3050 } 3051 3052 // We need the MaskedBFI to be combined with a mask of the *same* value. 3053 if (PossExtraMask.getOperand(0) != OldBFIVal) 3054 return SDValue(); 3055 3056 BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, 3057 OldBFIVal, NewBFIVal, 3058 BFI.getOperand(2), BFI.getOperand(3)); 3059 3060 // If the masking is trivial, we don't need to create it. 3061 if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits()))) 3062 return BFI; 3063 3064 return DAG.getNode(ISD::AND, DL, VT, BFI, 3065 DAG.getConstant(ExtraMask | ExistingMask, VT)); 3066} 3067 3068/// An EXTR instruction is made up of two shifts, ORed together. This helper 3069/// searches for and classifies those shifts. 3070static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 3071 bool &FromHi) { 3072 if (N.getOpcode() == ISD::SHL) 3073 FromHi = false; 3074 else if (N.getOpcode() == ISD::SRL) 3075 FromHi = true; 3076 else 3077 return false; 3078 3079 if (!isa<ConstantSDNode>(N.getOperand(1))) 3080 return false; 3081 3082 ShiftAmount = N->getConstantOperandVal(1); 3083 Src = N->getOperand(0); 3084 return true; 3085} 3086 3087/// EXTR instruction extracts a contiguous chunk of bits from two existing 3088/// registers viewed as a high/low pair. This function looks for the pattern: 3089/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an 3090/// EXTR. Can't quite be done in TableGen because the two immediates aren't 3091/// independent. 
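/// For example, on i32:
///   (or (shl x, #24), (srl y, #8))  ==>  EXTR Wd, Wx, Wy, #8
/// i.e. bits [39:8] of the 64-bit concatenation x:y.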

/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
                         bool &FromHi) {
  if (N.getOpcode() == ISD::SHL)
    FromHi = false;
  else if (N.getOpcode() == ISD::SRL)
    FromHi = true;
  else
    return false;

  if (!isa<ConstantSDNode>(N.getOperand(1)))
    return false;

  ShiftAmount = N->getConstantOperandVal(1);
  Src = N->getOperand(0);
  return true;
}

/// An EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
/// EXTR. Can't quite be done in TableGen because the two immediates aren't
/// independent.
static SDValue tryCombineToEXTR(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  assert(N->getOpcode() == ISD::OR && "Unexpected root");

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDValue LHS;
  uint32_t ShiftLHS = 0;
  bool LHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
    return SDValue();

  SDValue RHS;
  uint32_t ShiftRHS = 0;
  bool RHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
    return SDValue();

  // If they're both trying to come from the same side of the register, they're
  // not really an EXTR.
  if (LHSFromHi == RHSFromHi)
    return SDValue();

  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
    return SDValue();

  if (LHSFromHi) {
    std::swap(LHS, RHS);
    std::swap(ShiftLHS, ShiftRHS);
  }

  return DAG.getNode(AArch64ISD::EXTR, DL, VT,
                     LHS, RHS,
                     DAG.getConstant(ShiftRHS, MVT::i64));
}
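
// Illustrative note, not from the original source: on i64,
//   (or (shl %x, 16), (srl %y, 48))
// satisfies the checks above (16 + 48 == 64, one shift from each side), so it
// becomes (AArch64ISD::EXTR %x, %y, #48), which should select to roughly
//   extr  xD, xX, xY, #48
// i.e. the low 16 bits of the result come from the top of %y and the remaining
// bits from the bottom of %x (%x and %y are placeholder values).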

/// Target-specific DAG combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64Subtarget *Subtarget) {

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // Attempt to recognise bitfield-insert operations.
  SDValue Res = tryCombineToBFI(N, DCI, Subtarget);
  if (Res.getNode())
    return Res;

  // Attempt to combine an existing MaskedBFI operation into one with a larger
  // mask.
  Res = tryCombineToLargerBFI(N, DCI, Subtarget);
  if (Res.getNode())
    return Res;

  Res = tryCombineToEXTR(N, DCI);
  if (Res.getNode())
    return Res;

  if (!Subtarget->hasNEON())
    return SDValue();

  // Attempt to use vector immediate-form BSL
  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.

  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::AND)
    return SDValue();

  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() != ISD::AND)
    return SDValue();

  if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    APInt SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
    APInt SplatBits0;
    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
                                      HasAnyUndefs) &&
        !HasAnyUndefs) {
      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
      APInt SplatBits1;
      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
                                        HasAnyUndefs) &&
          !HasAnyUndefs && SplatBits0 == ~SplatBits1) {
        // Canonicalize the vector type to make instruction selection simpler.
        EVT CanonicalVT = VT.is128BitVector() ? MVT::v16i8 : MVT::v8i8;
        SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT,
                                     N0->getOperand(1), N0->getOperand(0),
                                     N1->getOperand(0));
        return DAG.getNode(ISD::BITCAST, DL, VT, Result);
      }
    }
  }

  return SDValue();
}

/// Target-specific DAG combine xforms for ISD::SRA
static SDValue PerformSRACombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // We're looking for an SRA/SHL pair which form an SBFX.

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  if (!isa<ConstantSDNode>(N->getOperand(1)))
    return SDValue();

  uint64_t ExtraSignBits = N->getConstantOperandVal(1);
  SDValue Shift = N->getOperand(0);

  if (Shift.getOpcode() != ISD::SHL)
    return SDValue();

  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
    return SDValue();

  uint64_t BitsOnLeft = Shift->getConstantOperandVal(1);
  uint64_t Width = VT.getSizeInBits() - ExtraSignBits;
  uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft;

  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
    return SDValue();

  return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0),
                     DAG.getConstant(LSB, MVT::i64),
                     DAG.getConstant(LSB + Width - 1, MVT::i64));
}
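
// Illustrative note, not from the original source: for i32,
//   (sra (shl %x, 24), 27)
// gives ExtraSignBits = 27 and BitsOnLeft = 24, so Width = 5 and LSB = 3, and
// the combine above produces (AArch64ISD::SBFX %x, #3, #7), i.e. a
// sign-extension of the five-bit field %x[7:3] (%x is a placeholder value).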

SDValue
AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                         DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default: break;
  case ISD::AND: return PerformANDCombine(N, DCI);
  case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
  case ISD::SRA: return PerformSRACombine(N, DCI);
  }
  return SDValue();
}

bool
AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f16:
  case MVT::f32:
  case MVT::f64:
    return true;
  case MVT::f128:
    return false;
  default:
    break;
  }

  return false;
}

// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue
AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                         const AArch64Subtarget *ST) const {

  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;

  // Note we favor lowering MOVI over MVNI.
  // This has implications on the definition of patterns in TableGen to select
  // BIC immediate instructions but not ORR immediate instructions.
  // If this lowering order is changed, TableGen patterns for BIC immediate and
  // ORR immediate instructions have to be updated.
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      // First attempt to use vector immediate-form MOVI
      EVT NeonMovVT;
      unsigned Imm = 0;
      unsigned OpCmode = 0;

      if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
                            SplatBitSize, DAG, VT.is128BitVector(),
                            Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) {
        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);

        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT,
                                        ImmVal, OpCmodeVal);
          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
        }
      }

      // Then attempt to use vector immediate-form MVNI
      uint64_t NegatedImm = (~SplatBits).getZExtValue();
      if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
                            DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT,
                            Imm, OpCmode)) {
        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT,
                                        ImmVal, OpCmodeVal);
          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
        }
      }

      // Attempt to use vector immediate-form FMOV
      if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) ||
          (VT == MVT::v2f64 && SplatBitSize == 64)) {
        APFloat RealVal(
            SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble,
            SplatBits);
        uint32_t ImmVal;
        if (A64Imms::isFPImm(RealVal, ImmVal)) {
          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
          return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val);
        }
      }
    }
  }
  return SDValue();
}
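
// Illustrative note, not from the original source: in the BUILD_VECTOR
// lowering above, a v4i32 splat of 0x000000ff can be materialised directly by
// the MOVI path, a splat of 0xffffff00 falls through to the MVNI path because
// its bitwise NOT fits the same modified-immediate encoding, and a v4f32 splat
// of 1.0 is handled by the FMOV path since 1.0 is representable as an AArch64
// 8-bit floating-point immediate.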

AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 'w': // An FP/SIMD vector register
      return C_RegisterClass;
    case 'I': // Constant that can be used with an ADD instruction
    case 'J': // Constant that can be used with a SUB instruction
    case 'K': // Constant that can be used with a 32-bit logical instruction
    case 'L': // Constant that can be used with a 64-bit logical instruction
    case 'M': // Constant that can be used as a 32-bit MOV immediate
    case 'N': // Constant that can be used as a 64-bit MOV immediate
    case 'Y': // Floating point constant zero
    case 'Z': // Integer constant zero
      return C_Other;
    case 'Q': // A memory reference with base register and no offset
      return C_Memory;
    case 'S': // A symbolic address
      return C_Other;
    }
  }

  // FIXME: Ump, Utf, Usa, Ush
  // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes,
  //      whatever they may be
  // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be
  // Usa: An absolute symbolic address
  // Ush: The high part (bits 32:12) of a pc-relative symbolic address
  assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa"
         && Constraint != "Ush" && "Unimplemented constraints");

  return TargetLowering::getConstraintType(Constraint);
}

TargetLowering::ConstraintWeight
AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info,
                                                const char *Constraint) const {

  llvm_unreachable("Constraint weight unimplemented");
}
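
// Illustrative note, not from the original source: with inline assembly along
// the lines of
//   asm("add %w0, %w1, %2" : "=r"(Res) : "r"(A), "I"(Imm));
// (Res, A and Imm being placeholder names), the "I" operand reaches
// LowerAsmOperandForConstraint below, which only accepts it if Imm fits a
// 12-bit unsigned ADD immediate (0..4095); a "K" operand must be a valid
// 32-bit logical immediate, and "Z" must be the constant zero.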

void
AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                    std::string &Constraint,
                                                    std::vector<SDValue> &Ops,
                                                    SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  // Only length 1 constraints are C_Other.
  if (Constraint.size() != 1) return;

  // Only C_Other constraints get lowered like this. That means constants for
  // us, so return early if there's no hope the constraint can be lowered.

  switch (Constraint[0]) {
  default: break;
  case 'I': case 'J': case 'K': case 'L':
  case 'M': case 'N': case 'Z': {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    uint64_t CVal = C->getZExtValue();
    uint32_t Bits;

    switch (Constraint[0]) {
    default:
      // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J'
      // is a peculiarly useless SUB constraint.
      llvm_unreachable("Unimplemented C_Other constraint");
    case 'I':
      if (CVal <= 0xfff)
        break;
      return;
    case 'K':
      if (A64Imms::isLogicalImm(32, CVal, Bits))
        break;
      return;
    case 'L':
      if (A64Imms::isLogicalImm(64, CVal, Bits))
        break;
      return;
    case 'Z':
      if (CVal == 0)
        break;
      return;
    }

    Result = DAG.getTargetConstant(CVal, Op.getValueType());
    break;
  }
  case 'S': {
    // An absolute symbolic address or label reference.
    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
                                          GA->getValueType(0));
    } else if (const BlockAddressSDNode *BA
                 = dyn_cast<BlockAddressSDNode>(Op)) {
      Result = DAG.getTargetBlockAddress(BA->getBlockAddress(),
                                         BA->getValueType(0));
    } else if (const ExternalSymbolSDNode *ES
                 = dyn_cast<ExternalSymbolSDNode>(Op)) {
      Result = DAG.getTargetExternalSymbol(ES->getSymbol(),
                                           ES->getValueType(0));
    } else
      return;
    break;
  }
  case 'Y':
    if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
      if (CFP->isExactlyValue(0.0)) {
        Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0));
        break;
      }
    }
    return;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  // It's an unknown constraint for us. Let generic code have a go.
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::pair<unsigned, const TargetRegisterClass*>
AArch64TargetLowering::getRegForInlineAsmConstraint(
                                                  const std::string &Constraint,
                                                  MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'r':
      if (VT.getSizeInBits() <= 32)
        return std::make_pair(0U, &AArch64::GPR32RegClass);
      else if (VT == MVT::i64)
        return std::make_pair(0U, &AArch64::GPR64RegClass);
      break;
    case 'w':
      if (VT == MVT::f16)
        return std::make_pair(0U, &AArch64::FPR16RegClass);
      else if (VT == MVT::f32)
        return std::make_pair(0U, &AArch64::FPR32RegClass);
      else if (VT == MVT::f64)
        return std::make_pair(0U, &AArch64::FPR64RegClass);
      else if (VT.getSizeInBits() == 64)
        return std::make_pair(0U, &AArch64::VPR64RegClass);
      else if (VT == MVT::f128)
        return std::make_pair(0U, &AArch64::FPR128RegClass);
      else if (VT.getSizeInBits() == 128)
        return std::make_pair(0U, &AArch64::VPR128RegClass);
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}