AArch64ISelLowering.cpp revision a0ec3f9b7b826b9b40b80199923b664bad808cce
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that AArch64 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "aarch64-isel" 16#include "AArch64.h" 17#include "AArch64ISelLowering.h" 18#include "AArch64MachineFunctionInfo.h" 19#include "AArch64TargetMachine.h" 20#include "AArch64TargetObjectFile.h" 21#include "Utils/AArch64BaseInfo.h" 22#include "llvm/CodeGen/Analysis.h" 23#include "llvm/CodeGen/CallingConvLower.h" 24#include "llvm/CodeGen/MachineFrameInfo.h" 25#include "llvm/CodeGen/MachineInstrBuilder.h" 26#include "llvm/CodeGen/MachineRegisterInfo.h" 27#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" 28#include "llvm/IR/CallingConv.h" 29 30using namespace llvm; 31 32static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) { 33 const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>(); 34 35 if (Subtarget->isTargetLinux()) 36 return new AArch64LinuxTargetObjectFile(); 37 if (Subtarget->isTargetELF()) 38 return new TargetLoweringObjectFileELF(); 39 llvm_unreachable("unknown subtarget type"); 40} 41 42AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) 43 : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) { 44 45 // SIMD compares set the entire lane's bits to 1 46 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 47 48 // Scalar register <-> type mapping 49 addRegisterClass(MVT::i32, &AArch64::GPR32RegClass); 50 addRegisterClass(MVT::i64, &AArch64::GPR64RegClass); 51 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); 52 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); 53 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); 54 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); 55 56 computeRegisterProperties(); 57 58 // We combine OR nodes for bitfield and NEON BSL operations. 59 setTargetDAGCombine(ISD::OR); 60 61 setTargetDAGCombine(ISD::AND); 62 setTargetDAGCombine(ISD::SRA); 63 64 // AArch64 does not have i1 loads, or much of anything for i1 really. 65 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 66 setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); 67 setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); 68 69 setStackPointerRegisterToSaveRestore(AArch64::XSP); 70 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 71 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 72 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 73 74 // We'll lower globals to wrappers for selection. 75 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 76 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 77 78 // A64 instructions have the comparison predicate attached to the user of the 79 // result, but having a separate comparison is valuable for matching. 
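  // Hence the custom lowerings below: BR_CC, SELECT_CC and SETCC are (I
  // believe) best thought of as target nodes that keep the comparison and its
  // A64 condition code as separate operands, so the flag-setting instruction
  // can still be matched on its own.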
80 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 81 setOperationAction(ISD::BR_CC, MVT::i64, Custom); 82 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 83 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 84 85 setOperationAction(ISD::SELECT, MVT::i32, Custom); 86 setOperationAction(ISD::SELECT, MVT::i64, Custom); 87 setOperationAction(ISD::SELECT, MVT::f32, Custom); 88 setOperationAction(ISD::SELECT, MVT::f64, Custom); 89 90 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 91 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); 92 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 93 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 94 95 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 96 97 setOperationAction(ISD::SETCC, MVT::i32, Custom); 98 setOperationAction(ISD::SETCC, MVT::i64, Custom); 99 setOperationAction(ISD::SETCC, MVT::f32, Custom); 100 setOperationAction(ISD::SETCC, MVT::f64, Custom); 101 102 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 103 setOperationAction(ISD::JumpTable, MVT::i32, Custom); 104 setOperationAction(ISD::JumpTable, MVT::i64, Custom); 105 106 setOperationAction(ISD::VASTART, MVT::Other, Custom); 107 setOperationAction(ISD::VACOPY, MVT::Other, Custom); 108 setOperationAction(ISD::VAEND, MVT::Other, Expand); 109 setOperationAction(ISD::VAARG, MVT::Other, Expand); 110 111 setOperationAction(ISD::BlockAddress, MVT::i64, Custom); 112 113 setOperationAction(ISD::ROTL, MVT::i32, Expand); 114 setOperationAction(ISD::ROTL, MVT::i64, Expand); 115 116 setOperationAction(ISD::UREM, MVT::i32, Expand); 117 setOperationAction(ISD::UREM, MVT::i64, Expand); 118 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 119 setOperationAction(ISD::UDIVREM, MVT::i64, Expand); 120 121 setOperationAction(ISD::SREM, MVT::i32, Expand); 122 setOperationAction(ISD::SREM, MVT::i64, Expand); 123 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 124 setOperationAction(ISD::SDIVREM, MVT::i64, Expand); 125 126 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 127 setOperationAction(ISD::CTPOP, MVT::i64, Expand); 128 129 // Legal floating-point operations. 130 setOperationAction(ISD::FABS, MVT::f32, Legal); 131 setOperationAction(ISD::FABS, MVT::f64, Legal); 132 133 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 134 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 135 136 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 137 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 138 139 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 140 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 141 142 setOperationAction(ISD::FNEG, MVT::f32, Legal); 143 setOperationAction(ISD::FNEG, MVT::f64, Legal); 144 145 setOperationAction(ISD::FRINT, MVT::f32, Legal); 146 setOperationAction(ISD::FRINT, MVT::f64, Legal); 147 148 setOperationAction(ISD::FSQRT, MVT::f32, Legal); 149 setOperationAction(ISD::FSQRT, MVT::f64, Legal); 150 151 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 152 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 153 154 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 155 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 156 setOperationAction(ISD::ConstantFP, MVT::f128, Legal); 157 158 // Illegal floating-point operations. 
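  // "Expand" here generally means either a generic DAG sequence (e.g.
  // FCOPYSIGN should become bit-twiddling on the raw representation) or a
  // libcall (e.g. FSIN on f32 is expected to end up as a call to sinf).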
159 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 160 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 161 162 setOperationAction(ISD::FCOS, MVT::f32, Expand); 163 setOperationAction(ISD::FCOS, MVT::f64, Expand); 164 165 setOperationAction(ISD::FEXP, MVT::f32, Expand); 166 setOperationAction(ISD::FEXP, MVT::f64, Expand); 167 168 setOperationAction(ISD::FEXP2, MVT::f32, Expand); 169 setOperationAction(ISD::FEXP2, MVT::f64, Expand); 170 171 setOperationAction(ISD::FLOG, MVT::f32, Expand); 172 setOperationAction(ISD::FLOG, MVT::f64, Expand); 173 174 setOperationAction(ISD::FLOG2, MVT::f32, Expand); 175 setOperationAction(ISD::FLOG2, MVT::f64, Expand); 176 177 setOperationAction(ISD::FLOG10, MVT::f32, Expand); 178 setOperationAction(ISD::FLOG10, MVT::f64, Expand); 179 180 setOperationAction(ISD::FPOW, MVT::f32, Expand); 181 setOperationAction(ISD::FPOW, MVT::f64, Expand); 182 183 setOperationAction(ISD::FPOWI, MVT::f32, Expand); 184 setOperationAction(ISD::FPOWI, MVT::f64, Expand); 185 186 setOperationAction(ISD::FREM, MVT::f32, Expand); 187 setOperationAction(ISD::FREM, MVT::f64, Expand); 188 189 setOperationAction(ISD::FSIN, MVT::f32, Expand); 190 setOperationAction(ISD::FSIN, MVT::f64, Expand); 191 192 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 193 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 194 195 // Virtually no operation on f128 is legal, but LLVM can't expand them when 196 // there's a valid register class, so we need custom operations in most cases. 197 setOperationAction(ISD::FABS, MVT::f128, Expand); 198 setOperationAction(ISD::FADD, MVT::f128, Custom); 199 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); 200 setOperationAction(ISD::FCOS, MVT::f128, Expand); 201 setOperationAction(ISD::FDIV, MVT::f128, Custom); 202 setOperationAction(ISD::FMA, MVT::f128, Expand); 203 setOperationAction(ISD::FMUL, MVT::f128, Custom); 204 setOperationAction(ISD::FNEG, MVT::f128, Expand); 205 setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand); 206 setOperationAction(ISD::FP_ROUND, MVT::f128, Expand); 207 setOperationAction(ISD::FPOW, MVT::f128, Expand); 208 setOperationAction(ISD::FREM, MVT::f128, Expand); 209 setOperationAction(ISD::FRINT, MVT::f128, Expand); 210 setOperationAction(ISD::FSIN, MVT::f128, Expand); 211 setOperationAction(ISD::FSINCOS, MVT::f128, Expand); 212 setOperationAction(ISD::FSQRT, MVT::f128, Expand); 213 setOperationAction(ISD::FSUB, MVT::f128, Custom); 214 setOperationAction(ISD::FTRUNC, MVT::f128, Expand); 215 setOperationAction(ISD::SETCC, MVT::f128, Custom); 216 setOperationAction(ISD::BR_CC, MVT::f128, Custom); 217 setOperationAction(ISD::SELECT, MVT::f128, Expand); 218 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); 219 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); 220 221 // Lowering for many of the conversions is actually specified by the non-f128 222 // type. The LowerXXX function will be trivial when f128 isn't involved. 
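  // For example, (i32 (fp_to_sint f128 %x)) should end up as a __fixtfsi
  // libcall, whereas (i32 (fp_to_sint f64 %x)) is left untouched and selects
  // to a single FCVTZS.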
223 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 224 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); 225 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); 226 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 227 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); 228 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); 229 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 230 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); 231 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); 232 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 233 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 234 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); 235 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 236 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); 237 238 // This prevents LLVM trying to compress double constants into a floating 239 // constant-pool entry and trying to load from there. It's of doubtful benefit 240 // for A64: we'd need LDR followed by FCVT, I believe. 241 setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); 242 setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); 243 setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); 244 245 setTruncStoreAction(MVT::f128, MVT::f64, Expand); 246 setTruncStoreAction(MVT::f128, MVT::f32, Expand); 247 setTruncStoreAction(MVT::f128, MVT::f16, Expand); 248 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 249 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 250 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 251 252 setExceptionPointerRegister(AArch64::X0); 253 setExceptionSelectorRegister(AArch64::X1); 254} 255 256EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { 257 // It's reasonably important that this value matches the "natural" legal 258 // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself 259 // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64). 260 if (!VT.isVector()) return MVT::i32; 261 return VT.changeVectorElementTypeToInteger(); 262} 263 264static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord, 265 unsigned &LdrOpc, 266 unsigned &StrOpc) { 267 static unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword, 268 AArch64::LDXR_word, AArch64::LDXR_dword}; 269 static unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword, 270 AArch64::LDAXR_word, AArch64::LDAXR_dword}; 271 static unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword, 272 AArch64::STXR_word, AArch64::STXR_dword}; 273 static unsigned StoreRels[] = {AArch64::STLXR_byte, AArch64::STLXR_hword, 274 AArch64::STLXR_word, AArch64::STLXR_dword}; 275 276 unsigned *LoadOps, *StoreOps; 277 if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent) 278 LoadOps = LoadAcqs; 279 else 280 LoadOps = LoadBares; 281 282 if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) 283 StoreOps = StoreRels; 284 else 285 StoreOps = StoreBares; 286 287 assert(isPowerOf2_32(Size) && Size <= 8 && 288 "unsupported size for atomic binary op!"); 289 290 LdrOpc = LoadOps[Log2_32(Size)]; 291 StrOpc = StoreOps[Log2_32(Size)]; 292} 293 294MachineBasicBlock * 295AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, 296 unsigned Size, 297 unsigned BinOpcode) const { 298 // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 
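  // When BinOpcode is 0 no arithmetic is inserted and incr is stored back
  // directly inside the load/store-exclusive loop, which is exactly an atomic
  // exchange.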
299 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 300 301 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 302 MachineFunction *MF = BB->getParent(); 303 MachineFunction::iterator It = BB; 304 ++It; 305 306 unsigned dest = MI->getOperand(0).getReg(); 307 unsigned ptr = MI->getOperand(1).getReg(); 308 unsigned incr = MI->getOperand(2).getReg(); 309 AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm()); 310 DebugLoc dl = MI->getDebugLoc(); 311 312 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 313 314 unsigned ldrOpc, strOpc; 315 getExclusiveOperation(Size, Ord, ldrOpc, strOpc); 316 317 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 318 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 319 MF->insert(It, loopMBB); 320 MF->insert(It, exitMBB); 321 322 // Transfer the remainder of BB and its successor edges to exitMBB. 323 exitMBB->splice(exitMBB->begin(), BB, 324 llvm::next(MachineBasicBlock::iterator(MI)), 325 BB->end()); 326 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 327 328 const TargetRegisterClass *TRC 329 = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; 330 unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC); 331 332 // thisMBB: 333 // ... 334 // fallthrough --> loopMBB 335 BB->addSuccessor(loopMBB); 336 337 // loopMBB: 338 // ldxr dest, ptr 339 // <binop> scratch, dest, incr 340 // stxr stxr_status, scratch, ptr 341 // cbnz stxr_status, loopMBB 342 // fallthrough --> exitMBB 343 BB = loopMBB; 344 BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); 345 if (BinOpcode) { 346 // All arithmetic operations we'll be creating are designed to take an extra 347 // shift or extend operand, which we can conveniently set to zero. 348 349 // Operand order needs to go the other way for NAND. 350 if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl) 351 BuildMI(BB, dl, TII->get(BinOpcode), scratch) 352 .addReg(incr).addReg(dest).addImm(0); 353 else 354 BuildMI(BB, dl, TII->get(BinOpcode), scratch) 355 .addReg(dest).addReg(incr).addImm(0); 356 } 357 358 // From the stxr, the register is GPR32; from the cmp it's GPR32wsp 359 unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 360 MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); 361 362 BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr); 363 BuildMI(BB, dl, TII->get(AArch64::CBNZw)) 364 .addReg(stxr_status).addMBB(loopMBB); 365 366 BB->addSuccessor(loopMBB); 367 BB->addSuccessor(exitMBB); 368 369 // exitMBB: 370 // ... 371 BB = exitMBB; 372 373 MI->eraseFromParent(); // The instruction is gone now. 
374 375 return BB; 376} 377 378MachineBasicBlock * 379AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI, 380 MachineBasicBlock *BB, 381 unsigned Size, 382 unsigned CmpOp, 383 A64CC::CondCodes Cond) const { 384 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 385 386 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 387 MachineFunction *MF = BB->getParent(); 388 MachineFunction::iterator It = BB; 389 ++It; 390 391 unsigned dest = MI->getOperand(0).getReg(); 392 unsigned ptr = MI->getOperand(1).getReg(); 393 unsigned incr = MI->getOperand(2).getReg(); 394 AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm()); 395 396 unsigned oldval = dest; 397 DebugLoc dl = MI->getDebugLoc(); 398 399 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 400 const TargetRegisterClass *TRC, *TRCsp; 401 if (Size == 8) { 402 TRC = &AArch64::GPR64RegClass; 403 TRCsp = &AArch64::GPR64xspRegClass; 404 } else { 405 TRC = &AArch64::GPR32RegClass; 406 TRCsp = &AArch64::GPR32wspRegClass; 407 } 408 409 unsigned ldrOpc, strOpc; 410 getExclusiveOperation(Size, Ord, ldrOpc, strOpc); 411 412 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 413 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 414 MF->insert(It, loopMBB); 415 MF->insert(It, exitMBB); 416 417 // Transfer the remainder of BB and its successor edges to exitMBB. 418 exitMBB->splice(exitMBB->begin(), BB, 419 llvm::next(MachineBasicBlock::iterator(MI)), 420 BB->end()); 421 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 422 423 unsigned scratch = MRI.createVirtualRegister(TRC); 424 MRI.constrainRegClass(scratch, TRCsp); 425 426 // thisMBB: 427 // ... 428 // fallthrough --> loopMBB 429 BB->addSuccessor(loopMBB); 430 431 // loopMBB: 432 // ldxr dest, ptr 433 // cmp incr, dest (, sign extend if necessary) 434 // csel scratch, dest, incr, cond 435 // stxr stxr_status, scratch, ptr 436 // cbnz stxr_status, loopMBB 437 // fallthrough --> exitMBB 438 BB = loopMBB; 439 BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); 440 441 // Build compare and cmov instructions. 442 MRI.constrainRegClass(incr, TRCsp); 443 BuildMI(BB, dl, TII->get(CmpOp)) 444 .addReg(incr).addReg(oldval).addImm(0); 445 446 BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc), 447 scratch) 448 .addReg(oldval).addReg(incr).addImm(Cond); 449 450 unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 451 MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); 452 453 BuildMI(BB, dl, TII->get(strOpc), stxr_status) 454 .addReg(scratch).addReg(ptr); 455 BuildMI(BB, dl, TII->get(AArch64::CBNZw)) 456 .addReg(stxr_status).addMBB(loopMBB); 457 458 BB->addSuccessor(loopMBB); 459 BB->addSuccessor(exitMBB); 460 461 // exitMBB: 462 // ... 463 BB = exitMBB; 464 465 MI->eraseFromParent(); // The instruction is gone now. 
466 467 return BB; 468} 469 470 MachineBasicBlock * 471 AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI, 472 MachineBasicBlock *BB, 473 unsigned Size) const { 474 unsigned dest = MI->getOperand(0).getReg(); 475 unsigned ptr = MI->getOperand(1).getReg(); 476 unsigned oldval = MI->getOperand(2).getReg(); 477 unsigned newval = MI->getOperand(3).getReg(); 478 AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm()); 479 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 480 DebugLoc dl = MI->getDebugLoc(); 481 482 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 483 const TargetRegisterClass *TRCsp; 484 TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass; 485 486 unsigned ldrOpc, strOpc; 487 getExclusiveOperation(Size, Ord, ldrOpc, strOpc); 488 489 MachineFunction *MF = BB->getParent(); 490 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 491 MachineFunction::iterator It = BB; 492 ++It; // insert the new blocks after the current block 493 494 MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); 495 MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); 496 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 497 MF->insert(It, loop1MBB); 498 MF->insert(It, loop2MBB); 499 MF->insert(It, exitMBB); 500 501 // Transfer the remainder of BB and its successor edges to exitMBB. 502 exitMBB->splice(exitMBB->begin(), BB, 503 llvm::next(MachineBasicBlock::iterator(MI)), 504 BB->end()); 505 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 506 507 // thisMBB: 508 // ... 509 // fallthrough --> loop1MBB 510 BB->addSuccessor(loop1MBB); 511 512 // loop1MBB: 513 // ldxr dest, [ptr] 514 // cmp dest, oldval 515 // b.ne exitMBB 516 BB = loop1MBB; 517 BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); 518 519 unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl; 520 MRI.constrainRegClass(dest, TRCsp); 521 BuildMI(BB, dl, TII->get(CmpOp)) 522 .addReg(dest).addReg(oldval).addImm(0); 523 BuildMI(BB, dl, TII->get(AArch64::Bcc)) 524 .addImm(A64CC::NE).addMBB(exitMBB); 525 BB->addSuccessor(loop2MBB); 526 BB->addSuccessor(exitMBB); 527 528 // loop2MBB: 529 // strex stxr_status, newval, [ptr] 530 // cbnz stxr_status, loop1MBB 531 BB = loop2MBB; 532 unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 533 MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); 534 535 BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr); 536 BuildMI(BB, dl, TII->get(AArch64::CBNZw)) 537 .addReg(stxr_status).addMBB(loop1MBB); 538 BB->addSuccessor(loop1MBB); 539 BB->addSuccessor(exitMBB); 540 541 // exitMBB: 542 // ... 543 BB = exitMBB; 544 545 MI->eraseFromParent(); // The instruction is gone now. 546 547 return BB; 548} 549 550 MachineBasicBlock * 551 AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, 552 MachineBasicBlock *MBB) const { 553 // We materialise the F128CSEL pseudo-instruction using conditional branches 554 // and loads, giving an instruction sequence like: 555 // str q0, [sp] 556 // b.ne IfTrue 557 // b Finish 558 // IfTrue: 559 // str q1, [sp] 560 // Finish: 561 // ldr q0, [sp] 562 // 563 // Using virtual registers would probably not be beneficial since COPY 564 // instructions are expensive for f128 (there's no actual instruction to 565 // implement them). 566 // 567 // An alternative would be to do an integer-CSEL on some address.
E.g.: 568 // mov x0, sp 569 // add x1, sp, #16 570 // str q0, [x0] 571 // str q1, [x1] 572 // csel x0, x0, x1, ne 573 // ldr q0, [x0] 574 // 575 // It's unclear which approach is actually optimal. 576 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 577 MachineFunction *MF = MBB->getParent(); 578 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 579 DebugLoc DL = MI->getDebugLoc(); 580 MachineFunction::iterator It = MBB; 581 ++It; 582 583 unsigned DestReg = MI->getOperand(0).getReg(); 584 unsigned IfTrueReg = MI->getOperand(1).getReg(); 585 unsigned IfFalseReg = MI->getOperand(2).getReg(); 586 unsigned CondCode = MI->getOperand(3).getImm(); 587 bool NZCVKilled = MI->getOperand(4).isKill(); 588 589 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); 590 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); 591 MF->insert(It, TrueBB); 592 MF->insert(It, EndBB); 593 594 // Transfer rest of current basic-block to EndBB 595 EndBB->splice(EndBB->begin(), MBB, 596 llvm::next(MachineBasicBlock::iterator(MI)), 597 MBB->end()); 598 EndBB->transferSuccessorsAndUpdatePHIs(MBB); 599 600 // We need somewhere to store the f128 value needed. 601 int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16); 602 603 // [... start of incoming MBB ...] 604 // str qIFFALSE, [sp] 605 // b.cc IfTrue 606 // b Done 607 BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR)) 608 .addReg(IfFalseReg) 609 .addFrameIndex(ScratchFI) 610 .addImm(0); 611 BuildMI(MBB, DL, TII->get(AArch64::Bcc)) 612 .addImm(CondCode) 613 .addMBB(TrueBB); 614 BuildMI(MBB, DL, TII->get(AArch64::Bimm)) 615 .addMBB(EndBB); 616 MBB->addSuccessor(TrueBB); 617 MBB->addSuccessor(EndBB); 618 619 // IfTrue: 620 // str qIFTRUE, [sp] 621 BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR)) 622 .addReg(IfTrueReg) 623 .addFrameIndex(ScratchFI) 624 .addImm(0); 625 626 // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the 627 // blocks. 628 TrueBB->addSuccessor(EndBB); 629 630 // Done: 631 // ldr qDEST, [sp] 632 // [... rest of incoming MBB ...] 
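  // If the pseudo-instruction didn't kill NZCV then code spliced into EndBB
  // may still read the flags, so they must be recorded as live into that
  // block; the conditional branch above only reads them.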
633 if (!NZCVKilled) 634 EndBB->addLiveIn(AArch64::NZCV); 635 MachineInstr *StartOfEnd = EndBB->begin(); 636 BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg) 637 .addFrameIndex(ScratchFI) 638 .addImm(0); 639 640 MI->eraseFromParent(); 641 return EndBB; 642} 643 644MachineBasicBlock * 645AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 646 MachineBasicBlock *MBB) const { 647 switch (MI->getOpcode()) { 648 default: llvm_unreachable("Unhandled instruction with custom inserter"); 649 case AArch64::F128CSEL: 650 return EmitF128CSEL(MI, MBB); 651 case AArch64::ATOMIC_LOAD_ADD_I8: 652 return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl); 653 case AArch64::ATOMIC_LOAD_ADD_I16: 654 return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl); 655 case AArch64::ATOMIC_LOAD_ADD_I32: 656 return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl); 657 case AArch64::ATOMIC_LOAD_ADD_I64: 658 return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl); 659 660 case AArch64::ATOMIC_LOAD_SUB_I8: 661 return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl); 662 case AArch64::ATOMIC_LOAD_SUB_I16: 663 return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl); 664 case AArch64::ATOMIC_LOAD_SUB_I32: 665 return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl); 666 case AArch64::ATOMIC_LOAD_SUB_I64: 667 return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl); 668 669 case AArch64::ATOMIC_LOAD_AND_I8: 670 return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl); 671 case AArch64::ATOMIC_LOAD_AND_I16: 672 return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl); 673 case AArch64::ATOMIC_LOAD_AND_I32: 674 return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl); 675 case AArch64::ATOMIC_LOAD_AND_I64: 676 return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl); 677 678 case AArch64::ATOMIC_LOAD_OR_I8: 679 return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl); 680 case AArch64::ATOMIC_LOAD_OR_I16: 681 return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl); 682 case AArch64::ATOMIC_LOAD_OR_I32: 683 return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl); 684 case AArch64::ATOMIC_LOAD_OR_I64: 685 return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl); 686 687 case AArch64::ATOMIC_LOAD_XOR_I8: 688 return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl); 689 case AArch64::ATOMIC_LOAD_XOR_I16: 690 return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl); 691 case AArch64::ATOMIC_LOAD_XOR_I32: 692 return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl); 693 case AArch64::ATOMIC_LOAD_XOR_I64: 694 return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl); 695 696 case AArch64::ATOMIC_LOAD_NAND_I8: 697 return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl); 698 case AArch64::ATOMIC_LOAD_NAND_I16: 699 return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl); 700 case AArch64::ATOMIC_LOAD_NAND_I32: 701 return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl); 702 case AArch64::ATOMIC_LOAD_NAND_I64: 703 return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl); 704 705 case AArch64::ATOMIC_LOAD_MIN_I8: 706 return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT); 707 case AArch64::ATOMIC_LOAD_MIN_I16: 708 return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT); 709 case AArch64::ATOMIC_LOAD_MIN_I32: 710 return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT); 711 case AArch64::ATOMIC_LOAD_MIN_I64: 712 return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT); 713 714 case AArch64::ATOMIC_LOAD_MAX_I8: 715 return 
emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT); 716 case AArch64::ATOMIC_LOAD_MAX_I16: 717 return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT); 718 case AArch64::ATOMIC_LOAD_MAX_I32: 719 return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT); 720 case AArch64::ATOMIC_LOAD_MAX_I64: 721 return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT); 722 723 case AArch64::ATOMIC_LOAD_UMIN_I8: 724 return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI); 725 case AArch64::ATOMIC_LOAD_UMIN_I16: 726 return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI); 727 case AArch64::ATOMIC_LOAD_UMIN_I32: 728 return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI); 729 case AArch64::ATOMIC_LOAD_UMIN_I64: 730 return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI); 731 732 case AArch64::ATOMIC_LOAD_UMAX_I8: 733 return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO); 734 case AArch64::ATOMIC_LOAD_UMAX_I16: 735 return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO); 736 case AArch64::ATOMIC_LOAD_UMAX_I32: 737 return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO); 738 case AArch64::ATOMIC_LOAD_UMAX_I64: 739 return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO); 740 741 case AArch64::ATOMIC_SWAP_I8: 742 return emitAtomicBinary(MI, MBB, 1, 0); 743 case AArch64::ATOMIC_SWAP_I16: 744 return emitAtomicBinary(MI, MBB, 2, 0); 745 case AArch64::ATOMIC_SWAP_I32: 746 return emitAtomicBinary(MI, MBB, 4, 0); 747 case AArch64::ATOMIC_SWAP_I64: 748 return emitAtomicBinary(MI, MBB, 8, 0); 749 750 case AArch64::ATOMIC_CMP_SWAP_I8: 751 return emitAtomicCmpSwap(MI, MBB, 1); 752 case AArch64::ATOMIC_CMP_SWAP_I16: 753 return emitAtomicCmpSwap(MI, MBB, 2); 754 case AArch64::ATOMIC_CMP_SWAP_I32: 755 return emitAtomicCmpSwap(MI, MBB, 4); 756 case AArch64::ATOMIC_CMP_SWAP_I64: 757 return emitAtomicCmpSwap(MI, MBB, 8); 758 } 759} 760 761 762const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { 763 switch (Opcode) { 764 case AArch64ISD::BR_CC: return "AArch64ISD::BR_CC"; 765 case AArch64ISD::Call: return "AArch64ISD::Call"; 766 case AArch64ISD::FPMOV: return "AArch64ISD::FPMOV"; 767 case AArch64ISD::GOTLoad: return "AArch64ISD::GOTLoad"; 768 case AArch64ISD::BFI: return "AArch64ISD::BFI"; 769 case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; 770 case AArch64ISD::Ret: return "AArch64ISD::Ret"; 771 case AArch64ISD::SBFX: return "AArch64ISD::SBFX"; 772 case AArch64ISD::SELECT_CC: return "AArch64ISD::SELECT_CC"; 773 case AArch64ISD::SETCC: return "AArch64ISD::SETCC"; 774 case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; 775 case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; 776 case AArch64ISD::TLSDESCCALL: return "AArch64ISD::TLSDESCCALL"; 777 case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; 778 case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall"; 779 780 default: return NULL; 781 } 782} 783 784static const uint16_t AArch64FPRArgRegs[] = { 785 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 786 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7 787}; 788static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs); 789 790static const uint16_t AArch64ArgRegs[] = { 791 AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, 792 AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 793}; 794static const unsigned NumArgRegs = 
llvm::array_lengthof(AArch64ArgRegs); 795 796static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT, 797 CCValAssign::LocInfo LocInfo, 798 ISD::ArgFlagsTy ArgFlags, CCState &State) { 799 // Mark all remaining general purpose registers as allocated. We don't 800 // backtrack: if (for example) an i128 gets put on the stack, no subsequent 801 // i64 will go in registers (C.11). 802 for (unsigned i = 0; i < NumArgRegs; ++i) 803 State.AllocateReg(AArch64ArgRegs[i]); 804 805 return false; 806} 807 808#include "AArch64GenCallingConv.inc" 809 810CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 811 812 switch(CC) { 813 default: llvm_unreachable("Unsupported calling convention"); 814 case CallingConv::Fast: 815 case CallingConv::C: 816 return CC_A64_APCS; 817 } 818} 819 820void 821AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, 822 SDLoc DL, SDValue &Chain) const { 823 MachineFunction &MF = DAG.getMachineFunction(); 824 MachineFrameInfo *MFI = MF.getFrameInfo(); 825 AArch64MachineFunctionInfo *FuncInfo 826 = MF.getInfo<AArch64MachineFunctionInfo>(); 827 828 SmallVector<SDValue, 8> MemOps; 829 830 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs, 831 NumArgRegs); 832 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs, 833 NumFPRArgRegs); 834 835 unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR); 836 int GPRIdx = 0; 837 if (GPRSaveSize != 0) { 838 GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); 839 840 SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); 841 842 for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) { 843 unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass); 844 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 845 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, 846 MachinePointerInfo::getStack(i * 8), 847 false, false, 0); 848 MemOps.push_back(Store); 849 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, 850 DAG.getConstant(8, getPointerTy())); 851 } 852 } 853 854 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 855 int FPRIdx = 0; 856 if (FPRSaveSize != 0) { 857 FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); 858 859 SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); 860 861 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 862 unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i], 863 &AArch64::FPR128RegClass); 864 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 865 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, 866 MachinePointerInfo::getStack(i * 16), 867 false, false, 0); 868 MemOps.push_back(Store); 869 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, 870 DAG.getConstant(16, getPointerTy())); 871 } 872 } 873 874 int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true); 875 876 FuncInfo->setVariadicStackIdx(StackIdx); 877 FuncInfo->setVariadicGPRIdx(GPRIdx); 878 FuncInfo->setVariadicGPRSize(GPRSaveSize); 879 FuncInfo->setVariadicFPRIdx(FPRIdx); 880 FuncInfo->setVariadicFPRSize(FPRSaveSize); 881 882 if (!MemOps.empty()) { 883 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], 884 MemOps.size()); 885 } 886} 887 888 889SDValue 890AArch64TargetLowering::LowerFormalArguments(SDValue Chain, 891 CallingConv::ID CallConv, bool isVarArg, 892 const SmallVectorImpl<ISD::InputArg> &Ins, 893 SDLoc dl, SelectionDAG &DAG, 894 SmallVectorImpl<SDValue> &InVals) const { 895 MachineFunction &MF = 
DAG.getMachineFunction(); 896 AArch64MachineFunctionInfo *FuncInfo 897 = MF.getInfo<AArch64MachineFunctionInfo>(); 898 MachineFrameInfo *MFI = MF.getFrameInfo(); 899 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 900 901 SmallVector<CCValAssign, 16> ArgLocs; 902 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 903 getTargetMachine(), ArgLocs, *DAG.getContext()); 904 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 905 906 SmallVector<SDValue, 16> ArgValues; 907 908 SDValue ArgValue; 909 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 910 CCValAssign &VA = ArgLocs[i]; 911 ISD::ArgFlagsTy Flags = Ins[i].Flags; 912 913 if (Flags.isByVal()) { 914 // Byval is used for small structs and HFAs in the PCS, but the system 915 // should work in a non-compliant manner for larger structs. 916 EVT PtrTy = getPointerTy(); 917 int Size = Flags.getByValSize(); 918 unsigned NumRegs = (Size + 7) / 8; 919 920 unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs, 921 VA.getLocMemOffset(), 922 false); 923 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); 924 InVals.push_back(FrameIdxN); 925 926 continue; 927 } else if (VA.isRegLoc()) { 928 MVT RegVT = VA.getLocVT(); 929 const TargetRegisterClass *RC = getRegClassFor(RegVT); 930 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 931 932 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 933 } else { // VA.isRegLoc() 934 assert(VA.isMemLoc()); 935 936 int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 937 VA.getLocMemOffset(), true); 938 939 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 940 ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, 941 MachinePointerInfo::getFixedStack(FI), 942 false, false, false, 0); 943 944 945 } 946 947 switch (VA.getLocInfo()) { 948 default: llvm_unreachable("Unknown loc info!"); 949 case CCValAssign::Full: break; 950 case CCValAssign::BCvt: 951 ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue); 952 break; 953 case CCValAssign::SExt: 954 case CCValAssign::ZExt: 955 case CCValAssign::AExt: { 956 unsigned DestSize = VA.getValVT().getSizeInBits(); 957 unsigned DestSubReg; 958 959 switch (DestSize) { 960 case 8: DestSubReg = AArch64::sub_8; break; 961 case 16: DestSubReg = AArch64::sub_16; break; 962 case 32: DestSubReg = AArch64::sub_32; break; 963 case 64: DestSubReg = AArch64::sub_64; break; 964 default: llvm_unreachable("Unexpected argument promotion"); 965 } 966 967 ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, 968 VA.getValVT(), ArgValue, 969 DAG.getTargetConstant(DestSubReg, MVT::i32)), 970 0); 971 break; 972 } 973 } 974 975 InVals.push_back(ArgValue); 976 } 977 978 if (isVarArg) 979 SaveVarArgRegisters(CCInfo, DAG, dl, Chain); 980 981 unsigned StackArgSize = CCInfo.getNextStackOffset(); 982 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 983 // This is a non-standard ABI so by fiat I say we're allowed to make full 984 // use of the stack area to be popped, which must be aligned to 16 bytes in 985 // any case: 986 StackArgSize = RoundUpToAlignment(StackArgSize, 16); 987 988 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 989 // a multiple of 16. 990 FuncInfo->setArgumentStackToRestore(StackArgSize); 991 992 // This realignment carries over to the available bytes below. Our own 993 // callers will guarantee the space is free by giving an aligned value to 994 // CALLSEQ_START. 
995 } 996 // Even if we're not expected to free up the space, it's useful to know how 997 // much is there while considering tail calls (because we can reuse it). 998 FuncInfo->setBytesInStackArgArea(StackArgSize); 999 1000 return Chain; 1001} 1002 1003SDValue 1004AArch64TargetLowering::LowerReturn(SDValue Chain, 1005 CallingConv::ID CallConv, bool isVarArg, 1006 const SmallVectorImpl<ISD::OutputArg> &Outs, 1007 const SmallVectorImpl<SDValue> &OutVals, 1008 SDLoc dl, SelectionDAG &DAG) const { 1009 // CCValAssign - represent the assignment of the return value to a location. 1010 SmallVector<CCValAssign, 16> RVLocs; 1011 1012 // CCState - Info about the registers and stack slots. 1013 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1014 getTargetMachine(), RVLocs, *DAG.getContext()); 1015 1016 // Analyze outgoing return values. 1017 CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv)); 1018 1019 SDValue Flag; 1020 SmallVector<SDValue, 4> RetOps(1, Chain); 1021 1022 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 1023 // PCS: "If the type, T, of the result of a function is such that 1024 // void func(T arg) would require that arg be passed as a value in a 1025 // register (or set of registers) according to the rules in 5.4, then the 1026 // result is returned in the same registers as would be used for such an 1027 // argument. 1028 // 1029 // Otherwise, the caller shall reserve a block of memory of sufficient 1030 // size and alignment to hold the result. The address of the memory block 1031 // shall be passed as an additional argument to the function in x8." 1032 // 1033 // This is implemented in two places. The register-return values are dealt 1034 // with here, more complex returns are passed as an sret parameter, which 1035 // means we don't have to worry about it during actual return. 1036 CCValAssign &VA = RVLocs[i]; 1037 assert(VA.isRegLoc() && "Only register-returns should be created by PCS"); 1038 1039 1040 SDValue Arg = OutVals[i]; 1041 1042 // There's no convenient note in the ABI about this as there is for normal 1043 // arguments, but it says return values are passed in the same registers as 1044 // an argument would be. I believe that includes the comments about 1045 // unspecified higher bits, putting the burden of widening on the *caller* 1046 // for return values. 1047 switch (VA.getLocInfo()) { 1048 default: llvm_unreachable("Unknown loc info"); 1049 case CCValAssign::Full: break; 1050 case CCValAssign::SExt: 1051 case CCValAssign::ZExt: 1052 case CCValAssign::AExt: 1053 // Floating-point values should only be extended when they're going into 1054 // memory, which can't happen here so an integer extend is acceptable. 1055 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 1056 break; 1057 case CCValAssign::BCvt: 1058 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1059 break; 1060 } 1061 1062 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 1063 Flag = Chain.getValue(1); 1064 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 1065 } 1066 1067 RetOps[0] = Chain; // Update chain. 1068 1069 // Add the flag if we have it. 
1070 if (Flag.getNode()) 1071 RetOps.push_back(Flag); 1072 1073 return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other, 1074 &RetOps[0], RetOps.size()); 1075} 1076 1077SDValue 1078AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, 1079 SmallVectorImpl<SDValue> &InVals) const { 1080 SelectionDAG &DAG = CLI.DAG; 1081 SDLoc &dl = CLI.DL; 1082 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1083 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1084 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1085 SDValue Chain = CLI.Chain; 1086 SDValue Callee = CLI.Callee; 1087 bool &IsTailCall = CLI.IsTailCall; 1088 CallingConv::ID CallConv = CLI.CallConv; 1089 bool IsVarArg = CLI.IsVarArg; 1090 1091 MachineFunction &MF = DAG.getMachineFunction(); 1092 AArch64MachineFunctionInfo *FuncInfo 1093 = MF.getInfo<AArch64MachineFunctionInfo>(); 1094 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 1095 bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet(); 1096 bool IsSibCall = false; 1097 1098 if (IsTailCall) { 1099 IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1100 IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1101 Outs, OutVals, Ins, DAG); 1102 1103 // A sibling call is one where we're under the usual C ABI and not planning 1104 // to change that but can still do a tail call: 1105 if (!TailCallOpt && IsTailCall) 1106 IsSibCall = true; 1107 } 1108 1109 SmallVector<CCValAssign, 16> ArgLocs; 1110 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), 1111 getTargetMachine(), ArgLocs, *DAG.getContext()); 1112 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1113 1114 // On AArch64 (and all other architectures I'm aware of) the most this has to 1115 // do is adjust the stack pointer. 1116 unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16); 1117 if (IsSibCall) { 1118 // Since we're not changing the ABI to make this a tail call, the memory 1119 // operands are already available in the caller's incoming argument space. 1120 NumBytes = 0; 1121 } 1122 1123 // FPDiff is the byte offset of the call's argument area from the callee's. 1124 // Stores to callee stack arguments will be placed in FixedStackSlots offset 1125 // by this amount for a tail call. In a sibling call it must be 0 because the 1126 // caller will deallocate the entire stack and the callee still expects its 1127 // arguments to begin at SP+0. Completely unused for non-tail calls. 1128 int FPDiff = 0; 1129 1130 if (IsTailCall && !IsSibCall) { 1131 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); 1132 1133 // FPDiff will be negative if this tail call requires more space than we 1134 // would automatically have in our incoming argument space. Positive if we 1135 // can actually shrink the stack. 1136 FPDiff = NumReusableBytes - NumBytes; 1137 1138 // The stack pointer must be 16-byte aligned at all times it's used for a 1139 // memory operation, which in practice means at *all* times and in 1140 // particular across call boundaries. Therefore our own arguments started at 1141 // a 16-byte aligned SP and the delta applied for the tail call should 1142 // satisfy the same constraint. 
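  // Worked example: with 32 reusable bytes and a call needing 16 bytes of
  // outgoing arguments, FPDiff = 32 - 16 = +16 and each outgoing argument's
  // fixed stack object below is created at its normal offset plus 16.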
1143 assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); 1144 } 1145 1146 if (!IsSibCall) 1147 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), 1148 dl); 1149 1150 SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP, 1151 getPointerTy()); 1152 1153 SmallVector<SDValue, 8> MemOpChains; 1154 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1155 1156 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1157 CCValAssign &VA = ArgLocs[i]; 1158 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1159 SDValue Arg = OutVals[i]; 1160 1161 // Callee does the actual widening, so all extensions just use an implicit 1162 // definition of the rest of the Loc. Aesthetically, this would be nicer as 1163 // an ANY_EXTEND, but that isn't valid for floating-point types and this 1164 // alternative works on integer types too. 1165 switch (VA.getLocInfo()) { 1166 default: llvm_unreachable("Unknown loc info!"); 1167 case CCValAssign::Full: break; 1168 case CCValAssign::SExt: 1169 case CCValAssign::ZExt: 1170 case CCValAssign::AExt: { 1171 unsigned SrcSize = VA.getValVT().getSizeInBits(); 1172 unsigned SrcSubReg; 1173 1174 switch (SrcSize) { 1175 case 8: SrcSubReg = AArch64::sub_8; break; 1176 case 16: SrcSubReg = AArch64::sub_16; break; 1177 case 32: SrcSubReg = AArch64::sub_32; break; 1178 case 64: SrcSubReg = AArch64::sub_64; break; 1179 default: llvm_unreachable("Unexpected argument promotion"); 1180 } 1181 1182 Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, 1183 VA.getLocVT(), 1184 DAG.getUNDEF(VA.getLocVT()), 1185 Arg, 1186 DAG.getTargetConstant(SrcSubReg, MVT::i32)), 1187 0); 1188 1189 break; 1190 } 1191 case CCValAssign::BCvt: 1192 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 1193 break; 1194 } 1195 1196 if (VA.isRegLoc()) { 1197 // A normal register (sub-) argument. For now we just note it down because 1198 // we want to copy things into registers as late as possible to avoid 1199 // register-pressure (and possibly worse). 1200 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1201 continue; 1202 } 1203 1204 assert(VA.isMemLoc() && "unexpected argument location"); 1205 1206 SDValue DstAddr; 1207 MachinePointerInfo DstInfo; 1208 if (IsTailCall) { 1209 uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() : 1210 VA.getLocVT().getSizeInBits(); 1211 OpSize = (OpSize + 7) / 8; 1212 int32_t Offset = VA.getLocMemOffset() + FPDiff; 1213 int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 1214 1215 DstAddr = DAG.getFrameIndex(FI, getPointerTy()); 1216 DstInfo = MachinePointerInfo::getFixedStack(FI); 1217 1218 // Make sure any stack arguments overlapping with where we're storing are 1219 // loaded before this eventual operation. Otherwise they'll be clobbered. 1220 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); 1221 } else { 1222 SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset()); 1223 1224 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1225 DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset()); 1226 } 1227 1228 if (Flags.isByVal()) { 1229 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64); 1230 SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode, 1231 Flags.getByValAlign(), 1232 /*isVolatile = */ false, 1233 /*alwaysInline = */ false, 1234 DstInfo, MachinePointerInfo(0)); 1235 MemOpChains.push_back(Cpy); 1236 } else { 1237 // Normal stack argument, put it where it's needed. 
1238 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo, 1239 false, false, 0); 1240 MemOpChains.push_back(Store); 1241 } 1242 } 1243 1244 // The loads and stores generated above shouldn't clash with each 1245 // other. Combining them with this TokenFactor notes that fact for the rest of 1246 // the backend. 1247 if (!MemOpChains.empty()) 1248 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1249 &MemOpChains[0], MemOpChains.size()); 1250 1251 // Most of the rest of the instructions need to be glued together; we don't 1252 // want assignments to actual registers used by a call to be rearranged by a 1253 // well-meaning scheduler. 1254 SDValue InFlag; 1255 1256 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1257 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1258 RegsToPass[i].second, InFlag); 1259 InFlag = Chain.getValue(1); 1260 } 1261 1262 // The linker is responsible for inserting veneers when necessary to put a 1263 // function call destination in range, so we don't need to bother with a 1264 // wrapper here. 1265 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1266 const GlobalValue *GV = G->getGlobal(); 1267 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); 1268 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1269 const char *Sym = S->getSymbol(); 1270 Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy()); 1271 } 1272 1273 // We don't usually want to end the call-sequence here because we would tidy 1274 // the frame up *after* the call, however in the ABI-changing tail-call case 1275 // we've carefully laid out the parameters so that when sp is reset they'll be 1276 // in the correct location. 1277 if (IsTailCall && !IsSibCall) { 1278 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 1279 DAG.getIntPtrConstant(0, true), InFlag, dl); 1280 InFlag = Chain.getValue(1); 1281 } 1282 1283 // We produce the following DAG scheme for the actual call instruction: 1284 // (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag? 1285 // 1286 // Most arguments aren't going to be used and just keep the values live as 1287 // far as LLVM is concerned. It's expected to be selected as simply "bl 1288 // callee" (for a direct, non-tail call). 1289 std::vector<SDValue> Ops; 1290 Ops.push_back(Chain); 1291 Ops.push_back(Callee); 1292 1293 if (IsTailCall) { 1294 // Each tail call may have to adjust the stack by a different amount, so 1295 // this information must travel along with the operation for eventual 1296 // consumption by emitEpilogue. 1297 Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); 1298 } 1299 1300 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 1301 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 1302 RegsToPass[i].second.getValueType())); 1303 1304 1305 // Add a register mask operand representing the call-preserved registers. This 1306 // is used later in codegen to constrain register-allocation. 1307 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 1308 const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); 1309 assert(Mask && "Missing call preserved mask for calling convention"); 1310 Ops.push_back(DAG.getRegisterMask(Mask)); 1311 1312 // If we needed glue, put it in as the last argument. 
1313 if (InFlag.getNode()) 1314 Ops.push_back(InFlag); 1315 1316 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 1317 1318 if (IsTailCall) { 1319 return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); 1320 } 1321 1322 Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size()); 1323 InFlag = Chain.getValue(1); 1324 1325 // Now we can reclaim the stack, just as well do it before working out where 1326 // our return value is. 1327 if (!IsSibCall) { 1328 uint64_t CalleePopBytes 1329 = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0; 1330 1331 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 1332 DAG.getIntPtrConstant(CalleePopBytes, true), 1333 InFlag, dl); 1334 InFlag = Chain.getValue(1); 1335 } 1336 1337 return LowerCallResult(Chain, InFlag, CallConv, 1338 IsVarArg, Ins, dl, DAG, InVals); 1339} 1340 1341SDValue 1342AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1343 CallingConv::ID CallConv, bool IsVarArg, 1344 const SmallVectorImpl<ISD::InputArg> &Ins, 1345 SDLoc dl, SelectionDAG &DAG, 1346 SmallVectorImpl<SDValue> &InVals) const { 1347 // Assign locations to each value returned by this call. 1348 SmallVector<CCValAssign, 16> RVLocs; 1349 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), 1350 getTargetMachine(), RVLocs, *DAG.getContext()); 1351 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv)); 1352 1353 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1354 CCValAssign VA = RVLocs[i]; 1355 1356 // Return values that are too big to fit into registers should use an sret 1357 // pointer, so this can be a lot simpler than the main argument code. 1358 assert(VA.isRegLoc() && "Memory locations not expected for call return"); 1359 1360 SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 1361 InFlag); 1362 Chain = Val.getValue(1); 1363 InFlag = Val.getValue(2); 1364 1365 switch (VA.getLocInfo()) { 1366 default: llvm_unreachable("Unknown loc info!"); 1367 case CCValAssign::Full: break; 1368 case CCValAssign::BCvt: 1369 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 1370 break; 1371 case CCValAssign::ZExt: 1372 case CCValAssign::SExt: 1373 case CCValAssign::AExt: 1374 // Floating-point arguments only get extended/truncated if they're going 1375 // in memory, so using the integer operation is acceptable here. 1376 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); 1377 break; 1378 } 1379 1380 InVals.push_back(Val); 1381 } 1382 1383 return Chain; 1384} 1385 1386bool 1387AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 1388 CallingConv::ID CalleeCC, 1389 bool IsVarArg, 1390 bool IsCalleeStructRet, 1391 bool IsCallerStructRet, 1392 const SmallVectorImpl<ISD::OutputArg> &Outs, 1393 const SmallVectorImpl<SDValue> &OutVals, 1394 const SmallVectorImpl<ISD::InputArg> &Ins, 1395 SelectionDAG& DAG) const { 1396 1397 // For CallingConv::C this function knows whether the ABI needs 1398 // changing. That's not true for other conventions so they will have to opt in 1399 // manually. 1400 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1401 return false; 1402 1403 const MachineFunction &MF = DAG.getMachineFunction(); 1404 const Function *CallerF = MF.getFunction(); 1405 CallingConv::ID CallerCC = CallerF->getCallingConv(); 1406 bool CCMatch = CallerCC == CalleeCC; 1407 1408 // Byval parameters hand the function a pointer directly into the stack area 1409 // we want to reuse during a tail call. 
Working around this *is* possible (see 1410 // X86) but less efficient and uglier in LowerCall. 1411 for (Function::const_arg_iterator i = CallerF->arg_begin(), 1412 e = CallerF->arg_end(); i != e; ++i) 1413 if (i->hasByValAttr()) 1414 return false; 1415 1416 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 1417 if (IsTailCallConvention(CalleeCC) && CCMatch) 1418 return true; 1419 return false; 1420 } 1421 1422 // Now we search for cases where we can use a tail call without changing the 1423 // ABI. Sibcall is used in some places (particularly gcc) to refer to this 1424 // concept. 1425 1426 // I want anyone implementing a new calling convention to think long and hard 1427 // about this assert. 1428 assert((!IsVarArg || CalleeCC == CallingConv::C) 1429 && "Unexpected variadic calling convention"); 1430 1431 if (IsVarArg && !Outs.empty()) { 1432 // At least two cases here: if caller is fastcc then we can't have any 1433 // memory arguments (we'd be expected to clean up the stack afterwards). If 1434 // caller is C then we could potentially use its argument area. 1435 1436 // FIXME: for now we take the most conservative of these in both cases: 1437 // disallow all variadic memory operands. 1438 SmallVector<CCValAssign, 16> ArgLocs; 1439 CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), 1440 getTargetMachine(), ArgLocs, *DAG.getContext()); 1441 1442 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 1443 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 1444 if (!ArgLocs[i].isRegLoc()) 1445 return false; 1446 } 1447 1448 // If the calling conventions do not match, then we'd better make sure the 1449 // results are returned in the same way as what the caller expects. 1450 if (!CCMatch) { 1451 SmallVector<CCValAssign, 16> RVLocs1; 1452 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 1453 getTargetMachine(), RVLocs1, *DAG.getContext()); 1454 CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC)); 1455 1456 SmallVector<CCValAssign, 16> RVLocs2; 1457 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 1458 getTargetMachine(), RVLocs2, *DAG.getContext()); 1459 CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC)); 1460 1461 if (RVLocs1.size() != RVLocs2.size()) 1462 return false; 1463 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 1464 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 1465 return false; 1466 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 1467 return false; 1468 if (RVLocs1[i].isRegLoc()) { 1469 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 1470 return false; 1471 } else { 1472 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 1473 return false; 1474 } 1475 } 1476 } 1477 1478 // Nothing more to check if the callee is taking no arguments 1479 if (Outs.empty()) 1480 return true; 1481 1482 SmallVector<CCValAssign, 16> ArgLocs; 1483 CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), 1484 getTargetMachine(), ArgLocs, *DAG.getContext()); 1485 1486 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 1487 1488 const AArch64MachineFunctionInfo *FuncInfo 1489 = MF.getInfo<AArch64MachineFunctionInfo>(); 1490 1491 // If the stack arguments for this call would fit into our own save area then 1492 // the call can be made tail. 
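  // e.g. a caller that itself received 32 bytes of stack-based arguments can
  // tail-call anything needing at most 32 bytes of outgoing stack arguments;
  // otherwise the tail call is rejected.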
1493 return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); 1494} 1495 1496bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, 1497 bool TailCallOpt) const { 1498 return CallCC == CallingConv::Fast && TailCallOpt; 1499} 1500 1501bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { 1502 return CallCC == CallingConv::Fast; 1503} 1504 1505SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, 1506 SelectionDAG &DAG, 1507 MachineFrameInfo *MFI, 1508 int ClobberedFI) const { 1509 SmallVector<SDValue, 8> ArgChains; 1510 int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); 1511 int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; 1512 1513 // Include the original chain at the beginning of the list. When this is 1514 // used by target LowerCall hooks, this helps legalize find the 1515 // CALLSEQ_BEGIN node. 1516 ArgChains.push_back(Chain); 1517 1518 // Add a chain value for each stack argument corresponding 1519 for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), 1520 UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) 1521 if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) 1522 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) 1523 if (FI->getIndex() < 0) { 1524 int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); 1525 int64_t InLastByte = InFirstByte; 1526 InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; 1527 1528 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 1529 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 1530 ArgChains.push_back(SDValue(L, 1)); 1531 } 1532 1533 // Build a tokenfactor for all the chains. 1534 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, 1535 &ArgChains[0], ArgChains.size()); 1536} 1537 1538static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) { 1539 switch (CC) { 1540 case ISD::SETEQ: return A64CC::EQ; 1541 case ISD::SETGT: return A64CC::GT; 1542 case ISD::SETGE: return A64CC::GE; 1543 case ISD::SETLT: return A64CC::LT; 1544 case ISD::SETLE: return A64CC::LE; 1545 case ISD::SETNE: return A64CC::NE; 1546 case ISD::SETUGT: return A64CC::HI; 1547 case ISD::SETUGE: return A64CC::HS; 1548 case ISD::SETULT: return A64CC::LO; 1549 case ISD::SETULE: return A64CC::LS; 1550 default: llvm_unreachable("Unexpected condition code"); 1551 } 1552} 1553 1554bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const { 1555 // icmp is implemented using adds/subs immediate, which take an unsigned 1556 // 12-bit immediate, optionally shifted left by 12 bits. 1557 1558 // Symmetric by using adds/subs 1559 if (Val < 0) 1560 Val = -Val; 1561 1562 return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0; 1563} 1564 1565SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS, 1566 ISD::CondCode CC, SDValue &A64cc, 1567 SelectionDAG &DAG, SDLoc &dl) const { 1568 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1569 int64_t C = 0; 1570 EVT VT = RHSC->getValueType(0); 1571 bool knownInvalid = false; 1572 1573 // I'm not convinced the rest of LLVM handles these edge cases properly, but 1574 // we can at least get it right. 1575 if (isSignedIntSetCC(CC)) { 1576 C = RHSC->getSExtValue(); 1577 } else if (RHSC->getZExtValue() > INT64_MAX) { 1578 // A 64-bit constant not representable by a signed 64-bit integer is far 1579 // too big to fit into a SUBS immediate anyway. 
1580 knownInvalid = true; 1581 } else { 1582 C = RHSC->getZExtValue(); 1583 } 1584 1585 if (!knownInvalid && !isLegalICmpImmediate(C)) { 1586 // Constant does not fit, try adjusting it by one? 1587 switch (CC) { 1588 default: break; 1589 case ISD::SETLT: 1590 case ISD::SETGE: 1591 if (isLegalICmpImmediate(C-1)) { 1592 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1593 RHS = DAG.getConstant(C-1, VT); 1594 } 1595 break; 1596 case ISD::SETULT: 1597 case ISD::SETUGE: 1598 if (isLegalICmpImmediate(C-1)) { 1599 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1600 RHS = DAG.getConstant(C-1, VT); 1601 } 1602 break; 1603 case ISD::SETLE: 1604 case ISD::SETGT: 1605 if (isLegalICmpImmediate(C+1)) { 1606 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 1607 RHS = DAG.getConstant(C+1, VT); 1608 } 1609 break; 1610 case ISD::SETULE: 1611 case ISD::SETUGT: 1612 if (isLegalICmpImmediate(C+1)) { 1613 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 1614 RHS = DAG.getConstant(C+1, VT); 1615 } 1616 break; 1617 } 1618 } 1619 } 1620 1621 A64CC::CondCodes CondCode = IntCCToA64CC(CC); 1622 A64cc = DAG.getConstant(CondCode, MVT::i32); 1623 return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 1624 DAG.getCondCode(CC)); 1625} 1626 1627static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC, 1628 A64CC::CondCodes &Alternative) { 1629 A64CC::CondCodes CondCode = A64CC::Invalid; 1630 Alternative = A64CC::Invalid; 1631 1632 switch (CC) { 1633 default: llvm_unreachable("Unknown FP condition!"); 1634 case ISD::SETEQ: 1635 case ISD::SETOEQ: CondCode = A64CC::EQ; break; 1636 case ISD::SETGT: 1637 case ISD::SETOGT: CondCode = A64CC::GT; break; 1638 case ISD::SETGE: 1639 case ISD::SETOGE: CondCode = A64CC::GE; break; 1640 case ISD::SETOLT: CondCode = A64CC::MI; break; 1641 case ISD::SETOLE: CondCode = A64CC::LS; break; 1642 case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break; 1643 case ISD::SETO: CondCode = A64CC::VC; break; 1644 case ISD::SETUO: CondCode = A64CC::VS; break; 1645 case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break; 1646 case ISD::SETUGT: CondCode = A64CC::HI; break; 1647 case ISD::SETUGE: CondCode = A64CC::PL; break; 1648 case ISD::SETLT: 1649 case ISD::SETULT: CondCode = A64CC::LT; break; 1650 case ISD::SETLE: 1651 case ISD::SETULE: CondCode = A64CC::LE; break; 1652 case ISD::SETNE: 1653 case ISD::SETUNE: CondCode = A64CC::NE; break; 1654 } 1655 return CondCode; 1656} 1657 1658SDValue 1659AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 1660 SDLoc DL(Op); 1661 EVT PtrVT = getPointerTy(); 1662 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 1663 1664 switch(getTargetMachine().getCodeModel()) { 1665 case CodeModel::Small: 1666 // The most efficient code is PC-relative anyway for the small memory model, 1667 // so we don't need to worry about relocation model. 
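    // The WrapperSmall node pairs a page-level reference with a #:lo12:
    // reference, so for a block address this is expected to select to
    // roughly:
    //   adrp xN, <label>
    //   add  xN, xN, #:lo12:<label>
    // (register and label names here are purely illustrative).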
1668 return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 1669 DAG.getTargetBlockAddress(BA, PtrVT, 0, 1670 AArch64II::MO_NO_FLAG), 1671 DAG.getTargetBlockAddress(BA, PtrVT, 0, 1672 AArch64II::MO_LO12), 1673 DAG.getConstant(/*Alignment=*/ 4, MVT::i32)); 1674 case CodeModel::Large: 1675 return DAG.getNode( 1676 AArch64ISD::WrapperLarge, DL, PtrVT, 1677 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3), 1678 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC), 1679 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC), 1680 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); 1681 default: 1682 llvm_unreachable("Only small and large code models supported now"); 1683 } 1684} 1685 1686 1687// (BRCOND chain, val, dest) 1688SDValue 1689AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 1690 SDLoc dl(Op); 1691 SDValue Chain = Op.getOperand(0); 1692 SDValue TheBit = Op.getOperand(1); 1693 SDValue DestBB = Op.getOperand(2); 1694 1695 // AArch64 BooleanContents is the default UndefinedBooleanContent, which means 1696 // that as the consumer we are responsible for ignoring rubbish in higher 1697 // bits. 1698 TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, 1699 DAG.getConstant(1, MVT::i32)); 1700 1701 SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, 1702 DAG.getConstant(0, TheBit.getValueType()), 1703 DAG.getCondCode(ISD::SETNE)); 1704 1705 return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain, 1706 A64CMP, DAG.getConstant(A64CC::NE, MVT::i32), 1707 DestBB); 1708} 1709 1710// (BR_CC chain, condcode, lhs, rhs, dest) 1711SDValue 1712AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { 1713 SDLoc dl(Op); 1714 SDValue Chain = Op.getOperand(0); 1715 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); 1716 SDValue LHS = Op.getOperand(2); 1717 SDValue RHS = Op.getOperand(3); 1718 SDValue DestBB = Op.getOperand(4); 1719 1720 if (LHS.getValueType() == MVT::f128) { 1721 // f128 comparisons are lowered to runtime calls by a routine which sets 1722 // LHS, RHS and CC appropriately for the rest of this function to continue. 1723 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 1724 1725 // If softenSetCCOperands returned a scalar, we need to compare the result 1726 // against zero to select between true and false values. 1727 if (RHS.getNode() == 0) { 1728 RHS = DAG.getConstant(0, LHS.getValueType()); 1729 CC = ISD::SETNE; 1730 } 1731 } 1732 1733 if (LHS.getValueType().isInteger()) { 1734 SDValue A64cc; 1735 1736 // Integers are handled in a separate function because the combinations of 1737 // immediates and tests can get hairy and we may want to fiddle things. 1738 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 1739 1740 return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1741 Chain, CmpOp, A64cc, DestBB); 1742 } 1743 1744 // Note that some LLVM floating-point CondCodes can't be lowered to a single 1745 // conditional branch, hence FPCCToA64CC can set a second test, where either 1746 // passing is sufficient. 
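  // For example, FPCCToA64CC maps SETONE to MI with GT as the alternative, so
  // an ordered "not equal" turns into two conditional branches off the same
  // compare: take the branch on MI, otherwise take it on GT.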
1747 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 1748 CondCode = FPCCToA64CC(CC, Alternative); 1749 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 1750 SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 1751 DAG.getCondCode(CC)); 1752 SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1753 Chain, SetCC, A64cc, DestBB); 1754 1755 if (Alternative != A64CC::Invalid) { 1756 A64cc = DAG.getConstant(Alternative, MVT::i32); 1757 A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, 1758 A64BR_CC, SetCC, A64cc, DestBB); 1759 1760 } 1761 1762 return A64BR_CC; 1763} 1764 1765SDValue 1766AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG, 1767 RTLIB::Libcall Call) const { 1768 ArgListTy Args; 1769 ArgListEntry Entry; 1770 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { 1771 EVT ArgVT = Op.getOperand(i).getValueType(); 1772 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 1773 Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy; 1774 Entry.isSExt = false; 1775 Entry.isZExt = false; 1776 Args.push_back(Entry); 1777 } 1778 SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy()); 1779 1780 Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext()); 1781 1782 // By default, the input chain to this libcall is the entry node of the 1783 // function. If the libcall is going to be emitted as a tail call then 1784 // isUsedByReturnOnly will change it to the right chain if the return 1785 // node which is being folded has a non-entry input chain. 1786 SDValue InChain = DAG.getEntryNode(); 1787 1788 // isTailCall may be true since the callee does not reference caller stack 1789 // frame. Check if it's in the right position. 1790 SDValue TCChain = InChain; 1791 bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain); 1792 if (isTailCall) 1793 InChain = TCChain; 1794 1795 TargetLowering:: 1796 CallLoweringInfo CLI(InChain, RetTy, false, false, false, false, 1797 0, getLibcallCallingConv(Call), isTailCall, 1798 /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, 1799 Callee, Args, DAG, SDLoc(Op)); 1800 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 1801 1802 if (!CallInfo.second.getNode()) 1803 // It's a tailcall, return the chain (which is the DAG root). 
1804 return DAG.getRoot(); 1805 1806 return CallInfo.first; 1807} 1808 1809SDValue 1810AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 1811 if (Op.getOperand(0).getValueType() != MVT::f128) { 1812 // It's legal except when f128 is involved 1813 return Op; 1814 } 1815 1816 RTLIB::Libcall LC; 1817 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 1818 1819 SDValue SrcVal = Op.getOperand(0); 1820 return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, 1821 /*isSigned*/ false, SDLoc(Op)); 1822} 1823 1824SDValue 1825AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 1826 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 1827 1828 RTLIB::Libcall LC; 1829 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 1830 1831 return LowerF128ToCall(Op, DAG, LC); 1832} 1833 1834SDValue 1835AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, 1836 bool IsSigned) const { 1837 if (Op.getOperand(0).getValueType() != MVT::f128) { 1838 // It's legal except when f128 is involved 1839 return Op; 1840 } 1841 1842 RTLIB::Libcall LC; 1843 if (IsSigned) 1844 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1845 else 1846 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1847 1848 return LowerF128ToCall(Op, DAG, LC); 1849} 1850 1851SDValue 1852AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op, 1853 SelectionDAG &DAG) const { 1854 assert(getTargetMachine().getCodeModel() == CodeModel::Large); 1855 assert(getTargetMachine().getRelocationModel() == Reloc::Static); 1856 1857 EVT PtrVT = getPointerTy(); 1858 SDLoc dl(Op); 1859 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 1860 const GlobalValue *GV = GN->getGlobal(); 1861 1862 SDValue GlobalAddr = DAG.getNode( 1863 AArch64ISD::WrapperLarge, dl, PtrVT, 1864 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3), 1865 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC), 1866 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC), 1867 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); 1868 1869 if (GN->getOffset() != 0) 1870 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, 1871 DAG.getConstant(GN->getOffset(), PtrVT)); 1872 1873 return GlobalAddr; 1874} 1875 1876SDValue 1877AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op, 1878 SelectionDAG &DAG) const { 1879 assert(getTargetMachine().getCodeModel() == CodeModel::Small); 1880 1881 EVT PtrVT = getPointerTy(); 1882 SDLoc dl(Op); 1883 const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); 1884 const GlobalValue *GV = GN->getGlobal(); 1885 unsigned Alignment = GV->getAlignment(); 1886 Reloc::Model RelocM = getTargetMachine().getRelocationModel(); 1887 if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) { 1888 // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate 1889 // to zero when they remain undefined. In PIC mode the GOT can take care of 1890 // this, but in absolute mode we use a constant pool load. 
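    // The address therefore comes from a literal pool entry holding the
    // symbol's value (which resolves to zero if the symbol stays undefined):
    // the WrapperSmall below addresses the pool entry and the 8-byte load
    // fetches the pointer itself.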
1891 SDValue PoolAddr; 1892 PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 1893 DAG.getTargetConstantPool(GV, PtrVT, 0, 0, 1894 AArch64II::MO_NO_FLAG), 1895 DAG.getTargetConstantPool(GV, PtrVT, 0, 0, 1896 AArch64II::MO_LO12), 1897 DAG.getConstant(8, MVT::i32)); 1898 SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr, 1899 MachinePointerInfo::getConstantPool(), 1900 /*isVolatile=*/ false, 1901 /*isNonTemporal=*/ true, 1902 /*isInvariant=*/ true, 8); 1903 if (GN->getOffset() != 0) 1904 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, 1905 DAG.getConstant(GN->getOffset(), PtrVT)); 1906 1907 return GlobalAddr; 1908 } 1909 1910 if (Alignment == 0) { 1911 const PointerType *GVPtrTy = cast<PointerType>(GV->getType()); 1912 if (GVPtrTy->getElementType()->isSized()) { 1913 Alignment 1914 = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType()); 1915 } else { 1916 // Be conservative if we can't guess, not that it really matters: 1917 // functions and labels aren't valid for loads, and the methods used to 1918 // actually calculate an address work with any alignment. 1919 Alignment = 1; 1920 } 1921 } 1922 1923 unsigned char HiFixup, LoFixup; 1924 bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM); 1925 1926 if (UseGOT) { 1927 HiFixup = AArch64II::MO_GOT; 1928 LoFixup = AArch64II::MO_GOT_LO12; 1929 Alignment = 8; 1930 } else { 1931 HiFixup = AArch64II::MO_NO_FLAG; 1932 LoFixup = AArch64II::MO_LO12; 1933 } 1934 1935 // AArch64's small model demands the following sequence: 1936 // ADRP x0, somewhere 1937 // ADD x0, x0, #:lo12:somewhere ; (or LDR directly). 1938 SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 1939 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 1940 HiFixup), 1941 DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 1942 LoFixup), 1943 DAG.getConstant(Alignment, MVT::i32)); 1944 1945 if (UseGOT) { 1946 GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(), 1947 GlobalRef); 1948 } 1949 1950 if (GN->getOffset() != 0) 1951 return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef, 1952 DAG.getConstant(GN->getOffset(), PtrVT)); 1953 1954 return GlobalRef; 1955} 1956 1957SDValue 1958AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op, 1959 SelectionDAG &DAG) const { 1960 // TableGen doesn't have easy access to the CodeModel or RelocationModel, so 1961 // we make those distinctions here. 1962 1963 switch (getTargetMachine().getCodeModel()) { 1964 case CodeModel::Small: 1965 return LowerGlobalAddressELFSmall(Op, DAG); 1966 case CodeModel::Large: 1967 return LowerGlobalAddressELFLarge(Op, DAG); 1968 default: 1969 llvm_unreachable("Only small and large code models supported now"); 1970 } 1971} 1972 1973SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, 1974 SDValue DescAddr, 1975 SDLoc DL, 1976 SelectionDAG &DAG) const { 1977 EVT PtrVT = getPointerTy(); 1978 1979 // The function we need to call is simply the first entry in the GOT for this 1980 // descriptor, load it in preparation. 1981 SDValue Func, Chain; 1982 Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), 1983 DescAddr); 1984 1985 // The function takes only one argument: the address of the descriptor itself 1986 // in X0. 1987 SDValue Glue; 1988 Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); 1989 Glue = Chain.getValue(1); 1990 1991 // Finally, there's a special calling-convention which means that the lookup 1992 // must preserve all registers (except X0, obviously). 
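  // That guarantee is communicated to the register allocator through the
  // register-mask operand pushed onto the TLSDESCCALL node below
  // (getTLSDescCallPreservedMask); the call is built by hand here rather than
  // going through the normal LowerCall path.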
1993 const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 1994 const AArch64RegisterInfo *A64RI 1995 = static_cast<const AArch64RegisterInfo *>(TRI); 1996 const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask(); 1997 1998 // We're now ready to populate the argument list, as with a normal call: 1999 std::vector<SDValue> Ops; 2000 Ops.push_back(Chain); 2001 Ops.push_back(Func); 2002 Ops.push_back(SymAddr); 2003 Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); 2004 Ops.push_back(DAG.getRegisterMask(Mask)); 2005 Ops.push_back(Glue); 2006 2007 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2008 Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0], 2009 Ops.size()); 2010 Glue = Chain.getValue(1); 2011 2012 // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it 2013 // back to the generic handling code. 2014 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); 2015} 2016 2017SDValue 2018AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, 2019 SelectionDAG &DAG) const { 2020 assert(getSubtarget()->isTargetELF() && 2021 "TLS not implemented for non-ELF targets"); 2022 assert(getTargetMachine().getCodeModel() == CodeModel::Small 2023 && "TLS only supported in small memory model"); 2024 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 2025 2026 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); 2027 2028 SDValue TPOff; 2029 EVT PtrVT = getPointerTy(); 2030 SDLoc DL(Op); 2031 const GlobalValue *GV = GA->getGlobal(); 2032 2033 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); 2034 2035 if (Model == TLSModel::InitialExec) { 2036 TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2037 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2038 AArch64II::MO_GOTTPREL), 2039 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2040 AArch64II::MO_GOTTPREL_LO12), 2041 DAG.getConstant(8, MVT::i32)); 2042 TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), 2043 TPOff); 2044 } else if (Model == TLSModel::LocalExec) { 2045 SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2046 AArch64II::MO_TPREL_G1); 2047 SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2048 AArch64II::MO_TPREL_G0_NC); 2049 2050 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, 2051 DAG.getTargetConstant(0, MVT::i32)), 0); 2052 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, 2053 TPOff, LoVar, 2054 DAG.getTargetConstant(0, MVT::i32)), 0); 2055 } else if (Model == TLSModel::GeneralDynamic) { 2056 // Accesses used in this sequence go via the TLS descriptor which lives in 2057 // the GOT. Prepare an address we can use to handle this. 2058 SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2059 AArch64II::MO_TLSDESC); 2060 SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 2061 AArch64II::MO_TLSDESC_LO12); 2062 SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2063 HiDesc, LoDesc, 2064 DAG.getConstant(8, MVT::i32)); 2065 SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0); 2066 2067 TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); 2068 } else if (Model == TLSModel::LocalDynamic) { 2069 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS 2070 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate 2071 // the beginning of the module's TLS region, followed by a DTPREL offset 2072 // calculation. 
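    // Both phases are visible below: first the descriptor call against
    // _TLS_MODULE_BASE_, then a MOVZ/MOVK pair materialising the variable's
    // DTPREL_G1/G0_NC offset, with the two results added together at the end
    // of this function.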
2073 2074 // These accesses will need deduplicating if there's more than one. 2075 AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction() 2076 .getInfo<AArch64MachineFunctionInfo>(); 2077 MFI->incNumLocalDynamicTLSAccesses(); 2078 2079 2080 // Get the location of _TLS_MODULE_BASE_: 2081 SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 2082 AArch64II::MO_TLSDESC); 2083 SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, 2084 AArch64II::MO_TLSDESC_LO12); 2085 SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, 2086 HiDesc, LoDesc, 2087 DAG.getConstant(8, MVT::i32)); 2088 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT); 2089 2090 ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); 2091 2092 // Get the variable's offset from _TLS_MODULE_BASE_ 2093 SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2094 AArch64II::MO_DTPREL_G1); 2095 SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, 2096 AArch64II::MO_DTPREL_G0_NC); 2097 2098 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, 2099 DAG.getTargetConstant(0, MVT::i32)), 0); 2100 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, 2101 TPOff, LoVar, 2102 DAG.getTargetConstant(0, MVT::i32)), 0); 2103 } else 2104 llvm_unreachable("Unsupported TLS access model"); 2105 2106 2107 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); 2108} 2109 2110SDValue 2111AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, 2112 bool IsSigned) const { 2113 if (Op.getValueType() != MVT::f128) { 2114 // Legal for everything except f128. 2115 return Op; 2116 } 2117 2118 RTLIB::Libcall LC; 2119 if (IsSigned) 2120 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2121 else 2122 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 2123 2124 return LowerF128ToCall(Op, DAG, LC); 2125} 2126 2127 2128SDValue 2129AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 2130 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 2131 SDLoc dl(JT); 2132 EVT PtrVT = getPointerTy(); 2133 2134 // When compiling PIC, jump tables get put in the code section so a static 2135 // relocation-style is acceptable for both cases. 
2136 switch (getTargetMachine().getCodeModel()) { 2137 case CodeModel::Small: 2138 return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, 2139 DAG.getTargetJumpTable(JT->getIndex(), PtrVT), 2140 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 2141 AArch64II::MO_LO12), 2142 DAG.getConstant(1, MVT::i32)); 2143 case CodeModel::Large: 2144 return DAG.getNode( 2145 AArch64ISD::WrapperLarge, dl, PtrVT, 2146 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3), 2147 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC), 2148 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC), 2149 DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC)); 2150 default: 2151 llvm_unreachable("Only small and large code models supported now"); 2152 } 2153} 2154 2155// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) 2156SDValue 2157AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 2158 SDLoc dl(Op); 2159 SDValue LHS = Op.getOperand(0); 2160 SDValue RHS = Op.getOperand(1); 2161 SDValue IfTrue = Op.getOperand(2); 2162 SDValue IfFalse = Op.getOperand(3); 2163 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); 2164 2165 if (LHS.getValueType() == MVT::f128) { 2166 // f128 comparisons are lowered to libcalls, but slot in nicely here 2167 // afterwards. 2168 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 2169 2170 // If softenSetCCOperands returned a scalar, we need to compare the result 2171 // against zero to select between true and false values. 2172 if (RHS.getNode() == 0) { 2173 RHS = DAG.getConstant(0, LHS.getValueType()); 2174 CC = ISD::SETNE; 2175 } 2176 } 2177 2178 if (LHS.getValueType().isInteger()) { 2179 SDValue A64cc; 2180 2181 // Integers are handled in a separate function because the combinations of 2182 // immediates and tests can get hairy and we may want to fiddle things. 2183 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 2184 2185 return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2186 CmpOp, IfTrue, IfFalse, A64cc); 2187 } 2188 2189 // Note that some LLVM floating-point CondCodes can't be lowered to a single 2190 // conditional branch, hence FPCCToA64CC can set a second test, where either 2191 // passing is sufficient. 2192 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 2193 CondCode = FPCCToA64CC(CC, Alternative); 2194 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 2195 SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 2196 DAG.getCondCode(CC)); 2197 SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, 2198 Op.getValueType(), 2199 SetCC, IfTrue, IfFalse, A64cc); 2200 2201 if (Alternative != A64CC::Invalid) { 2202 A64cc = DAG.getConstant(Alternative, MVT::i32); 2203 A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2204 SetCC, IfTrue, A64SELECT_CC, A64cc); 2205 2206 } 2207 2208 return A64SELECT_CC; 2209} 2210 2211// (SELECT testbit, iftrue, iffalse) 2212SDValue 2213AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 2214 SDLoc dl(Op); 2215 SDValue TheBit = Op.getOperand(0); 2216 SDValue IfTrue = Op.getOperand(1); 2217 SDValue IfFalse = Op.getOperand(2); 2218 2219 // AArch64 BooleanContents is the default UndefinedBooleanContent, which means 2220 // that as the consumer we are responsible for ignoring rubbish in higher 2221 // bits. 
2222 TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, 2223 DAG.getConstant(1, MVT::i32)); 2224 SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, 2225 DAG.getConstant(0, TheBit.getValueType()), 2226 DAG.getCondCode(ISD::SETNE)); 2227 2228 return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), 2229 A64CMP, IfTrue, IfFalse, 2230 DAG.getConstant(A64CC::NE, MVT::i32)); 2231} 2232 2233// (SETCC lhs, rhs, condcode) 2234SDValue 2235AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 2236 SDLoc dl(Op); 2237 SDValue LHS = Op.getOperand(0); 2238 SDValue RHS = Op.getOperand(1); 2239 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 2240 EVT VT = Op.getValueType(); 2241 2242 if (LHS.getValueType() == MVT::f128) { 2243 // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS 2244 // for the rest of the function (some i32 or i64 values). 2245 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); 2246 2247 // If softenSetCCOperands returned a scalar, use it. 2248 if (RHS.getNode() == 0) { 2249 assert(LHS.getValueType() == Op.getValueType() && 2250 "Unexpected setcc expansion!"); 2251 return LHS; 2252 } 2253 } 2254 2255 if (LHS.getValueType().isInteger()) { 2256 SDValue A64cc; 2257 2258 // Integers are handled in a separate function because the combinations of 2259 // immediates and tests can get hairy and we may want to fiddle things. 2260 SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); 2261 2262 return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, 2263 CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT), 2264 A64cc); 2265 } 2266 2267 // Note that some LLVM floating-point CondCodes can't be lowered to a single 2268 // conditional branch, hence FPCCToA64CC can set a second test, where either 2269 // passing is sufficient. 2270 A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; 2271 CondCode = FPCCToA64CC(CC, Alternative); 2272 SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); 2273 SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, 2274 DAG.getCondCode(CC)); 2275 SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, 2276 CmpOp, DAG.getConstant(1, VT), 2277 DAG.getConstant(0, VT), A64cc); 2278 2279 if (Alternative != A64CC::Invalid) { 2280 A64cc = DAG.getConstant(Alternative, MVT::i32); 2281 A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, 2282 DAG.getConstant(1, VT), A64SELECT_CC, A64cc); 2283 } 2284 2285 return A64SELECT_CC; 2286} 2287 2288SDValue 2289AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 2290 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 2291 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 2292 2293 // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes 2294 // rather than just 8. 2295 return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), 2296 Op.getOperand(1), Op.getOperand(2), 2297 DAG.getConstant(32, MVT::i32), 8, false, false, 2298 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); 2299} 2300 2301SDValue 2302AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2303 // The layout of the va_list struct is specified in the AArch64 Procedure Call 2304 // Standard, section B.3.
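  // As materialised by the stores below, that layout is:
  //   struct va_list {
  //     void *__stack;   // offset 0:  start of the stacked-argument area
  //     void *__gr_top;  // offset 8:  one past the end of the GPR save area
  //     void *__vr_top;  // offset 16: one past the end of the FPR save area
  //     int   __gr_offs; // offset 24: set to -(size of the GPR save area)
  //     int   __vr_offs; // offset 28: set to -(size of the FPR save area)
  //   };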
2305 MachineFunction &MF = DAG.getMachineFunction(); 2306 AArch64MachineFunctionInfo *FuncInfo 2307 = MF.getInfo<AArch64MachineFunctionInfo>(); 2308 SDLoc DL(Op); 2309 2310 SDValue Chain = Op.getOperand(0); 2311 SDValue VAList = Op.getOperand(1); 2312 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2313 SmallVector<SDValue, 4> MemOps; 2314 2315 // void *__stack at offset 0 2316 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(), 2317 getPointerTy()); 2318 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 2319 MachinePointerInfo(SV), false, false, 0)); 2320 2321 // void *__gr_top at offset 8 2322 int GPRSize = FuncInfo->getVariadicGPRSize(); 2323 if (GPRSize > 0) { 2324 SDValue GRTop, GRTopAddr; 2325 2326 GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2327 DAG.getConstant(8, getPointerTy())); 2328 2329 GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy()); 2330 GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, 2331 DAG.getConstant(GPRSize, getPointerTy())); 2332 2333 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 2334 MachinePointerInfo(SV, 8), 2335 false, false, 0)); 2336 } 2337 2338 // void *__vr_top at offset 16 2339 int FPRSize = FuncInfo->getVariadicFPRSize(); 2340 if (FPRSize > 0) { 2341 SDValue VRTop, VRTopAddr; 2342 VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2343 DAG.getConstant(16, getPointerTy())); 2344 2345 VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy()); 2346 VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, 2347 DAG.getConstant(FPRSize, getPointerTy())); 2348 2349 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 2350 MachinePointerInfo(SV, 16), 2351 false, false, 0)); 2352 } 2353 2354 // int __gr_offs at offset 24 2355 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2356 DAG.getConstant(24, getPointerTy())); 2357 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), 2358 GROffsAddr, MachinePointerInfo(SV, 24), 2359 false, false, 0)); 2360 2361 // int __vr_offs at offset 28 2362 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 2363 DAG.getConstant(28, getPointerTy())); 2364 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), 2365 VROffsAddr, MachinePointerInfo(SV, 28), 2366 false, false, 0)); 2367 2368 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], 2369 MemOps.size()); 2370} 2371 2372SDValue 2373AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2374 switch (Op.getOpcode()) { 2375 default: llvm_unreachable("Don't know how to custom lower this!"); 2376 case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128); 2377 case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128); 2378 case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128); 2379 case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128); 2380 case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true); 2381 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false); 2382 case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true); 2383 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false); 2384 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 2385 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 2386 2387 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 2388 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 2389 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 2390 case 
ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG); 2391 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 2392 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 2393 case ISD::SELECT: return LowerSELECT(Op, DAG); 2394 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 2395 case ISD::SETCC: return LowerSETCC(Op, DAG); 2396 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 2397 case ISD::VASTART: return LowerVASTART(Op, DAG); 2398 } 2399 2400 return SDValue(); 2401} 2402 2403static SDValue PerformANDCombine(SDNode *N, 2404 TargetLowering::DAGCombinerInfo &DCI) { 2405 2406 SelectionDAG &DAG = DCI.DAG; 2407 SDLoc DL(N); 2408 EVT VT = N->getValueType(0); 2409 2410 // We're looking for an AND/SRL pair which forms a UBFX. 2411 2412 if (VT != MVT::i32 && VT != MVT::i64) 2413 return SDValue(); 2414 2415 if (!isa<ConstantSDNode>(N->getOperand(1))) 2416 return SDValue(); 2417 2418 uint64_t TruncMask = N->getConstantOperandVal(1); 2419 if (!isMask_64(TruncMask)) 2420 return SDValue(); 2421 2422 uint64_t Width = CountPopulation_64(TruncMask); 2423 SDValue Shift = N->getOperand(0); 2424 2425 if (Shift.getOpcode() != ISD::SRL) 2426 return SDValue(); 2427 2428 if (!isa<ConstantSDNode>(Shift->getOperand(1))) 2429 return SDValue(); 2430 uint64_t LSB = Shift->getConstantOperandVal(1); 2431 2432 if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) 2433 return SDValue(); 2434 2435 return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0), 2436 DAG.getConstant(LSB, MVT::i64), 2437 DAG.getConstant(LSB + Width - 1, MVT::i64)); 2438} 2439 2440/// For a true bitfield insert, the bits getting into that contiguous mask 2441/// should come from the low part of an existing value: they must be formed from 2442/// a compatible SHL operation (unless they're already low). This function 2443/// checks that condition and returns the least-significant bit that's 2444/// intended. If the operation is not a field preparation, -1 is returned. 2445static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT, 2446 SDValue &MaskedVal, uint64_t Mask) { 2447 if (!isShiftedMask_64(Mask)) 2448 return -1; 2449 2450 // Now we need to alter MaskedVal so that it is an appropriate input for a BFI 2451 // instruction. BFI will do a left-shift by LSB before applying the mask we've 2452 // spotted, so in general we should pre-emptively "undo" that by making sure 2453 // the incoming bits have had a right-shift applied to them. 2454 // 2455 // This right shift, however, will combine with existing left/right shifts. In 2456 // the simplest case of a completely straight bitfield operation, it will be 2457 // expected to completely cancel out with an existing SHL. More complicated 2458 // cases (e.g. bitfield to bitfield copy) may still need a real shift before 2459 // the BFI.
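  // For example, with Mask == 0x0000ff00 the BFI will shift its source left
  // by 8 before applying the mask. If MaskedVal is already (shl X, 8) the two
  // shifts cancel and X can be handed to the BFI directly; otherwise a
  // compensating SRL (or a residual SHL) is generated below.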
2460 2461 uint64_t LSB = countTrailingZeros(Mask); 2462 int64_t ShiftRightRequired = LSB; 2463 if (MaskedVal.getOpcode() == ISD::SHL && 2464 isa<ConstantSDNode>(MaskedVal.getOperand(1))) { 2465 ShiftRightRequired -= MaskedVal.getConstantOperandVal(1); 2466 MaskedVal = MaskedVal.getOperand(0); 2467 } else if (MaskedVal.getOpcode() == ISD::SRL && 2468 isa<ConstantSDNode>(MaskedVal.getOperand(1))) { 2469 ShiftRightRequired += MaskedVal.getConstantOperandVal(1); 2470 MaskedVal = MaskedVal.getOperand(0); 2471 } 2472 2473 if (ShiftRightRequired > 0) 2474 MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal, 2475 DAG.getConstant(ShiftRightRequired, MVT::i64)); 2476 else if (ShiftRightRequired < 0) { 2477 // We could actually end up with a residual left shift, for example with 2478 // "struc.bitfield = val << 1". 2479 MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal, 2480 DAG.getConstant(-ShiftRightRequired, MVT::i64)); 2481 } 2482 2483 return LSB; 2484} 2485 2486/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by 2487/// a mask and an extension. Returns true if a BFI was found and provides 2488/// information on its surroundings. 2489static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask, 2490 bool &Extended) { 2491 Extended = false; 2492 if (N.getOpcode() == ISD::ZERO_EXTEND) { 2493 Extended = true; 2494 N = N.getOperand(0); 2495 } 2496 2497 if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) { 2498 Mask = N->getConstantOperandVal(1); 2499 N = N.getOperand(0); 2500 } else { 2501 // Mask is the whole width. 2502 Mask = -1ULL >> (64 - N.getValueType().getSizeInBits()); 2503 } 2504 2505 if (N.getOpcode() == AArch64ISD::BFI) { 2506 BFI = N; 2507 return true; 2508 } 2509 2510 return false; 2511} 2512 2513/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which 2514/// is roughly equivalent to (and (BFI ...), mask). This form is used because it 2515/// can often be further combined with a larger mask. Ultimately, we want mask 2516/// to be 2^32-1 or 2^64-1 so the AND can be skipped. 2517static SDValue tryCombineToBFI(SDNode *N, 2518 TargetLowering::DAGCombinerInfo &DCI, 2519 const AArch64Subtarget *Subtarget) { 2520 SelectionDAG &DAG = DCI.DAG; 2521 SDLoc DL(N); 2522 EVT VT = N->getValueType(0); 2523 2524 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 2525 2526 // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or 2527 // abandon the effort. 2528 SDValue LHS = N->getOperand(0); 2529 if (LHS.getOpcode() != ISD::AND) 2530 return SDValue(); 2531 2532 uint64_t LHSMask; 2533 if (isa<ConstantSDNode>(LHS.getOperand(1))) 2534 LHSMask = LHS->getConstantOperandVal(1); 2535 else 2536 return SDValue(); 2537 2538 // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask 2539 // is or abandon the effort. 2540 SDValue RHS = N->getOperand(1); 2541 if (RHS.getOpcode() != ISD::AND) 2542 return SDValue(); 2543 2544 uint64_t RHSMask; 2545 if (isa<ConstantSDNode>(RHS.getOperand(1))) 2546 RHSMask = RHS->getConstantOperandVal(1); 2547 else 2548 return SDValue(); 2549 2550 // Can't do anything if the masks are incompatible. 2551 if (LHSMask & RHSMask) 2552 return SDValue(); 2553 2554 // Now we need one of the masks to be a contiguous field. Without loss of 2555 // generality that should be the RHS one. 
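  // The probe on LHSMask below is only used to decide whether to swap: if the
  // LHS already looks like the value being inserted (a shifted contiguous
  // mask), exchange the operands so that the candidate "new value" is always
  // on the RHS for the real check that follows.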
2556 SDValue Bitfield = LHS.getOperand(0); 2557 if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) { 2558 // We know that LHS is a candidate new value, and RHS isn't already a better 2559 // one. 2560 std::swap(LHS, RHS); 2561 std::swap(LHSMask, RHSMask); 2562 } 2563 2564 // We've done our best to put the right operands in the right places, all we 2565 // can do now is check whether a BFI exists. 2566 Bitfield = RHS.getOperand(0); 2567 int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask); 2568 if (LSB == -1) 2569 return SDValue(); 2570 2571 uint32_t Width = CountPopulation_64(RHSMask); 2572 assert(Width && "Expected non-zero bitfield width"); 2573 2574 SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, 2575 LHS.getOperand(0), Bitfield, 2576 DAG.getConstant(LSB, MVT::i64), 2577 DAG.getConstant(Width, MVT::i64)); 2578 2579 // Mask is trivial 2580 if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits()))) 2581 return BFI; 2582 2583 return DAG.getNode(ISD::AND, DL, VT, BFI, 2584 DAG.getConstant(LHSMask | RHSMask, VT)); 2585} 2586 2587/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its 2588/// original input. This is surprisingly common because SROA splits things up 2589/// into i8 chunks, so the originally detected MaskedBFI may actually only act 2590/// on the low (say) byte of a word. This is then orred into the rest of the 2591/// word afterwards. 2592/// 2593/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)). 2594/// 2595/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the 2596/// MaskedBFI. We can also deal with a certain amount of extend/truncate being 2597/// involved. 2598static SDValue tryCombineToLargerBFI(SDNode *N, 2599 TargetLowering::DAGCombinerInfo &DCI, 2600 const AArch64Subtarget *Subtarget) { 2601 SelectionDAG &DAG = DCI.DAG; 2602 SDLoc DL(N); 2603 EVT VT = N->getValueType(0); 2604 2605 // First job is to hunt for a MaskedBFI on either the left or right. Swap 2606 // operands if it's actually on the right. 2607 SDValue BFI; 2608 SDValue PossExtraMask; 2609 uint64_t ExistingMask = 0; 2610 bool Extended = false; 2611 if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended)) 2612 PossExtraMask = N->getOperand(1); 2613 else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended)) 2614 PossExtraMask = N->getOperand(0); 2615 else 2616 return SDValue(); 2617 2618 // We can only combine a BFI with another compatible mask. 2619 if (PossExtraMask.getOpcode() != ISD::AND || 2620 !isa<ConstantSDNode>(PossExtraMask.getOperand(1))) 2621 return SDValue(); 2622 2623 uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1); 2624 2625 // Masks must be compatible. 2626 if (ExtraMask & ExistingMask) 2627 return SDValue(); 2628 2629 SDValue OldBFIVal = BFI.getOperand(0); 2630 SDValue NewBFIVal = BFI.getOperand(1); 2631 if (Extended) { 2632 // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be 2633 // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments 2634 // need to be made compatible. 2635 assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32 2636 && "Invalid types for BFI"); 2637 OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal); 2638 NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal); 2639 } 2640 2641 // We need the MaskedBFI to be combined with a mask of the *same* value. 
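  // In other words, the (and X, ExtraMask) being ORed in must name the same X
  // that the existing BFI is inserting into; if it is a different value the
  // two masks cover unrelated data and the nodes cannot be merged.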
2642 if (PossExtraMask.getOperand(0) != OldBFIVal) 2643 return SDValue(); 2644 2645 BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, 2646 OldBFIVal, NewBFIVal, 2647 BFI.getOperand(2), BFI.getOperand(3)); 2648 2649 // If the masking is trivial, we don't need to create it. 2650 if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits()))) 2651 return BFI; 2652 2653 return DAG.getNode(ISD::AND, DL, VT, BFI, 2654 DAG.getConstant(ExtraMask | ExistingMask, VT)); 2655} 2656 2657/// An EXTR instruction is made up of two shifts, ORed together. This helper 2658/// searches for and classifies those shifts. 2659static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 2660 bool &FromHi) { 2661 if (N.getOpcode() == ISD::SHL) 2662 FromHi = false; 2663 else if (N.getOpcode() == ISD::SRL) 2664 FromHi = true; 2665 else 2666 return false; 2667 2668 if (!isa<ConstantSDNode>(N.getOperand(1))) 2669 return false; 2670 2671 ShiftAmount = N->getConstantOperandVal(1); 2672 Src = N->getOperand(0); 2673 return true; 2674} 2675 2676/// EXTR instruction extracts a contiguous chunk of bits from two existing 2677/// registers viewed as a high/low pair. This function looks for the pattern: 2678/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an 2679/// EXTR. Can't quite be done in TableGen because the two immediates aren't 2680/// independent. 2681static SDValue tryCombineToEXTR(SDNode *N, 2682 TargetLowering::DAGCombinerInfo &DCI) { 2683 SelectionDAG &DAG = DCI.DAG; 2684 SDLoc DL(N); 2685 EVT VT = N->getValueType(0); 2686 2687 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 2688 2689 if (VT != MVT::i32 && VT != MVT::i64) 2690 return SDValue(); 2691 2692 SDValue LHS; 2693 uint32_t ShiftLHS = 0; 2694 bool LHSFromHi = 0; 2695 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 2696 return SDValue(); 2697 2698 SDValue RHS; 2699 uint32_t ShiftRHS = 0; 2700 bool RHSFromHi = 0; 2701 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 2702 return SDValue(); 2703 2704 // If they're both trying to come from the high part of the register, they're 2705 // not really an EXTR. 2706 if (LHSFromHi == RHSFromHi) 2707 return SDValue(); 2708 2709 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 2710 return SDValue(); 2711 2712 if (LHSFromHi) { 2713 std::swap(LHS, RHS); 2714 std::swap(ShiftLHS, ShiftRHS); 2715 } 2716 2717 return DAG.getNode(AArch64ISD::EXTR, DL, VT, 2718 LHS, RHS, 2719 DAG.getConstant(ShiftRHS, MVT::i64)); 2720} 2721 2722/// Target-specific dag combine xforms for ISD::OR 2723static SDValue PerformORCombine(SDNode *N, 2724 TargetLowering::DAGCombinerInfo &DCI, 2725 const AArch64Subtarget *Subtarget) { 2726 2727 SelectionDAG &DAG = DCI.DAG; 2728 EVT VT = N->getValueType(0); 2729 2730 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 2731 return SDValue(); 2732 2733 // Attempt to recognise bitfield-insert operations. 2734 SDValue Res = tryCombineToBFI(N, DCI, Subtarget); 2735 if (Res.getNode()) 2736 return Res; 2737 2738 // Attempt to combine an existing MaskedBFI operation into one with a larger 2739 // mask. 
2740 Res = tryCombineToLargerBFI(N, DCI, Subtarget); 2741 if (Res.getNode()) 2742 return Res; 2743 2744 Res = tryCombineToEXTR(N, DCI); 2745 if (Res.getNode()) 2746 return Res; 2747 2748 return SDValue(); 2749} 2750 2751/// Target-specific dag combine xforms for ISD::SRA 2752static SDValue PerformSRACombine(SDNode *N, 2753 TargetLowering::DAGCombinerInfo &DCI) { 2754 2755 SelectionDAG &DAG = DCI.DAG; 2756 SDLoc DL(N); 2757 EVT VT = N->getValueType(0); 2758 2759 // We're looking for an SRA/SHL pair which form an SBFX. 2760 2761 if (VT != MVT::i32 && VT != MVT::i64) 2762 return SDValue(); 2763 2764 if (!isa<ConstantSDNode>(N->getOperand(1))) 2765 return SDValue(); 2766 2767 uint64_t ExtraSignBits = N->getConstantOperandVal(1); 2768 SDValue Shift = N->getOperand(0); 2769 2770 if (Shift.getOpcode() != ISD::SHL) 2771 return SDValue(); 2772 2773 if (!isa<ConstantSDNode>(Shift->getOperand(1))) 2774 return SDValue(); 2775 2776 uint64_t BitsOnLeft = Shift->getConstantOperandVal(1); 2777 uint64_t Width = VT.getSizeInBits() - ExtraSignBits; 2778 uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft; 2779 2780 if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) 2781 return SDValue(); 2782 2783 return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0), 2784 DAG.getConstant(LSB, MVT::i64), 2785 DAG.getConstant(LSB + Width - 1, MVT::i64)); 2786} 2787 2788 2789SDValue 2790AArch64TargetLowering::PerformDAGCombine(SDNode *N, 2791 DAGCombinerInfo &DCI) const { 2792 switch (N->getOpcode()) { 2793 default: break; 2794 case ISD::AND: return PerformANDCombine(N, DCI); 2795 case ISD::OR: return PerformORCombine(N, DCI, getSubtarget()); 2796 case ISD::SRA: return PerformSRACombine(N, DCI); 2797 } 2798 return SDValue(); 2799} 2800 2801bool 2802AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 2803 VT = VT.getScalarType(); 2804 2805 if (!VT.isSimple()) 2806 return false; 2807 2808 switch (VT.getSimpleVT().SimpleTy) { 2809 case MVT::f16: 2810 case MVT::f32: 2811 case MVT::f64: 2812 return true; 2813 case MVT::f128: 2814 return false; 2815 default: 2816 break; 2817 } 2818 2819 return false; 2820} 2821 2822AArch64TargetLowering::ConstraintType 2823AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { 2824 if (Constraint.size() == 1) { 2825 switch (Constraint[0]) { 2826 default: break; 2827 case 'w': // An FP/SIMD vector register 2828 return C_RegisterClass; 2829 case 'I': // Constant that can be used with an ADD instruction 2830 case 'J': // Constant that can be used with a SUB instruction 2831 case 'K': // Constant that can be used with a 32-bit logical instruction 2832 case 'L': // Constant that can be used with a 64-bit logical instruction 2833 case 'M': // Constant that can be used as a 32-bit MOV immediate 2834 case 'N': // Constant that can be used as a 64-bit MOV immediate 2835 case 'Y': // Floating point constant zero 2836 case 'Z': // Integer constant zero 2837 return C_Other; 2838 case 'Q': // A memory reference with base register and no offset 2839 return C_Memory; 2840 case 'S': // A symbolic address 2841 return C_Other; 2842 } 2843 } 2844 2845 // FIXME: Ump, Utf, Usa, Ush 2846 // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes, 2847 // whatever they may be 2848 // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be 2849 // Usa: An absolute symbolic address 2850 // Ush: The high part (bits 32:12) of a pc-relative symbolic address 2851 assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa" 
2852 && Constraint != "Ush" && "Unimplemented constraints"); 2853 2854 return TargetLowering::getConstraintType(Constraint); 2855} 2856 2857TargetLowering::ConstraintWeight 2858AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info, 2859 const char *Constraint) const { 2860 2861 llvm_unreachable("Constraint weight unimplemented"); 2862} 2863 2864void 2865AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 2866 std::string &Constraint, 2867 std::vector<SDValue> &Ops, 2868 SelectionDAG &DAG) const { 2869 SDValue Result(0, 0); 2870 2871 // Only length 1 constraints are C_Other. 2872 if (Constraint.size() != 1) return; 2873 2874 // Only C_Other constraints get lowered like this. That means constants for us 2875 // so return early if there's no hope the constraint can be lowered. 2876 2877 switch(Constraint[0]) { 2878 default: break; 2879 case 'I': case 'J': case 'K': case 'L': 2880 case 'M': case 'N': case 'Z': { 2881 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 2882 if (!C) 2883 return; 2884 2885 uint64_t CVal = C->getZExtValue(); 2886 uint32_t Bits; 2887 2888 switch (Constraint[0]) { 2889 default: 2890 // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J' 2891 // is a peculiarly useless SUB constraint. 2892 llvm_unreachable("Unimplemented C_Other constraint"); 2893 case 'I': 2894 if (CVal <= 0xfff) 2895 break; 2896 return; 2897 case 'K': 2898 if (A64Imms::isLogicalImm(32, CVal, Bits)) 2899 break; 2900 return; 2901 case 'L': 2902 if (A64Imms::isLogicalImm(64, CVal, Bits)) 2903 break; 2904 return; 2905 case 'Z': 2906 if (CVal == 0) 2907 break; 2908 return; 2909 } 2910 2911 Result = DAG.getTargetConstant(CVal, Op.getValueType()); 2912 break; 2913 } 2914 case 'S': { 2915 // An absolute symbolic address or label reference. 2916 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { 2917 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), 2918 GA->getValueType(0)); 2919 } else if (const BlockAddressSDNode *BA 2920 = dyn_cast<BlockAddressSDNode>(Op)) { 2921 Result = DAG.getTargetBlockAddress(BA->getBlockAddress(), 2922 BA->getValueType(0)); 2923 } else if (const ExternalSymbolSDNode *ES 2924 = dyn_cast<ExternalSymbolSDNode>(Op)) { 2925 Result = DAG.getTargetExternalSymbol(ES->getSymbol(), 2926 ES->getValueType(0)); 2927 } else 2928 return; 2929 break; 2930 } 2931 case 'Y': 2932 if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) { 2933 if (CFP->isExactlyValue(0.0)) { 2934 Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0)); 2935 break; 2936 } 2937 } 2938 return; 2939 } 2940 2941 if (Result.getNode()) { 2942 Ops.push_back(Result); 2943 return; 2944 } 2945 2946 // It's an unknown constraint for us. Let generic code have a go. 
2947 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 2948} 2949 2950std::pair<unsigned, const TargetRegisterClass*> 2951AArch64TargetLowering::getRegForInlineAsmConstraint( 2952 const std::string &Constraint, 2953 MVT VT) const { 2954 if (Constraint.size() == 1) { 2955 switch (Constraint[0]) { 2956 case 'r': 2957 if (VT.getSizeInBits() <= 32) 2958 return std::make_pair(0U, &AArch64::GPR32RegClass); 2959 else if (VT == MVT::i64) 2960 return std::make_pair(0U, &AArch64::GPR64RegClass); 2961 break; 2962 case 'w': 2963 if (VT == MVT::f16) 2964 return std::make_pair(0U, &AArch64::FPR16RegClass); 2965 else if (VT == MVT::f32) 2966 return std::make_pair(0U, &AArch64::FPR32RegClass); 2967 else if (VT == MVT::f64) 2968 return std::make_pair(0U, &AArch64::FPR64RegClass); 2969 else if (VT.getSizeInBits() == 64) 2970 return std::make_pair(0U, &AArch64::VPR64RegClass); 2971 else if (VT == MVT::f128) 2972 return std::make_pair(0U, &AArch64::FPR128RegClass); 2973 else if (VT.getSizeInBits() == 128) 2974 return std::make_pair(0U, &AArch64::VPR128RegClass); 2975 break; 2976 } 2977 } 2978 2979 // Use the default implementation in TargetLowering to convert the register 2980 // constraint into a member of a register class. 2981 return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 2982} 2983