//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);

  setOperationAction(ISD::ADD, MVT::v4i32, Expand);
  setOperationAction(ISD::AND, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  setOperationAction(ISD::FPOW, MVT::f32, Custom);

  setOperationAction(ISD::ROTL, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  // Legalize loads and stores to the private address space.
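  // Private (stack) accesses have no native load/store instructions; LowerLOAD
  // and LowerSTORE below turn them into indirect register accesses
  // (AMDGPUISD::REGISTER_LOAD / AMDGPUISD::REGISTER_STORE).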
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::VLIW);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
                                     EndBlock = BB->end();
         NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

using namespace llvm::Intrinsic;
using namespace llvm::AMDGPUIntrinsic;

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::ROTL: return LowerROTL(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FPOW: return LowerFPOW(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    DebugLoc DL = Op.getDebugLoc();
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));

      return SDValue(interp, slot % 2);
    }

    case r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

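// Custom type legalization hook: replace the results of nodes whose result
// types are not legal (see the FP_TO_UINT, LOAD and STORE cases below) with
// the values produced by the corresponding custom lowering routines.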
void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      Op.getDebugLoc(),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   DebugLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}

SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  // rotl(x, n) is implemented as bitalign(x, x, 32 - n).
  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
                     Op.getOperand(0),
                     Op.getOperand(0),
                     DAG.getNode(ISD::SUB, DL, VT,
                                 DAG.getConstant(32, MVT::i32),
                                 Op.getOperand(1)));
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type.
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1, 0, cc_any
  //

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0,   f32, f32, cc_any
  // select_cc i32, 0,   i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                                     Cond, Zero,
                                     True, False,
                                     DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern.
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
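  // The inner SELECT_CC materializes the comparison result as a hardware
  // boolean (-1/0 for i32, 1.0f/0.0f for f32); the outer SELECT_CC then picks
  // between True and False by testing that boolean against HWFalse with SETNE,
  // a form that matches the CND* patterns above.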
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                             LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}

SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
                     Op.getDebugLoc(),
                     Op.getValueType(),
                     Op.getOperand(0),
                     DAG.getConstant(0, MVT::i32),
                     Op.getOperand(1),
                     Op.getOperand(2),
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
/// convert these pointers to a register index.  Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
    // Convert pointer from byte address to dword address.
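    // The byte address is divided by 4 (shift right by 2) and wrapped in a
    // DWORDADDR node so the conversion is only applied once.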
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                  Ptr, DAG.getConstant(2, MVT::i32)));

    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
      assert(!"Truncated and indexed stores not supported yet");
    } else {
      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    }
    return Chain;
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing.

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// Returns 512 + (kc_bank << 12).
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
        dyn_cast<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and
        // then divide by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // A non-constant Ptr can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing.
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

SDValue R600TargetLowering::LowerFPOW(SDValue Op,
                                      SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  // pow(x, y) == exp2(y * log2(x)).
  SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
  SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
  return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain,
    CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins,
    DebugLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
      DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
        32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
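  //
  // The fold is valid because fneg(1.0) == -1.0 and fneg(0.0) == -0.0, and
  // fp_to_sint maps those to -1 and 0 respectively, which is exactly the
  // integer -1/0 boolean that the SET*_DX10 instructions produce.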
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32), // False
                       SelectCC.getOperand(4)); // CC

    break;
  }
  // (extract_vector_elt (build_vector ...)) nodes generated by custom lowering
  // also need to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      return DAG.getSelectCC(N->getDebugLoc(),
                             LHS.getOperand(0),
                             LHS.getOperand(1),
                             LHS.getOperand(2),
                             LHS.getOperand(3),
                             LHSCC);
    }
    }
  }
  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;
    SDValue NewBldVec[4] = {
      DAG.getUNDEF(MVT::f32),
      DAG.getUNDEF(MVT::f32),
      DAG.getUNDEF(MVT::f32),
      DAG.getUNDEF(MVT::f32)
    };
    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
        if (C->isZero()) {
          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
        } else if (C->isExactlyValue(1.0)) {
          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_1
        } else {
          NewBldVec[i] = Arg.getOperand(i);
        }
      } else {
        NewBldVec[i] = Arg.getOperand(i);
      }
    }
    DebugLoc DL = N->getDebugLoc();
    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  }
  return SDValue();
}