X86ISelLowering.cpp revision 15d03fb7f496562d7256ecac69f63e08ee0bfd2e
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "x86-isel" 16#include "X86.h" 17#include "X86InstrBuilder.h" 18#include "X86ISelLowering.h" 19#include "X86TargetMachine.h" 20#include "X86TargetObjectFile.h" 21#include "Utils/X86ShuffleDecode.h" 22#include "llvm/CallingConv.h" 23#include "llvm/Constants.h" 24#include "llvm/DerivedTypes.h" 25#include "llvm/GlobalAlias.h" 26#include "llvm/GlobalVariable.h" 27#include "llvm/Function.h" 28#include "llvm/Instructions.h" 29#include "llvm/Intrinsics.h" 30#include "llvm/LLVMContext.h" 31#include "llvm/CodeGen/IntrinsicLowering.h" 32#include "llvm/CodeGen/MachineFrameInfo.h" 33#include "llvm/CodeGen/MachineFunction.h" 34#include "llvm/CodeGen/MachineInstrBuilder.h" 35#include "llvm/CodeGen/MachineJumpTableInfo.h" 36#include "llvm/CodeGen/MachineModuleInfo.h" 37#include "llvm/CodeGen/MachineRegisterInfo.h" 38#include "llvm/CodeGen/PseudoSourceValue.h" 39#include "llvm/MC/MCAsmInfo.h" 40#include "llvm/MC/MCContext.h" 41#include "llvm/MC/MCExpr.h" 42#include "llvm/MC/MCSymbol.h" 43#include "llvm/ADT/BitVector.h" 44#include "llvm/ADT/SmallSet.h" 45#include "llvm/ADT/Statistic.h" 46#include "llvm/ADT/StringExtras.h" 47#include "llvm/ADT/VectorExtras.h" 48#include "llvm/Support/CallSite.h" 49#include "llvm/Support/Debug.h" 50#include "llvm/Support/Dwarf.h" 51#include "llvm/Support/ErrorHandling.h" 52#include "llvm/Support/MathExtras.h" 53#include "llvm/Support/raw_ostream.h" 54using namespace llvm; 55using namespace dwarf; 56 57STATISTIC(NumTailCalls, "Number of tail calls"); 58 59// Forward declarations. 60static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 61 SDValue V2); 62 63static SDValue Insert128BitVector(SDValue Result, 64 SDValue Vec, 65 SDValue Idx, 66 SelectionDAG &DAG, 67 DebugLoc dl); 68 69static SDValue Extract128BitVector(SDValue Vec, 70 SDValue Idx, 71 SelectionDAG &DAG, 72 DebugLoc dl); 73 74/// Generate a DAG to grab 128-bits from a vector > 128 bits. This 75/// sets things up to match to an AVX VEXTRACTF128 instruction or a 76/// simple subregister reference. Idx is an index in the 128 bits we 77/// want. It need not be aligned to a 128-bit bounday. That makes 78/// lowering EXTRACT_VECTOR_ELT operations easier. 79static SDValue Extract128BitVector(SDValue Vec, 80 SDValue Idx, 81 SelectionDAG &DAG, 82 DebugLoc dl) { 83 EVT VT = Vec.getValueType(); 84 assert(VT.getSizeInBits() == 256 && "Unexpected vector size!"); 85 EVT ElVT = VT.getVectorElementType(); 86 int Factor = VT.getSizeInBits()/128; 87 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, 88 VT.getVectorNumElements()/Factor); 89 90 // Extract from UNDEF is UNDEF. 91 if (Vec.getOpcode() == ISD::UNDEF) 92 return DAG.getNode(ISD::UNDEF, dl, ResultVT); 93 94 if (isa<ConstantSDNode>(Idx)) { 95 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 96 97 // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR 98 // we can match to VEXTRACTF128. 
99 unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); 100 101 // This is the index of the first element of the 128-bit chunk 102 // we want. 103 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) 104 * ElemsPerChunk); 105 106 SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); 107 SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, 108 VecIdx); 109 110 return Result; 111 } 112 113 return SDValue(); 114} 115 116/// Generate a DAG to put 128-bits into a vector > 128 bits. This 117/// sets things up to match to an AVX VINSERTF128 instruction or a 118/// simple superregister reference. Idx is an index in the 128 bits 119/// we want. It need not be aligned to a 128-bit bounday. That makes 120/// lowering INSERT_VECTOR_ELT operations easier. 121static SDValue Insert128BitVector(SDValue Result, 122 SDValue Vec, 123 SDValue Idx, 124 SelectionDAG &DAG, 125 DebugLoc dl) { 126 if (isa<ConstantSDNode>(Idx)) { 127 EVT VT = Vec.getValueType(); 128 assert(VT.getSizeInBits() == 128 && "Unexpected vector size!"); 129 130 EVT ElVT = VT.getVectorElementType(); 131 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 132 EVT ResultVT = Result.getValueType(); 133 134 // Insert the relevant 128 bits. 135 unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); 136 137 // This is the index of the first element of the 128-bit chunk 138 // we want. 139 unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) 140 * ElemsPerChunk); 141 142 SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); 143 Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, 144 VecIdx); 145 return Result; 146 } 147 148 return SDValue(); 149} 150 151static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 152 const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); 153 bool is64Bit = Subtarget->is64Bit(); 154 155 if (Subtarget->isTargetEnvMacho()) { 156 if (is64Bit) 157 return new X8664_MachoTargetObjectFile(); 158 return new TargetLoweringObjectFileMachO(); 159 } 160 161 if (Subtarget->isTargetELF()) 162 return new TargetLoweringObjectFileELF(); 163 if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) 164 return new TargetLoweringObjectFileCOFF(); 165 llvm_unreachable("unknown subtarget type"); 166} 167 168X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 169 : TargetLowering(TM, createTLOF(TM)) { 170 Subtarget = &TM.getSubtarget<X86Subtarget>(); 171 X86ScalarSSEf64 = Subtarget->hasXMMInt(); 172 X86ScalarSSEf32 = Subtarget->hasXMM(); 173 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 174 175 RegInfo = TM.getRegisterInfo(); 176 TD = getTargetData(); 177 178 // Set up the TargetLowering object. 179 static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; 180 181 // X86 is weird, it always uses i8 for shift amounts and setcc results. 182 setBooleanContents(ZeroOrOneBooleanContent); 183 184 // For 64-bit since we have so many registers use the ILP scheduler, for 185 // 32-bit code use the register pressure specific scheduling. 186 if (Subtarget->is64Bit()) 187 setSchedulingPreference(Sched::ILP); 188 else 189 setSchedulingPreference(Sched::RegPressure); 190 setStackPointerRegisterToSaveRestore(X86StackPtr); 191 192 if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { 193 // Setup Windows compiler runtime calls. 
194 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 195 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 196 setLibcallName(RTLIB::SREM_I64, "_allrem"); 197 setLibcallName(RTLIB::UREM_I64, "_aullrem"); 198 setLibcallName(RTLIB::MUL_I64, "_allmul"); 199 setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); 200 setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); 201 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 202 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 203 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); 204 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); 205 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); 206 setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); 207 setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); 208 } 209 210 if (Subtarget->isTargetDarwin()) { 211 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 212 setUseUnderscoreSetJmp(false); 213 setUseUnderscoreLongJmp(false); 214 } else if (Subtarget->isTargetMingw()) { 215 // MS runtime is weird: it exports _setjmp, but longjmp! 216 setUseUnderscoreSetJmp(true); 217 setUseUnderscoreLongJmp(false); 218 } else { 219 setUseUnderscoreSetJmp(true); 220 setUseUnderscoreLongJmp(true); 221 } 222 223 // Set up the register classes. 224 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 225 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 226 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 227 if (Subtarget->is64Bit()) 228 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 229 230 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 231 232 // We don't accept any truncstore of integer registers. 233 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 234 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 235 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 236 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 237 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 238 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 239 240 // SETOEQ and SETUNE require checking two conditions. 241 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 242 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 243 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 244 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 245 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 246 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 247 248 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 249 // operation. 250 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 251 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 252 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 253 254 if (Subtarget->is64Bit()) { 255 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 256 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 257 } else if (!UseSoftFloat) { 258 // We have an algorithm for SSE2->double, and we turn this into a 259 // 64-bit FILD followed by conditional FADD for other targets. 260 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 261 // We have an algorithm for SSE2, and we turn this into a 64-bit 262 // FILD for other targets. 263 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 264 } 265 266 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 267 // this operation. 
268 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 269 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 270 271 if (!UseSoftFloat) { 272 // SSE has no i16 to fp conversion, only i32 273 if (X86ScalarSSEf32) { 274 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 275 // f32 and f64 cases are Legal, f80 case is not 276 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 277 } else { 278 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 279 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 280 } 281 } else { 282 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 283 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 284 } 285 286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 287 // are Legal, f80 is custom lowered. 288 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 289 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 290 291 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 292 // this operation. 293 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 294 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 295 296 if (X86ScalarSSEf32) { 297 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 298 // f32 and f64 cases are Legal, f80 case is not 299 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 300 } else { 301 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 302 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 303 } 304 305 // Handle FP_TO_UINT by promoting the destination to a larger signed 306 // conversion. 307 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 308 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 309 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 310 311 if (Subtarget->is64Bit()) { 312 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 313 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 314 } else if (!UseSoftFloat) { 315 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 316 // Expand FP_TO_UINT into a select. 317 // FIXME: We would like to use a Custom expander here eventually to do 318 // the optimal thing for SSE vs. the default expansion in the legalizer. 319 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 320 else 321 // With SSE3 we can use fisttpll to convert to a signed i64; without 322 // SSE, we're stuck with a fistpll. 323 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 324 } 325 326 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 327 if (!X86ScalarSSEf64) { 328 setOperationAction(ISD::BITCAST , MVT::f32 , Expand); 329 setOperationAction(ISD::BITCAST , MVT::i32 , Expand); 330 if (Subtarget->is64Bit()) { 331 setOperationAction(ISD::BITCAST , MVT::f64 , Expand); 332 // Without SSE, i64->f64 goes through memory. 333 setOperationAction(ISD::BITCAST , MVT::i64 , Expand); 334 } 335 } 336 337 // Scalar integer divide and remainder are lowered to use operations that 338 // produce two results, to match the available instructions. This exposes 339 // the two-result form to trivial CSE, which is able to combine x/y and x%y 340 // into a single instruction. 341 // 342 // Scalar integer multiply-high is also lowered to use two-result 343 // operations, to match the available instructions. However, plain multiply 344 // (low) operations are left as Legal, as there are single-result 345 // instructions for this in x86. 
Using the two-result multiply instructions 346 // when both high and low results are needed must be arranged by dagcombine. 347 for (unsigned i = 0, e = 4; i != e; ++i) { 348 MVT VT = IntVTs[i]; 349 setOperationAction(ISD::MULHS, VT, Expand); 350 setOperationAction(ISD::MULHU, VT, Expand); 351 setOperationAction(ISD::SDIV, VT, Expand); 352 setOperationAction(ISD::UDIV, VT, Expand); 353 setOperationAction(ISD::SREM, VT, Expand); 354 setOperationAction(ISD::UREM, VT, Expand); 355 356 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 357 setOperationAction(ISD::ADDC, VT, Custom); 358 setOperationAction(ISD::ADDE, VT, Custom); 359 setOperationAction(ISD::SUBC, VT, Custom); 360 setOperationAction(ISD::SUBE, VT, Custom); 361 } 362 363 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 364 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 365 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 366 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 367 if (Subtarget->is64Bit()) 368 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 369 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 370 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 371 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 372 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 373 setOperationAction(ISD::FREM , MVT::f32 , Expand); 374 setOperationAction(ISD::FREM , MVT::f64 , Expand); 375 setOperationAction(ISD::FREM , MVT::f80 , Expand); 376 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 377 378 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 379 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 380 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 381 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 382 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 383 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 384 if (Subtarget->is64Bit()) { 385 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 386 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 387 } 388 389 if (Subtarget->hasPOPCNT()) { 390 setOperationAction(ISD::CTPOP , MVT::i8 , Promote); 391 } else { 392 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 393 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 394 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 395 if (Subtarget->is64Bit()) 396 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 397 } 398 399 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 400 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 401 402 // These should be promoted to a larger select which is supported. 403 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 404 // X86 wants to expand cmov itself. 
405 setOperationAction(ISD::SELECT , MVT::i8 , Custom); 406 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 407 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 408 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 409 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 410 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 411 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 412 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 413 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 414 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 415 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 416 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 417 if (Subtarget->is64Bit()) { 418 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 419 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 420 } 421 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 422 423 // Darwin ABI issue. 424 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 425 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 426 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 427 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 428 if (Subtarget->is64Bit()) 429 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 430 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 431 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); 432 if (Subtarget->is64Bit()) { 433 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 434 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 435 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 436 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 437 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); 438 } 439 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) 440 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 441 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 442 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 443 if (Subtarget->is64Bit()) { 444 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 445 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 446 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 447 } 448 449 if (Subtarget->hasXMM()) 450 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 451 452 setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); 453 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); 454 455 // On X86 and X86-64, atomic operations are lowered to locked instructions. 456 // Locked instructions, in turn, have implicit fence semantics (all memory 457 // operations are flushed before issuing the locked instruction, and they 458 // are not buffered), so we can fold away the common pattern of 459 // fence-atomic-fence. 
460 setShouldFoldAtomicFences(true); 461 462 // Expand certain atomics 463 for (unsigned i = 0, e = 4; i != e; ++i) { 464 MVT VT = IntVTs[i]; 465 setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); 466 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); 467 } 468 469 if (!Subtarget->is64Bit()) { 470 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 471 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 472 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 473 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 474 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 475 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 476 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 477 } 478 479 // FIXME - use subtarget debug flags 480 if (!Subtarget->isTargetDarwin() && 481 !Subtarget->isTargetELF() && 482 !Subtarget->isTargetCygMing()) { 483 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 484 } 485 486 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 487 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 488 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 489 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 490 if (Subtarget->is64Bit()) { 491 setExceptionPointerRegister(X86::RAX); 492 setExceptionSelectorRegister(X86::RDX); 493 } else { 494 setExceptionPointerRegister(X86::EAX); 495 setExceptionSelectorRegister(X86::EDX); 496 } 497 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 498 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 499 500 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 501 502 setOperationAction(ISD::TRAP, MVT::Other, Legal); 503 504 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 505 setOperationAction(ISD::VASTART , MVT::Other, Custom); 506 setOperationAction(ISD::VAEND , MVT::Other, Expand); 507 if (Subtarget->is64Bit()) { 508 setOperationAction(ISD::VAARG , MVT::Other, Custom); 509 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 510 } else { 511 setOperationAction(ISD::VAARG , MVT::Other, Expand); 512 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 513 } 514 515 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 516 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 517 setOperationAction(ISD::DYNAMIC_STACKALLOC, 518 (Subtarget->is64Bit() ? MVT::i64 : MVT::i32), 519 (Subtarget->isTargetCOFF() 520 && !Subtarget->isTargetEnvMacho() 521 ? Custom : Expand)); 522 523 if (!UseSoftFloat && X86ScalarSSEf64) { 524 // f32 and f64 use SSE. 525 // Set up the FP register classes. 526 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 527 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 528 529 // Use ANDPD to simulate FABS. 530 setOperationAction(ISD::FABS , MVT::f64, Custom); 531 setOperationAction(ISD::FABS , MVT::f32, Custom); 532 533 // Use XORP to simulate FNEG. 534 setOperationAction(ISD::FNEG , MVT::f64, Custom); 535 setOperationAction(ISD::FNEG , MVT::f32, Custom); 536 537 // Use ANDPD and ORPD to simulate FCOPYSIGN. 538 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 539 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 540 541 // Lower this to FGETSIGNx86 plus an AND. 
542 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); 543 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); 544 545 // We don't support sin/cos/fmod 546 setOperationAction(ISD::FSIN , MVT::f64, Expand); 547 setOperationAction(ISD::FCOS , MVT::f64, Expand); 548 setOperationAction(ISD::FSIN , MVT::f32, Expand); 549 setOperationAction(ISD::FCOS , MVT::f32, Expand); 550 551 // Expand FP immediates into loads from the stack, except for the special 552 // cases we handle. 553 addLegalFPImmediate(APFloat(+0.0)); // xorpd 554 addLegalFPImmediate(APFloat(+0.0f)); // xorps 555 } else if (!UseSoftFloat && X86ScalarSSEf32) { 556 // Use SSE for f32, x87 for f64. 557 // Set up the FP register classes. 558 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 559 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 560 561 // Use ANDPS to simulate FABS. 562 setOperationAction(ISD::FABS , MVT::f32, Custom); 563 564 // Use XORP to simulate FNEG. 565 setOperationAction(ISD::FNEG , MVT::f32, Custom); 566 567 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 568 569 // Use ANDPS and ORPS to simulate FCOPYSIGN. 570 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 571 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 572 573 // We don't support sin/cos/fmod 574 setOperationAction(ISD::FSIN , MVT::f32, Expand); 575 setOperationAction(ISD::FCOS , MVT::f32, Expand); 576 577 // Special cases we handle for FP constants. 578 addLegalFPImmediate(APFloat(+0.0f)); // xorps 579 addLegalFPImmediate(APFloat(+0.0)); // FLD0 580 addLegalFPImmediate(APFloat(+1.0)); // FLD1 581 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 582 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 583 584 if (!UnsafeFPMath) { 585 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 586 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 587 } 588 } else if (!UseSoftFloat) { 589 // f32 and f64 in x87. 590 // Set up the FP register classes. 591 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 592 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 593 594 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 595 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 596 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 597 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 598 599 if (!UnsafeFPMath) { 600 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 601 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 602 } 603 addLegalFPImmediate(APFloat(+0.0)); // FLD0 604 addLegalFPImmediate(APFloat(+1.0)); // FLD1 605 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 606 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 607 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 608 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 609 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 610 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 611 } 612 613 // We don't support FMA. 614 setOperationAction(ISD::FMA, MVT::f64, Expand); 615 setOperationAction(ISD::FMA, MVT::f32, Expand); 616 617 // Long double always uses X87. 
618 if (!UseSoftFloat) { 619 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 620 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 621 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 622 { 623 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); 624 addLegalFPImmediate(TmpFlt); // FLD0 625 TmpFlt.changeSign(); 626 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 627 628 bool ignored; 629 APFloat TmpFlt2(+1.0); 630 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 631 &ignored); 632 addLegalFPImmediate(TmpFlt2); // FLD1 633 TmpFlt2.changeSign(); 634 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 635 } 636 637 if (!UnsafeFPMath) { 638 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 639 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 640 } 641 642 setOperationAction(ISD::FMA, MVT::f80, Expand); 643 } 644 645 // Always use a library call for pow. 646 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 647 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 648 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 649 650 setOperationAction(ISD::FLOG, MVT::f80, Expand); 651 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 652 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 653 setOperationAction(ISD::FEXP, MVT::f80, Expand); 654 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 655 656 // First set operation action for all vector types to either promote 657 // (for widening) or expand (for scalarization). Then we will selectively 658 // turn on ones that can be effectively codegen'd. 659 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 660 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 661 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 662 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 663 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 664 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 665 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 666 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 667 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 668 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 669 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 670 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 671 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 672 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 673 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 674 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 675 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 676 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 677 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 678 setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 679 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 680 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 681 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 682 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 683 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 684 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 685 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 686 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 687 
setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 688 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 689 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 690 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 691 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 692 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 693 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 694 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 695 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 696 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 697 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 698 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 699 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 700 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 701 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 702 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 703 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 704 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 705 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 706 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 707 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 708 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 709 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 710 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 711 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 712 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 713 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 714 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 715 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 716 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 717 setTruncStoreAction((MVT::SimpleValueType)VT, 718 (MVT::SimpleValueType)InnerVT, Expand); 719 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 720 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 721 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 722 } 723 724 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 725 // with -msoft-float, disable use of MMX as well. 726 if (!UseSoftFloat && Subtarget->hasMMX()) { 727 addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); 728 // No operations on x86mmx supported, everything uses intrinsics. 729 } 730 731 // MMX-sized vectors (other than x86mmx) are expected to be expanded 732 // into smaller operations. 
733 setOperationAction(ISD::MULHS, MVT::v8i8, Expand); 734 setOperationAction(ISD::MULHS, MVT::v4i16, Expand); 735 setOperationAction(ISD::MULHS, MVT::v2i32, Expand); 736 setOperationAction(ISD::MULHS, MVT::v1i64, Expand); 737 setOperationAction(ISD::AND, MVT::v8i8, Expand); 738 setOperationAction(ISD::AND, MVT::v4i16, Expand); 739 setOperationAction(ISD::AND, MVT::v2i32, Expand); 740 setOperationAction(ISD::AND, MVT::v1i64, Expand); 741 setOperationAction(ISD::OR, MVT::v8i8, Expand); 742 setOperationAction(ISD::OR, MVT::v4i16, Expand); 743 setOperationAction(ISD::OR, MVT::v2i32, Expand); 744 setOperationAction(ISD::OR, MVT::v1i64, Expand); 745 setOperationAction(ISD::XOR, MVT::v8i8, Expand); 746 setOperationAction(ISD::XOR, MVT::v4i16, Expand); 747 setOperationAction(ISD::XOR, MVT::v2i32, Expand); 748 setOperationAction(ISD::XOR, MVT::v1i64, Expand); 749 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); 750 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); 751 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); 752 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); 753 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 754 setOperationAction(ISD::SELECT, MVT::v8i8, Expand); 755 setOperationAction(ISD::SELECT, MVT::v4i16, Expand); 756 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 757 setOperationAction(ISD::SELECT, MVT::v1i64, Expand); 758 setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); 759 setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); 760 setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); 761 setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); 762 763 if (!UseSoftFloat && Subtarget->hasXMM()) { 764 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 765 766 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 767 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 768 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 769 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 770 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 771 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 772 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 773 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 774 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 775 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 776 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 777 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 778 } 779 780 if (!UseSoftFloat && Subtarget->hasXMMInt()) { 781 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 782 783 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 784 // registers cannot be used even for integer operations. 
785 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 786 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 787 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 788 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 789 790 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 791 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 792 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 793 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 794 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 795 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 796 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 797 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 798 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 799 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 800 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 801 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 802 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 803 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 804 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 805 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 806 807 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 808 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 809 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 810 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 811 812 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 813 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 814 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 815 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 816 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 817 818 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 819 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 820 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 821 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 822 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 823 824 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 825 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 826 EVT VT = (MVT::SimpleValueType)i; 827 // Do not attempt to custom lower non-power-of-2 vectors 828 if (!isPowerOf2_32(VT.getVectorNumElements())) 829 continue; 830 // Do not attempt to custom lower non-128-bit vectors 831 if (!VT.is128BitVector()) 832 continue; 833 setOperationAction(ISD::BUILD_VECTOR, 834 VT.getSimpleVT().SimpleTy, Custom); 835 setOperationAction(ISD::VECTOR_SHUFFLE, 836 VT.getSimpleVT().SimpleTy, Custom); 837 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 838 VT.getSimpleVT().SimpleTy, Custom); 839 } 840 841 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 842 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 843 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 844 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 845 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 846 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 847 848 if (Subtarget->is64Bit()) { 849 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 850 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 851 } 852 853 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
854 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { 855 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 856 EVT VT = SVT; 857 858 // Do not attempt to promote non-128-bit vectors 859 if (!VT.is128BitVector()) 860 continue; 861 862 setOperationAction(ISD::AND, SVT, Promote); 863 AddPromotedToType (ISD::AND, SVT, MVT::v2i64); 864 setOperationAction(ISD::OR, SVT, Promote); 865 AddPromotedToType (ISD::OR, SVT, MVT::v2i64); 866 setOperationAction(ISD::XOR, SVT, Promote); 867 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); 868 setOperationAction(ISD::LOAD, SVT, Promote); 869 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); 870 setOperationAction(ISD::SELECT, SVT, Promote); 871 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); 872 } 873 874 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 875 876 // Custom lower v2i64 and v2f64 selects. 877 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 878 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 879 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 880 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 881 882 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 883 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 884 } 885 886 if (Subtarget->hasSSE41()) { 887 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 888 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 889 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 890 setOperationAction(ISD::FRINT, MVT::f32, Legal); 891 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 892 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 893 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 894 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 895 setOperationAction(ISD::FRINT, MVT::f64, Legal); 896 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 897 898 // FIXME: Do we need to handle scalar-to-vector here? 899 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 900 901 // Can turn SHL into an integer multiply. 902 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 903 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 904 905 // i8 and i16 vectors are custom , because the source register and source 906 // source memory operand types are not the same width. f32 vectors are 907 // custom since the immediate controlling the insert encodes additional 908 // information. 
909 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 910 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 911 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 912 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 913 914 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 915 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 916 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 917 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 918 919 if (Subtarget->is64Bit()) { 920 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 921 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 922 } 923 } 924 925 if (Subtarget->hasSSE2()) { 926 setOperationAction(ISD::SRL, MVT::v2i64, Custom); 927 setOperationAction(ISD::SRL, MVT::v4i32, Custom); 928 setOperationAction(ISD::SRL, MVT::v16i8, Custom); 929 930 setOperationAction(ISD::SHL, MVT::v2i64, Custom); 931 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 932 setOperationAction(ISD::SHL, MVT::v8i16, Custom); 933 934 setOperationAction(ISD::SRA, MVT::v4i32, Custom); 935 setOperationAction(ISD::SRA, MVT::v8i16, Custom); 936 } 937 938 if (Subtarget->hasSSE42()) 939 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 940 941 if (!UseSoftFloat && Subtarget->hasAVX()) { 942 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 943 addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); 944 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 945 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 946 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 947 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 948 949 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 950 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 951 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 952 953 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 954 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 955 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 956 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 957 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 958 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 959 960 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 961 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 962 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 963 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 964 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 965 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 966 967 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); 968 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); 969 970 // Custom lower several nodes for 256-bit types. 971 for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 972 i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { 973 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 974 EVT VT = SVT; 975 976 // Extract subvector is special because the value type 977 // (result) is 128-bit but the source is 256-bit wide. 
978 if (VT.is128BitVector()) 979 setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom); 980 981 // Do not attempt to custom lower other non-256-bit vectors 982 if (!VT.is256BitVector()) 983 continue; 984 985 setOperationAction(ISD::BUILD_VECTOR, SVT, Custom); 986 setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom); 987 setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom); 988 setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom); 989 setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom); 990 setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom); 991 } 992 993 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 994 for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) { 995 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 996 EVT VT = SVT; 997 998 // Do not attempt to promote non-256-bit vectors 999 if (!VT.is256BitVector()) 1000 continue; 1001 1002 setOperationAction(ISD::AND, SVT, Promote); 1003 AddPromotedToType (ISD::AND, SVT, MVT::v4i64); 1004 setOperationAction(ISD::OR, SVT, Promote); 1005 AddPromotedToType (ISD::OR, SVT, MVT::v4i64); 1006 setOperationAction(ISD::XOR, SVT, Promote); 1007 AddPromotedToType (ISD::XOR, SVT, MVT::v4i64); 1008 setOperationAction(ISD::LOAD, SVT, Promote); 1009 AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64); 1010 setOperationAction(ISD::SELECT, SVT, Promote); 1011 AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64); 1012 } 1013 } 1014 1015 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion 1016 // of this type with custom code. 1017 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 1018 VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { 1019 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); 1020 } 1021 1022 // We want to custom lower some of our intrinsics. 1023 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1024 1025 1026 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 1027 // handle type legalization for these operations here. 1028 // 1029 // FIXME: We really should do custom legalization for addition and 1030 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 1031 // than generic legalization for 64-bit multiplication-with-overflow, though. 1032 for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { 1033 // Add/Sub/Mul with overflow operations are custom lowered. 1034 MVT VT = IntVTs[i]; 1035 setOperationAction(ISD::SADDO, VT, Custom); 1036 setOperationAction(ISD::UADDO, VT, Custom); 1037 setOperationAction(ISD::SSUBO, VT, Custom); 1038 setOperationAction(ISD::USUBO, VT, Custom); 1039 setOperationAction(ISD::SMULO, VT, Custom); 1040 setOperationAction(ISD::UMULO, VT, Custom); 1041 } 1042 1043 // There are no 8-bit 3-address imul/mul instructions 1044 setOperationAction(ISD::SMULO, MVT::i8, Expand); 1045 setOperationAction(ISD::UMULO, MVT::i8, Expand); 1046 1047 if (!Subtarget->is64Bit()) { 1048 // These libcalls are not available in 32-bit. 
1049 setLibcallName(RTLIB::SHL_I128, 0); 1050 setLibcallName(RTLIB::SRL_I128, 0); 1051 setLibcallName(RTLIB::SRA_I128, 0); 1052 } 1053 1054 // We have target-specific dag combine patterns for the following nodes: 1055 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1056 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1057 setTargetDAGCombine(ISD::BUILD_VECTOR); 1058 setTargetDAGCombine(ISD::SELECT); 1059 setTargetDAGCombine(ISD::SHL); 1060 setTargetDAGCombine(ISD::SRA); 1061 setTargetDAGCombine(ISD::SRL); 1062 setTargetDAGCombine(ISD::OR); 1063 setTargetDAGCombine(ISD::AND); 1064 setTargetDAGCombine(ISD::ADD); 1065 setTargetDAGCombine(ISD::SUB); 1066 setTargetDAGCombine(ISD::STORE); 1067 setTargetDAGCombine(ISD::ZERO_EXTEND); 1068 setTargetDAGCombine(ISD::SINT_TO_FP); 1069 if (Subtarget->is64Bit()) 1070 setTargetDAGCombine(ISD::MUL); 1071 1072 computeRegisterProperties(); 1073 1074 // On Darwin, -Os means optimize for size without hurting performance, 1075 // do not reduce the limit. 1076 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1077 maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; 1078 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1079 maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1080 maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores 1081 maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; 1082 setPrefLoopAlignment(16); 1083 benefitFromCodePlacementOpt = true; 1084 1085 setPrefFunctionAlignment(4); 1086} 1087 1088 1089MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const { 1090 return MVT::i8; 1091} 1092 1093 1094/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1095/// the desired ByVal argument alignment. 1096static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { 1097 if (MaxAlign == 16) 1098 return; 1099 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1100 if (VTy->getBitWidth() == 128) 1101 MaxAlign = 16; 1102 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1103 unsigned EltAlign = 0; 1104 getMaxByValAlign(ATy->getElementType(), EltAlign); 1105 if (EltAlign > MaxAlign) 1106 MaxAlign = EltAlign; 1107 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 1108 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1109 unsigned EltAlign = 0; 1110 getMaxByValAlign(STy->getElementType(i), EltAlign); 1111 if (EltAlign > MaxAlign) 1112 MaxAlign = EltAlign; 1113 if (MaxAlign == 16) 1114 break; 1115 } 1116 } 1117 return; 1118} 1119 1120/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1121/// function arguments in the caller parameter area. For X86, aggregates 1122/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1123/// are at 4-byte boundaries. 1124unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { 1125 if (Subtarget->is64Bit()) { 1126 // Max of 8 and alignment of type. 1127 unsigned TyAlign = TD->getABITypeAlignment(Ty); 1128 if (TyAlign > 8) 1129 return TyAlign; 1130 return 8; 1131 } 1132 1133 unsigned Align = 4; 1134 if (Subtarget->hasXMM()) 1135 getMaxByValAlign(Ty, Align); 1136 return Align; 1137} 1138 1139/// getOptimalMemOpType - Returns the target specific optimal type for load 1140/// and store operations as a result of memset, memcpy, and memmove 1141/// lowering. If DstAlign is zero that means it's safe to destination 1142/// alignment can satisfy any constraint. 
Similarly if SrcAlign is zero it 1143/// means there isn't a need to check it against alignment requirement, 1144/// probably because the source does not need to be loaded. If 1145/// 'NonScalarIntSafe' is true, that means it's safe to return a 1146/// non-scalar-integer type, e.g. empty string source, constant, or loaded 1147/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is 1148/// constant so it does not need to be loaded. 1149/// It returns EVT::Other if the type should be determined using generic 1150/// target-independent logic. 1151EVT 1152X86TargetLowering::getOptimalMemOpType(uint64_t Size, 1153 unsigned DstAlign, unsigned SrcAlign, 1154 bool NonScalarIntSafe, 1155 bool MemcpyStrSrc, 1156 MachineFunction &MF) const { 1157 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like 1158 // linux. This is because the stack realignment code can't handle certain 1159 // cases like PR2962. This should be removed when PR2962 is fixed. 1160 const Function *F = MF.getFunction(); 1161 if (NonScalarIntSafe && 1162 !F->hasFnAttr(Attribute::NoImplicitFloat)) { 1163 if (Size >= 16 && 1164 (Subtarget->isUnalignedMemAccessFast() || 1165 ((DstAlign == 0 || DstAlign >= 16) && 1166 (SrcAlign == 0 || SrcAlign >= 16))) && 1167 Subtarget->getStackAlignment() >= 16) { 1168 if (Subtarget->hasSSE2()) 1169 return MVT::v4i32; 1170 if (Subtarget->hasSSE1()) 1171 return MVT::v4f32; 1172 } else if (!MemcpyStrSrc && Size >= 8 && 1173 !Subtarget->is64Bit() && 1174 Subtarget->getStackAlignment() >= 8 && 1175 Subtarget->hasXMMInt()) { 1176 // Do not use f64 to lower memcpy if source is string constant. It's 1177 // better to use i32 to avoid the loads. 1178 return MVT::f64; 1179 } 1180 } 1181 if (Subtarget->is64Bit() && Size >= 8) 1182 return MVT::i64; 1183 return MVT::i32; 1184} 1185 1186/// getJumpTableEncoding - Return the entry encoding for a jump table in the 1187/// current function. The returned value is a member of the 1188/// MachineJumpTableInfo::JTEntryKind enum. 1189unsigned X86TargetLowering::getJumpTableEncoding() const { 1190 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1191 // symbol. 1192 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1193 Subtarget->isPICStyleGOT()) 1194 return MachineJumpTableInfo::EK_Custom32; 1195 1196 // Otherwise, use the normal jump table encoding heuristics. 1197 return TargetLowering::getJumpTableEncoding(); 1198} 1199 1200const MCExpr * 1201X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1202 const MachineBasicBlock *MBB, 1203 unsigned uid,MCContext &Ctx) const{ 1204 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1205 Subtarget->isPICStyleGOT()); 1206 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1207 // entries. 1208 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1209 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1210} 1211 1212/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 1213/// jumptable. 1214SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1215 SelectionDAG &DAG) const { 1216 if (!Subtarget->is64Bit()) 1217 // This doesn't have DebugLoc associated with it, but is not really the 1218 // same as a Register. 
1219 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1220 return Table; 1221} 1222 1223/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1224/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1225/// MCExpr. 1226const MCExpr *X86TargetLowering:: 1227getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1228 MCContext &Ctx) const { 1229 // X86-64 uses RIP relative addressing based on the jump table label. 1230 if (Subtarget->isPICStyleRIPRel()) 1231 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1232 1233 // Otherwise, the reference is relative to the PIC base. 1234 return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); 1235} 1236 1237// FIXME: Why this routine is here? Move to RegInfo! 1238std::pair<const TargetRegisterClass*, uint8_t> 1239X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1240 const TargetRegisterClass *RRC = 0; 1241 uint8_t Cost = 1; 1242 switch (VT.getSimpleVT().SimpleTy) { 1243 default: 1244 return TargetLowering::findRepresentativeClass(VT); 1245 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1246 RRC = (Subtarget->is64Bit() 1247 ? X86::GR64RegisterClass : X86::GR32RegisterClass); 1248 break; 1249 case MVT::x86mmx: 1250 RRC = X86::VR64RegisterClass; 1251 break; 1252 case MVT::f32: case MVT::f64: 1253 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1254 case MVT::v4f32: case MVT::v2f64: 1255 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1256 case MVT::v4f64: 1257 RRC = X86::VR128RegisterClass; 1258 break; 1259 } 1260 return std::make_pair(RRC, Cost); 1261} 1262 1263bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1264 unsigned &Offset) const { 1265 if (!Subtarget->isTargetLinux()) 1266 return false; 1267 1268 if (Subtarget->is64Bit()) { 1269 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1270 Offset = 0x28; 1271 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1272 AddressSpace = 256; 1273 else 1274 AddressSpace = 257; 1275 } else { 1276 // %gs:0x14 on i386 1277 Offset = 0x14; 1278 AddressSpace = 256; 1279 } 1280 return true; 1281} 1282 1283 1284//===----------------------------------------------------------------------===// 1285// Return Value Calling Convention Implementation 1286//===----------------------------------------------------------------------===// 1287 1288#include "X86GenCallingConv.inc" 1289 1290bool 1291X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, 1292 MachineFunction &MF, bool isVarArg, 1293 const SmallVectorImpl<ISD::OutputArg> &Outs, 1294 LLVMContext &Context) const { 1295 SmallVector<CCValAssign, 16> RVLocs; 1296 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1297 RVLocs, Context); 1298 return CCInfo.CheckReturn(Outs, RetCC_X86); 1299} 1300 1301SDValue 1302X86TargetLowering::LowerReturn(SDValue Chain, 1303 CallingConv::ID CallConv, bool isVarArg, 1304 const SmallVectorImpl<ISD::OutputArg> &Outs, 1305 const SmallVectorImpl<SDValue> &OutVals, 1306 DebugLoc dl, SelectionDAG &DAG) const { 1307 MachineFunction &MF = DAG.getMachineFunction(); 1308 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1309 1310 SmallVector<CCValAssign, 16> RVLocs; 1311 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1312 RVLocs, *DAG.getContext()); 1313 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1314 1315 // Add the regs to the liveout set for the function. 
1316 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1317 for (unsigned i = 0; i != RVLocs.size(); ++i) 1318 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1319 MRI.addLiveOut(RVLocs[i].getLocReg()); 1320 1321 SDValue Flag; 1322 1323 SmallVector<SDValue, 6> RetOps; 1324 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1325 // Operand #1 = Bytes To Pop 1326 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1327 MVT::i16)); 1328 1329 // Copy the result values into the output registers. 1330 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1331 CCValAssign &VA = RVLocs[i]; 1332 assert(VA.isRegLoc() && "Can only return in registers!"); 1333 SDValue ValToCopy = OutVals[i]; 1334 EVT ValVT = ValToCopy.getValueType(); 1335 1336 // If this is x86-64, and we disabled SSE, we can't return FP values, 1337 // or SSE or MMX vectors. 1338 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1339 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1340 (Subtarget->is64Bit() && !Subtarget->hasXMM())) { 1341 report_fatal_error("SSE register return with SSE disabled"); 1342 } 1343 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1344 // llvm-gcc has never done it right and no one has noticed, so this 1345 // should be OK for now. 1346 if (ValVT == MVT::f64 && 1347 (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) 1348 report_fatal_error("SSE2 register return with SSE2 disabled"); 1349 1350 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1351 // the RET instruction and handled by the FP Stackifier. 1352 if (VA.getLocReg() == X86::ST0 || 1353 VA.getLocReg() == X86::ST1) { 1354 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1355 // change the value to the FP stack register class. 1356 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1357 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1358 RetOps.push_back(ValToCopy); 1359 // Don't emit a copytoreg. 1360 continue; 1361 } 1362 1363 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1364 // which is returned in RAX / RDX. 1365 if (Subtarget->is64Bit()) { 1366 if (ValVT == MVT::x86mmx) { 1367 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1368 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); 1369 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1370 ValToCopy); 1371 // If we don't have SSE2 available, convert to v4f32 so the generated 1372 // register is legal. 1373 if (!Subtarget->hasSSE2()) 1374 ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); 1375 } 1376 } 1377 } 1378 1379 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1380 Flag = Chain.getValue(1); 1381 } 1382 1383 // The x86-64 ABI for returning structs by value requires that we copy 1384 // the sret argument into %rax for the return. We saved the argument into 1385 // a virtual register in the entry block, so now we copy the value out 1386 // and into %rax. 
1387 if (Subtarget->is64Bit() && 1388 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1389 MachineFunction &MF = DAG.getMachineFunction(); 1390 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1391 unsigned Reg = FuncInfo->getSRetReturnReg(); 1392 assert(Reg && 1393 "SRetReturnReg should have been set in LowerFormalArguments()."); 1394 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1395 1396 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1397 Flag = Chain.getValue(1); 1398 1399 // RAX now acts like a return value. 1400 MRI.addLiveOut(X86::RAX); 1401 } 1402 1403 RetOps[0] = Chain; // Update chain. 1404 1405 // Add the flag if we have it. 1406 if (Flag.getNode()) 1407 RetOps.push_back(Flag); 1408 1409 return DAG.getNode(X86ISD::RET_FLAG, dl, 1410 MVT::Other, &RetOps[0], RetOps.size()); 1411} 1412 1413bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { 1414 if (N->getNumValues() != 1) 1415 return false; 1416 if (!N->hasNUsesOfValue(1, 0)) 1417 return false; 1418 1419 SDNode *Copy = *N->use_begin(); 1420 if (Copy->getOpcode() != ISD::CopyToReg && 1421 Copy->getOpcode() != ISD::FP_EXTEND) 1422 return false; 1423 1424 bool HasRet = false; 1425 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 1426 UI != UE; ++UI) { 1427 if (UI->getOpcode() != X86ISD::RET_FLAG) 1428 return false; 1429 HasRet = true; 1430 } 1431 1432 return HasRet; 1433} 1434 1435EVT 1436X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, 1437 ISD::NodeType ExtendKind) const { 1438 MVT ReturnMVT; 1439 // TODO: Is this also valid on 32-bit? 1440 if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) 1441 ReturnMVT = MVT::i8; 1442 else 1443 ReturnMVT = MVT::i32; 1444 1445 EVT MinVT = getRegisterType(Context, ReturnMVT); 1446 return VT.bitsLT(MinVT) ? MinVT : VT; 1447} 1448 1449/// LowerCallResult - Lower the result values of a call into the 1450/// appropriate copies out of appropriate physical registers. 1451/// 1452SDValue 1453X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1454 CallingConv::ID CallConv, bool isVarArg, 1455 const SmallVectorImpl<ISD::InputArg> &Ins, 1456 DebugLoc dl, SelectionDAG &DAG, 1457 SmallVectorImpl<SDValue> &InVals) const { 1458 1459 // Assign locations to each value returned by this call. 1460 SmallVector<CCValAssign, 16> RVLocs; 1461 bool Is64Bit = Subtarget->is64Bit(); 1462 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 1463 getTargetMachine(), RVLocs, *DAG.getContext()); 1464 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1465 1466 // Copy all of the result registers out of their specified physreg. 1467 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1468 CCValAssign &VA = RVLocs[i]; 1469 EVT CopyVT = VA.getValVT(); 1470 1471 // If this is x86-64, and we disabled SSE, we can't return FP values 1472 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1473 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { 1474 report_fatal_error("SSE register return with SSE disabled"); 1475 } 1476 1477 SDValue Val; 1478 1479 // If this is a call to a function that returns an fp value on the floating 1480 // point stack, we must guarantee the the value is popped from the stack, so 1481 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1482 // if the return value is not used. We use the FpPOP_RETVAL instruction 1483 // instead. 
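// Editor's note (illustrative): e.g. when the caller ignores the result of a call that
// returns a value on the x87 stack, a plain CopyFromReg of ST0 could later be deleted
// as dead, leaving the x87 register stack unbalanced. FpPOP_RETVAL models the mandatory
// pop as a side effect, so it survives even when the returned value itself is unused.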
1484 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1485 // If we prefer to use the value in xmm registers, copy it out as f80 and 1486 // use a truncate to move it from fp stack reg to xmm reg. 1487 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1488 SDValue Ops[] = { Chain, InFlag }; 1489 Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, 1490 MVT::Other, MVT::Glue, Ops, 2), 1); 1491 Val = Chain.getValue(0); 1492 1493 // Round the f80 to the right size, which also moves it to the appropriate 1494 // xmm register. 1495 if (CopyVT != VA.getValVT()) 1496 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1497 // This truncation won't change the value. 1498 DAG.getIntPtrConstant(1)); 1499 } else { 1500 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1501 CopyVT, InFlag).getValue(1); 1502 Val = Chain.getValue(0); 1503 } 1504 InFlag = Chain.getValue(2); 1505 InVals.push_back(Val); 1506 } 1507 1508 return Chain; 1509} 1510 1511 1512//===----------------------------------------------------------------------===// 1513// C & StdCall & Fast Calling Convention implementation 1514//===----------------------------------------------------------------------===// 1515// StdCall calling convention seems to be standard for many Windows' API 1516// routines and around. It differs from C calling convention just a little: 1517// callee should clean up the stack, not caller. Symbols should be also 1518// decorated in some fancy way :) It doesn't support any vector arguments. 1519// For info on fast calling convention see Fast Calling Convention (tail call) 1520// implementation LowerX86_32FastCCCallTo. 1521 1522/// CallIsStructReturn - Determines whether a call uses struct return 1523/// semantics. 1524static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1525 if (Outs.empty()) 1526 return false; 1527 1528 return Outs[0].Flags.isSRet(); 1529} 1530 1531/// ArgsAreStructReturn - Determines whether a function uses struct 1532/// return semantics. 1533static bool 1534ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1535 if (Ins.empty()) 1536 return false; 1537 1538 return Ins[0].Flags.isSRet(); 1539} 1540 1541/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1542/// by "Src" to address "Dst" with size and alignment information specified by 1543/// the specific parameter attribute. The copy will be passed as a byval 1544/// function parameter. 1545static SDValue 1546CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1547 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1548 DebugLoc dl) { 1549 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1550 1551 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1552 /*isVolatile*/false, /*AlwaysInline=*/true, 1553 MachinePointerInfo(), MachinePointerInfo()); 1554} 1555 1556/// IsTailCallConvention - Return true if the calling convention is one that 1557/// supports tail call optimization. 
1558static bool IsTailCallConvention(CallingConv::ID CC) { 1559 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1560} 1561 1562bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 1563 if (!CI->isTailCall()) 1564 return false; 1565 1566 CallSite CS(CI); 1567 CallingConv::ID CalleeCC = CS.getCallingConv(); 1568 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 1569 return false; 1570 1571 return true; 1572} 1573 1574/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1575/// a tailcall target by changing its ABI. 1576static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1577 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1578} 1579 1580SDValue 1581X86TargetLowering::LowerMemArgument(SDValue Chain, 1582 CallingConv::ID CallConv, 1583 const SmallVectorImpl<ISD::InputArg> &Ins, 1584 DebugLoc dl, SelectionDAG &DAG, 1585 const CCValAssign &VA, 1586 MachineFrameInfo *MFI, 1587 unsigned i) const { 1588 // Create the nodes corresponding to a load from this parameter slot. 1589 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1590 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1591 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1592 EVT ValVT; 1593 1594 // If value is passed by pointer we have address passed instead of the value 1595 // itself. 1596 if (VA.getLocInfo() == CCValAssign::Indirect) 1597 ValVT = VA.getLocVT(); 1598 else 1599 ValVT = VA.getValVT(); 1600 1601 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1602 // changed with more analysis. 1603 // In case of tail call optimization mark all arguments mutable. Since they 1604 // could be overwritten by lowering of arguments in case of a tail call. 1605 if (Flags.isByVal()) { 1606 unsigned Bytes = Flags.getByValSize(); 1607 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 1608 int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); 1609 return DAG.getFrameIndex(FI, getPointerTy()); 1610 } else { 1611 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1612 VA.getLocMemOffset(), isImmutable); 1613 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1614 return DAG.getLoad(ValVT, dl, Chain, FIN, 1615 MachinePointerInfo::getFixedStack(FI), 1616 false, false, 0); 1617 } 1618} 1619 1620SDValue 1621X86TargetLowering::LowerFormalArguments(SDValue Chain, 1622 CallingConv::ID CallConv, 1623 bool isVarArg, 1624 const SmallVectorImpl<ISD::InputArg> &Ins, 1625 DebugLoc dl, 1626 SelectionDAG &DAG, 1627 SmallVectorImpl<SDValue> &InVals) 1628 const { 1629 MachineFunction &MF = DAG.getMachineFunction(); 1630 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1631 1632 const Function* Fn = MF.getFunction(); 1633 if (Fn->hasExternalLinkage() && 1634 Subtarget->isTargetCygMing() && 1635 Fn->getName() == "main") 1636 FuncInfo->setForceFramePointer(true); 1637 1638 MachineFrameInfo *MFI = MF.getFrameInfo(); 1639 bool Is64Bit = Subtarget->is64Bit(); 1640 bool IsWin64 = Subtarget->isTargetWin64(); 1641 1642 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1643 "Var args not supported with calling convention fastcc or ghc"); 1644 1645 // Assign locations to all of the incoming arguments. 
1646 SmallVector<CCValAssign, 16> ArgLocs; 1647 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1648 ArgLocs, *DAG.getContext()); 1649 1650 // Allocate shadow area for Win64 1651 if (IsWin64) { 1652 CCInfo.AllocateStack(32, 8); 1653 } 1654 1655 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1656 1657 unsigned LastVal = ~0U; 1658 SDValue ArgValue; 1659 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1660 CCValAssign &VA = ArgLocs[i]; 1661 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1662 // places. 1663 assert(VA.getValNo() != LastVal && 1664 "Don't support value assigned to multiple locs yet"); 1665 LastVal = VA.getValNo(); 1666 1667 if (VA.isRegLoc()) { 1668 EVT RegVT = VA.getLocVT(); 1669 TargetRegisterClass *RC = NULL; 1670 if (RegVT == MVT::i32) 1671 RC = X86::GR32RegisterClass; 1672 else if (Is64Bit && RegVT == MVT::i64) 1673 RC = X86::GR64RegisterClass; 1674 else if (RegVT == MVT::f32) 1675 RC = X86::FR32RegisterClass; 1676 else if (RegVT == MVT::f64) 1677 RC = X86::FR64RegisterClass; 1678 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1679 RC = X86::VR256RegisterClass; 1680 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1681 RC = X86::VR128RegisterClass; 1682 else if (RegVT == MVT::x86mmx) 1683 RC = X86::VR64RegisterClass; 1684 else 1685 llvm_unreachable("Unknown argument type!"); 1686 1687 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1688 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1689 1690 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1691 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1692 // right size. 1693 if (VA.getLocInfo() == CCValAssign::SExt) 1694 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1695 DAG.getValueType(VA.getValVT())); 1696 else if (VA.getLocInfo() == CCValAssign::ZExt) 1697 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1698 DAG.getValueType(VA.getValVT())); 1699 else if (VA.getLocInfo() == CCValAssign::BCvt) 1700 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 1701 1702 if (VA.isExtInLoc()) { 1703 // Handle MMX values passed in XMM regs. 1704 if (RegVT.isVector()) { 1705 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1706 ArgValue); 1707 } else 1708 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1709 } 1710 } else { 1711 assert(VA.isMemLoc()); 1712 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1713 } 1714 1715 // If value is passed via pointer - do a load. 1716 if (VA.getLocInfo() == CCValAssign::Indirect) 1717 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1718 MachinePointerInfo(), false, false, 0); 1719 1720 InVals.push_back(ArgValue); 1721 } 1722 1723 // The x86-64 ABI for returning structs by value requires that we copy 1724 // the sret argument into %rax for the return. Save the argument into 1725 // a virtual register so that we can access it from the return points. 
1726 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1727 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1728 unsigned Reg = FuncInfo->getSRetReturnReg(); 1729 if (!Reg) { 1730 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1731 FuncInfo->setSRetReturnReg(Reg); 1732 } 1733 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1734 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1735 } 1736 1737 unsigned StackSize = CCInfo.getNextStackOffset(); 1738 // Align stack specially for tail calls. 1739 if (FuncIsMadeTailCallSafe(CallConv)) 1740 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1741 1742 // If the function takes variable number of arguments, make a frame index for 1743 // the start of the first vararg value... for expansion of llvm.va_start. 1744 if (isVarArg) { 1745 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1746 CallConv != CallingConv::X86_ThisCall)) { 1747 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1748 } 1749 if (Is64Bit) { 1750 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1751 1752 // FIXME: We should really autogenerate these arrays 1753 static const unsigned GPR64ArgRegsWin64[] = { 1754 X86::RCX, X86::RDX, X86::R8, X86::R9 1755 }; 1756 static const unsigned GPR64ArgRegs64Bit[] = { 1757 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1758 }; 1759 static const unsigned XMMArgRegs64Bit[] = { 1760 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1761 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1762 }; 1763 const unsigned *GPR64ArgRegs; 1764 unsigned NumXMMRegs = 0; 1765 1766 if (IsWin64) { 1767 // The XMM registers which might contain var arg parameters are shadowed 1768 // in their paired GPR. So we only need to save the GPR to their home 1769 // slots. 1770 TotalNumIntRegs = 4; 1771 GPR64ArgRegs = GPR64ArgRegsWin64; 1772 } else { 1773 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1774 GPR64ArgRegs = GPR64ArgRegs64Bit; 1775 1776 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); 1777 } 1778 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1779 TotalNumIntRegs); 1780 1781 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1782 assert(!(NumXMMRegs && !Subtarget->hasXMM()) && 1783 "SSE register cannot be used when SSE is disabled!"); 1784 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1785 "SSE register cannot be used when SSE is disabled!"); 1786 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) 1787 // Kernel mode asks for SSE to be disabled, so don't push them 1788 // on the stack. 1789 TotalNumXMMRegs = 0; 1790 1791 if (IsWin64) { 1792 const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); 1793 // Get to the caller-allocated home save location. Add 8 to account 1794 // for the return address. 1795 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1796 FuncInfo->setRegSaveFrameIndex( 1797 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1798 // Fixup to set vararg frame on shadow area (4 x i64). 1799 if (NumIntRegs < 4) 1800 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1801 } else { 1802 // For X86-64, if there are vararg parameters that are passed via 1803 // registers, then we must store them to their spots on the stack so they 1804 // may be loaded by deferencing the result of va_next. 
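// Editor's note (illustrative sketch of the SysV x86-64 register save area that the
// code below materializes; offsets are relative to RegSaveFrameIndex):
//   [  0 ..  47]  RDI, RSI, RDX, RCX, R8, R9   (6 GPRs x 8 bytes)
//   [ 48 .. 175]  XMM0 .. XMM7                 (8 regs x 16 bytes)
// VarArgsGPOffset / VarArgsFPOffset record how much of each region the fixed (named)
// arguments have already consumed, which is where va_arg's gp_offset / fp_offset
// fields start from.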
1805 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1806 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1807 FuncInfo->setRegSaveFrameIndex( 1808 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1809 false)); 1810 } 1811 1812 // Store the integer parameter registers. 1813 SmallVector<SDValue, 8> MemOps; 1814 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1815 getPointerTy()); 1816 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1817 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1818 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1819 DAG.getIntPtrConstant(Offset)); 1820 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1821 X86::GR64RegisterClass); 1822 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1823 SDValue Store = 1824 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1825 MachinePointerInfo::getFixedStack( 1826 FuncInfo->getRegSaveFrameIndex(), Offset), 1827 false, false, 0); 1828 MemOps.push_back(Store); 1829 Offset += 8; 1830 } 1831 1832 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1833 // Now store the XMM (fp + vector) parameter registers. 1834 SmallVector<SDValue, 11> SaveXMMOps; 1835 SaveXMMOps.push_back(Chain); 1836 1837 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1838 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1839 SaveXMMOps.push_back(ALVal); 1840 1841 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1842 FuncInfo->getRegSaveFrameIndex())); 1843 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1844 FuncInfo->getVarArgsFPOffset())); 1845 1846 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1847 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1848 X86::VR128RegisterClass); 1849 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1850 SaveXMMOps.push_back(Val); 1851 } 1852 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1853 MVT::Other, 1854 &SaveXMMOps[0], SaveXMMOps.size())); 1855 } 1856 1857 if (!MemOps.empty()) 1858 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1859 &MemOps[0], MemOps.size()); 1860 } 1861 } 1862 1863 // Some CCs need callee pop. 1864 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { 1865 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1866 } else { 1867 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1868 // If this is an sret function, the return should pop the hidden pointer. 1869 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1870 FuncInfo->setBytesToPopOnReturn(4); 1871 } 1872 1873 if (!Is64Bit) { 1874 // RegSaveFrameIndex is X86-64 only. 1875 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1876 if (CallConv == CallingConv::X86_FastCall || 1877 CallConv == CallingConv::X86_ThisCall) 1878 // fastcc functions can't have varargs. 
1879 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1880 } 1881 1882 return Chain; 1883} 1884 1885SDValue 1886X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1887 SDValue StackPtr, SDValue Arg, 1888 DebugLoc dl, SelectionDAG &DAG, 1889 const CCValAssign &VA, 1890 ISD::ArgFlagsTy Flags) const { 1891 unsigned LocMemOffset = VA.getLocMemOffset(); 1892 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1893 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1894 if (Flags.isByVal()) 1895 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1896 1897 return DAG.getStore(Chain, dl, Arg, PtrOff, 1898 MachinePointerInfo::getStack(LocMemOffset), 1899 false, false, 0); 1900} 1901 1902/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1903/// optimization is performed and it is required. 1904SDValue 1905X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1906 SDValue &OutRetAddr, SDValue Chain, 1907 bool IsTailCall, bool Is64Bit, 1908 int FPDiff, DebugLoc dl) const { 1909 // Adjust the Return address stack slot. 1910 EVT VT = getPointerTy(); 1911 OutRetAddr = getReturnAddressFrameIndex(DAG); 1912 1913 // Load the "old" Return address. 1914 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1915 false, false, 0); 1916 return SDValue(OutRetAddr.getNode(), 1); 1917} 1918 1919/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 1920/// optimization is performed and it is required (FPDiff!=0). 1921static SDValue 1922EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1923 SDValue Chain, SDValue RetAddrFrIdx, 1924 bool Is64Bit, int FPDiff, DebugLoc dl) { 1925 // Store the return address to the appropriate stack slot. 1926 if (!FPDiff) return Chain; 1927 // Calculate the new stack slot for the return address. 1928 int SlotSize = Is64Bit ? 8 : 4; 1929 int NewReturnAddrFI = 1930 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1931 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1932 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1933 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1934 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1935 false, false, 0); 1936 return Chain; 1937} 1938 1939SDValue 1940X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1941 CallingConv::ID CallConv, bool isVarArg, 1942 bool &isTailCall, 1943 const SmallVectorImpl<ISD::OutputArg> &Outs, 1944 const SmallVectorImpl<SDValue> &OutVals, 1945 const SmallVectorImpl<ISD::InputArg> &Ins, 1946 DebugLoc dl, SelectionDAG &DAG, 1947 SmallVectorImpl<SDValue> &InVals) const { 1948 MachineFunction &MF = DAG.getMachineFunction(); 1949 bool Is64Bit = Subtarget->is64Bit(); 1950 bool IsWin64 = Subtarget->isTargetWin64(); 1951 bool IsStructRet = CallIsStructReturn(Outs); 1952 bool IsSibcall = false; 1953 1954 if (isTailCall) { 1955 // Check if it's really possible to do a tail call. 1956 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1957 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1958 Outs, OutVals, Ins, DAG); 1959 1960 // Sibcalls are automatically detected tailcalls which do not require 1961 // ABI changes. 
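// Editor's note (illustrative): e.g. a plain C-convention tail call that passes the
// IsEligibleForTailCallOptimization check above is emitted as a sibcall: NumBytes is
// forced to 0, no CALLSEQ_START/CALLSEQ_END is emitted, and the call becomes a
// TC_RETURN jump. Under -tailcallopt, fastcc/GHC tail calls may instead adjust the
// caller's frame (see FPDiff below).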
1962 if (!GuaranteedTailCallOpt && isTailCall) 1963 IsSibcall = true; 1964 1965 if (isTailCall) 1966 ++NumTailCalls; 1967 } 1968 1969 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1970 "Var args not supported with calling convention fastcc or ghc"); 1971 1972 // Analyze operands of the call, assigning locations to each operand. 1973 SmallVector<CCValAssign, 16> ArgLocs; 1974 CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), 1975 ArgLocs, *DAG.getContext()); 1976 1977 // Allocate shadow area for Win64 1978 if (IsWin64) { 1979 CCInfo.AllocateStack(32, 8); 1980 } 1981 1982 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 1983 1984 // Get a count of how many bytes are to be pushed on the stack. 1985 unsigned NumBytes = CCInfo.getNextStackOffset(); 1986 if (IsSibcall) 1987 // This is a sibcall. The memory operands are already available in the 1988 // stack frame of the caller's own caller. 1989 NumBytes = 0; 1990 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1991 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1992 1993 int FPDiff = 0; 1994 if (isTailCall && !IsSibcall) { 1995 // Lower arguments at fp - stackoffset + fpdiff. 1996 unsigned NumBytesCallerPushed = 1997 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1998 FPDiff = NumBytesCallerPushed - NumBytes; 1999 2000 // Set the delta of movement of the returnaddr stackslot. 2001 // But only set it if the new delta is smaller (more negative) than the previous delta. 2002 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 2003 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 2004 } 2005 2006 if (!IsSibcall) 2007 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 2008 2009 SDValue RetAddrFrIdx; 2010 // Load return address for tail calls. 2011 if (isTailCall && FPDiff) 2012 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 2013 Is64Bit, FPDiff, dl); 2014 2015 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 2016 SmallVector<SDValue, 8> MemOpChains; 2017 SDValue StackPtr; 2018 2019 // Walk the register/memloc assignments, inserting copies/loads. In the case 2020 // of tail call optimization the arguments are handled later. 2021 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2022 CCValAssign &VA = ArgLocs[i]; 2023 EVT RegVT = VA.getLocVT(); 2024 SDValue Arg = OutVals[i]; 2025 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2026 bool isByVal = Flags.isByVal(); 2027 2028 // Promote the value if needed. 2029 switch (VA.getLocInfo()) { 2030 default: llvm_unreachable("Unknown loc info!"); 2031 case CCValAssign::Full: break; 2032 case CCValAssign::SExt: 2033 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 2034 break; 2035 case CCValAssign::ZExt: 2036 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 2037 break; 2038 case CCValAssign::AExt: 2039 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 2040 // Special case: passing MMX values in XMM registers. 2041 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); 2042 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 2043 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 2044 } else 2045 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 2046 break; 2047 case CCValAssign::BCvt: 2048 Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); 2049 break; 2050 case CCValAssign::Indirect: { 2051 // Store the argument.
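// Editor's note (illustrative): for an argument the calling convention marks
// CCValAssign::Indirect, the value is first spilled to a fresh stack temporary here and
// the address of that slot is what actually gets passed; the matching
// CCValAssign::Indirect case in LowerFormalArguments reloads the value through that
// pointer on the callee side.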
2052 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2053 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2054 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2055 MachinePointerInfo::getFixedStack(FI), 2056 false, false, 0); 2057 Arg = SpillSlot; 2058 break; 2059 } 2060 } 2061 2062 if (VA.isRegLoc()) { 2063 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2064 if (isVarArg && IsWin64) { 2065 // Win64 ABI requires argument XMM reg to be copied to the corresponding 2066 // shadow reg if callee is a varargs function. 2067 unsigned ShadowReg = 0; 2068 switch (VA.getLocReg()) { 2069 case X86::XMM0: ShadowReg = X86::RCX; break; 2070 case X86::XMM1: ShadowReg = X86::RDX; break; 2071 case X86::XMM2: ShadowReg = X86::R8; break; 2072 case X86::XMM3: ShadowReg = X86::R9; break; 2073 } 2074 if (ShadowReg) 2075 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 2076 } 2077 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2078 assert(VA.isMemLoc()); 2079 if (StackPtr.getNode() == 0) 2080 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2081 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2082 dl, DAG, VA, Flags)); 2083 } 2084 } 2085 2086 if (!MemOpChains.empty()) 2087 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2088 &MemOpChains[0], MemOpChains.size()); 2089 2090 // Build a sequence of copy-to-reg nodes chained together with token chain 2091 // and flag operands which copy the outgoing args into registers. 2092 SDValue InFlag; 2093 // Tail call byval lowering might overwrite argument registers so in case of 2094 // tail call optimization the copies to registers are lowered later. 2095 if (!isTailCall) 2096 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2097 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2098 RegsToPass[i].second, InFlag); 2099 InFlag = Chain.getValue(1); 2100 } 2101 2102 if (Subtarget->isPICStyleGOT()) { 2103 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2104 // GOT pointer. 2105 if (!isTailCall) { 2106 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2107 DAG.getNode(X86ISD::GlobalBaseReg, 2108 DebugLoc(), getPointerTy()), 2109 InFlag); 2110 InFlag = Chain.getValue(1); 2111 } else { 2112 // If we are tail calling and generating PIC/GOT style code load the 2113 // address of the callee into ECX. The value in ecx is used as target of 2114 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2115 // for tail calls on PIC/GOT architectures. Normally we would just put the 2116 // address of GOT into ebx and then call target@PLT. But for tail calls 2117 // ebx would be restored (since ebx is callee saved) before jumping to the 2118 // target@PLT. 2119 2120 // Note: The actual moving to ECX is done further down. 2121 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2122 if (G && !G->getGlobal()->hasHiddenVisibility() && 2123 !G->getGlobal()->hasProtectedVisibility()) 2124 Callee = LowerGlobalAddress(Callee, DAG); 2125 else if (isa<ExternalSymbolSDNode>(Callee)) 2126 Callee = LowerExternalSymbol(Callee, DAG); 2127 } 2128 } 2129 2130 if (Is64Bit && isVarArg && !IsWin64) { 2131 // From AMD64 ABI document: 2132 // For calls that may call functions that use varargs or stdargs 2133 // (prototype-less calls or calls to functions containing ellipsis (...) in 2134 // the declaration) %al is used as hidden argument to specify the number 2135 // of SSE registers used. 
The contents of %al do not need to match exactly 2136 // the number of registers, but must be an ubound on the number of SSE 2137 // registers used and is in the range 0 - 8 inclusive. 2138 2139 // Count the number of XMM registers allocated. 2140 static const unsigned XMMArgRegs[] = { 2141 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2142 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2143 }; 2144 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2145 assert((Subtarget->hasXMM() || !NumXMMRegs) 2146 && "SSE registers cannot be used when SSE is disabled"); 2147 2148 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2149 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2150 InFlag = Chain.getValue(1); 2151 } 2152 2153 2154 // For tail calls lower the arguments to the 'real' stack slot. 2155 if (isTailCall) { 2156 // Force all the incoming stack arguments to be loaded from the stack 2157 // before any new outgoing arguments are stored to the stack, because the 2158 // outgoing stack slots may alias the incoming argument stack slots, and 2159 // the alias isn't otherwise explicit. This is slightly more conservative 2160 // than necessary, because it means that each store effectively depends 2161 // on every argument instead of just those arguments it would clobber. 2162 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2163 2164 SmallVector<SDValue, 8> MemOpChains2; 2165 SDValue FIN; 2166 int FI = 0; 2167 // Do not flag preceding copytoreg stuff together with the following stuff. 2168 InFlag = SDValue(); 2169 if (GuaranteedTailCallOpt) { 2170 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2171 CCValAssign &VA = ArgLocs[i]; 2172 if (VA.isRegLoc()) 2173 continue; 2174 assert(VA.isMemLoc()); 2175 SDValue Arg = OutVals[i]; 2176 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2177 // Create frame index. 2178 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2179 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2180 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2181 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2182 2183 if (Flags.isByVal()) { 2184 // Copy relative to framepointer. 2185 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2186 if (StackPtr.getNode() == 0) 2187 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2188 getPointerTy()); 2189 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2190 2191 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2192 ArgChain, 2193 Flags, DAG, dl)); 2194 } else { 2195 // Store relative to framepointer. 2196 MemOpChains2.push_back( 2197 DAG.getStore(ArgChain, dl, Arg, FIN, 2198 MachinePointerInfo::getFixedStack(FI), 2199 false, false, 0)); 2200 } 2201 } 2202 } 2203 2204 if (!MemOpChains2.empty()) 2205 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2206 &MemOpChains2[0], MemOpChains2.size()); 2207 2208 // Copy arguments to their registers. 2209 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2210 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2211 RegsToPass[i].second, InFlag); 2212 InFlag = Chain.getValue(1); 2213 } 2214 InFlag =SDValue(); 2215 2216 // Store the return address to the appropriate stack slot. 
2217 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2218 FPDiff, dl); 2219 } 2220 2221 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2222 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2223 // In the 64-bit large code model, we have to make all calls 2224 // through a register, since the call instruction's 32-bit 2225 // pc-relative offset may not be large enough to hold the whole 2226 // address. 2227 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2228 // If the callee is a GlobalAddress node (quite common, every direct call 2229 // is), turn it into a TargetGlobalAddress node so that legalize doesn't hack 2230 // it. 2231 2232 // We should use extra load for direct calls to dllimported functions in 2233 // non-JIT mode. 2234 const GlobalValue *GV = G->getGlobal(); 2235 if (!GV->hasDLLImportLinkage()) { 2236 unsigned char OpFlags = 0; 2237 bool ExtraLoad = false; 2238 unsigned WrapperKind = ISD::DELETED_NODE; 2239 2240 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2241 // external symbols must go through the PLT in PIC mode. If the symbol 2242 // has hidden or protected visibility, or if it is static or local, then 2243 // we don't need to use the PLT - we can directly call it. 2244 if (Subtarget->isTargetELF() && 2245 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2246 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2247 OpFlags = X86II::MO_PLT; 2248 } else if (Subtarget->isPICStyleStubAny() && 2249 (GV->isDeclaration() || GV->isWeakForLinker()) && 2250 (!Subtarget->getTargetTriple().isMacOSX() || 2251 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2252 // PC-relative references to external symbols should go through $stub, 2253 // unless we're building with the leopard linker or later, which 2254 // automatically synthesizes these stubs. 2255 OpFlags = X86II::MO_DARWIN_STUB; 2256 } else if (Subtarget->isPICStyleRIPRel() && 2257 isa<Function>(GV) && 2258 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { 2259 // If the function is marked as non-lazy, generate an indirect call 2260 // which loads from the GOT directly. This avoids runtime overhead 2261 // at the cost of eager binding (and one extra byte of encoding). 2262 OpFlags = X86II::MO_GOTPCREL; 2263 WrapperKind = X86ISD::WrapperRIP; 2264 ExtraLoad = true; 2265 } 2266 2267 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2268 G->getOffset(), OpFlags); 2269 2270 // Add a wrapper if needed. 2271 if (WrapperKind != ISD::DELETED_NODE) 2272 Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); 2273 // Add extra indirection if needed. 2274 if (ExtraLoad) 2275 Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, 2276 MachinePointerInfo::getGOT(), 2277 false, false, 0); 2278 } 2279 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2280 unsigned char OpFlags = 0; 2281 2282 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to 2283 // external symbols should go through the PLT.
2284 if (Subtarget->isTargetELF() && 2285 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2286 OpFlags = X86II::MO_PLT; 2287 } else if (Subtarget->isPICStyleStubAny() && 2288 (!Subtarget->getTargetTriple().isMacOSX() || 2289 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { 2290 // PC-relative references to external symbols should go through $stub, 2291 // unless we're building with the leopard linker or later, which 2292 // automatically synthesizes these stubs. 2293 OpFlags = X86II::MO_DARWIN_STUB; 2294 } 2295 2296 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2297 OpFlags); 2298 } 2299 2300 // Returns a chain & a flag for retval copy to use. 2301 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2302 SmallVector<SDValue, 8> Ops; 2303 2304 if (!IsSibcall && isTailCall) { 2305 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2306 DAG.getIntPtrConstant(0, true), InFlag); 2307 InFlag = Chain.getValue(1); 2308 } 2309 2310 Ops.push_back(Chain); 2311 Ops.push_back(Callee); 2312 2313 if (isTailCall) 2314 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2315 2316 // Add argument registers to the end of the list so that they are known live 2317 // into the call. 2318 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2319 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2320 RegsToPass[i].second.getValueType())); 2321 2322 // Add an implicit use GOT pointer in EBX. 2323 if (!isTailCall && Subtarget->isPICStyleGOT()) 2324 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2325 2326 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 2327 if (Is64Bit && isVarArg && !IsWin64) 2328 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2329 2330 if (InFlag.getNode()) 2331 Ops.push_back(InFlag); 2332 2333 if (isTailCall) { 2334 // We used to do: 2335 //// If this is the first return lowered for this function, add the regs 2336 //// to the liveout set for the function. 2337 // This isn't right, although it's probably harmless on x86; liveouts 2338 // should be computed from returns not tail calls. Consider a void 2339 // function making a tail call to a function returning int. 2340 return DAG.getNode(X86ISD::TC_RETURN, dl, 2341 NodeTys, &Ops[0], Ops.size()); 2342 } 2343 2344 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2345 InFlag = Chain.getValue(1); 2346 2347 // Create the CALLSEQ_END node. 2348 unsigned NumBytesForCalleeToPush; 2349 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) 2350 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2351 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2352 // If this is a call to a struct-return function, the callee 2353 // pops the hidden struct pointer, so we have to push it back. 2354 // This is common for Darwin/X86, Linux & Mingw32 targets. 2355 NumBytesForCalleeToPush = 4; 2356 else 2357 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2358 2359 // Returns a flag for retval copy to use. 2360 if (!IsSibcall) { 2361 Chain = DAG.getCALLSEQ_END(Chain, 2362 DAG.getIntPtrConstant(NumBytes, true), 2363 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2364 true), 2365 InFlag); 2366 InFlag = Chain.getValue(1); 2367 } 2368 2369 // Handle result values, copying them out of physregs into vregs that we 2370 // return. 
2371 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2372 Ins, dl, DAG, InVals); 2373 } 2374 2375 2376 //===----------------------------------------------------------------------===// 2377 // Fast Calling Convention (tail call) implementation 2378 //===----------------------------------------------------------------------===// 2379 2380 // Like StdCall, the callee cleans up the arguments, except that ECX is 2381 // reserved for storing the tail called function address. Only 2 registers are 2382 // free for argument passing (inreg). Tail call optimization is performed 2383 // provided: 2384 // * tailcallopt is enabled 2385 // * caller/callee are fastcc 2386 // On X86_64 architecture with GOT-style position independent code only local 2387 // (within module) calls are supported at the moment. 2388 // To keep the stack aligned according to the platform ABI, the function 2389 // GetAlignedArgumentStackSize ensures that the argument delta is always a multiple 2390 // of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example) 2391 // If a tail called callee has more arguments than the caller, the 2392 // caller needs to make sure that there is room to move the RETADDR to. This is 2393 // achieved by reserving an area the size of the argument delta right after the 2394 // original RETADDR, but before the saved framepointer or the spilled registers, 2395 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4) 2396 // stack layout: 2397 // arg1 2398 // arg2 2399 // RETADDR 2400 // [ new RETADDR 2401 // move area ] 2402 // (possible EBP) 2403 // ESI 2404 // EDI 2405 // local1 .. 2406 2407 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for a 2408 /// 16 byte alignment requirement. 2409 unsigned 2410 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2411 SelectionDAG& DAG) const { 2412 MachineFunction &MF = DAG.getMachineFunction(); 2413 const TargetMachine &TM = MF.getTarget(); 2414 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 2415 unsigned StackAlignment = TFI.getStackAlignment(); 2416 uint64_t AlignMask = StackAlignment - 1; 2417 int64_t Offset = StackSize; 2418 uint64_t SlotSize = TD->getPointerSize(); 2419 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2420 // Number smaller than 12 so just add the difference. 2421 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2422 } else { 2423 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2424 Offset = ((~AlignMask) & Offset) + StackAlignment + 2425 (StackAlignment-SlotSize); 2426 } 2427 return Offset; 2428 } 2429 2430 /// MatchingStackOffset - Return true if the given stack call argument is 2431 /// already available in the same (relative) position of the caller's 2432 /// incoming argument stack.
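/// For example (editor's illustration, hypothetical @f/@g): on 32-bit x86, in
///   define i32 @f(i32 %a, i32 %b) { %r = tail call i32 @g(i32 %a, i32 %b) ... }
/// both outgoing arguments are just reloads of @f's own incoming fixed stack slots at
/// the same offsets and sizes, so no argument stores are needed and the call can be
/// emitted as a sibcall.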
2433static 2434bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2435 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2436 const X86InstrInfo *TII) { 2437 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2438 int FI = INT_MAX; 2439 if (Arg.getOpcode() == ISD::CopyFromReg) { 2440 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2441 if (!TargetRegisterInfo::isVirtualRegister(VR)) 2442 return false; 2443 MachineInstr *Def = MRI->getVRegDef(VR); 2444 if (!Def) 2445 return false; 2446 if (!Flags.isByVal()) { 2447 if (!TII->isLoadFromStackSlot(Def, FI)) 2448 return false; 2449 } else { 2450 unsigned Opcode = Def->getOpcode(); 2451 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2452 Def->getOperand(1).isFI()) { 2453 FI = Def->getOperand(1).getIndex(); 2454 Bytes = Flags.getByValSize(); 2455 } else 2456 return false; 2457 } 2458 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2459 if (Flags.isByVal()) 2460 // ByVal argument is passed in as a pointer but it's now being 2461 // dereferenced. e.g. 2462 // define @foo(%struct.X* %A) { 2463 // tail call @bar(%struct.X* byval %A) 2464 // } 2465 return false; 2466 SDValue Ptr = Ld->getBasePtr(); 2467 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2468 if (!FINode) 2469 return false; 2470 FI = FINode->getIndex(); 2471 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { 2472 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); 2473 FI = FINode->getIndex(); 2474 Bytes = Flags.getByValSize(); 2475 } else 2476 return false; 2477 2478 assert(FI != INT_MAX); 2479 if (!MFI->isFixedObjectIndex(FI)) 2480 return false; 2481 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2482} 2483 2484/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2485/// for tail call optimization. Targets which want to do tail call 2486/// optimization should implement this function. 2487bool 2488X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2489 CallingConv::ID CalleeCC, 2490 bool isVarArg, 2491 bool isCalleeStructRet, 2492 bool isCallerStructRet, 2493 const SmallVectorImpl<ISD::OutputArg> &Outs, 2494 const SmallVectorImpl<SDValue> &OutVals, 2495 const SmallVectorImpl<ISD::InputArg> &Ins, 2496 SelectionDAG& DAG) const { 2497 if (!IsTailCallConvention(CalleeCC) && 2498 CalleeCC != CallingConv::C) 2499 return false; 2500 2501 // If -tailcallopt is specified, make fastcc functions tail-callable. 2502 const MachineFunction &MF = DAG.getMachineFunction(); 2503 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2504 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2505 bool CCMatch = CallerCC == CalleeCC; 2506 2507 if (GuaranteedTailCallOpt) { 2508 if (IsTailCallConvention(CalleeCC) && CCMatch) 2509 return true; 2510 return false; 2511 } 2512 2513 // Look for obvious safe cases to perform tail call optimization that do not 2514 // require ABI changes. This is what gcc calls sibcall. 2515 2516 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2517 // emit a special epilogue. 2518 if (RegInfo->needsStackRealignment(MF)) 2519 return false; 2520 2521 // Also avoid sibcall optimization if either caller or callee uses struct 2522 // return semantics. 2523 if (isCalleeStructRet || isCallerStructRet) 2524 return false; 2525 2526 // An stdcall caller is expected to clean up its arguments; the callee 2527 // isn't going to do that. 
2528 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2529 return false; 2530 2531 // Do not sibcall optimize vararg calls unless all arguments are passed via 2532 // registers. 2533 if (isVarArg && !Outs.empty()) { 2534 2535 // Optimizing for varargs on Win64 is unlikely to be safe without 2536 // additional testing. 2537 if (Subtarget->isTargetWin64()) 2538 return false; 2539 2540 SmallVector<CCValAssign, 16> ArgLocs; 2541 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2542 getTargetMachine(), ArgLocs, *DAG.getContext()); 2543 2544 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2545 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) 2546 if (!ArgLocs[i].isRegLoc()) 2547 return false; 2548 } 2549 2550 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2551 // Therefore if it's not used by the call it is not safe to optimize this into 2552 // a sibcall. 2553 bool Unused = false; 2554 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2555 if (!Ins[i].Used) { 2556 Unused = true; 2557 break; 2558 } 2559 } 2560 if (Unused) { 2561 SmallVector<CCValAssign, 16> RVLocs; 2562 CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), 2563 getTargetMachine(), RVLocs, *DAG.getContext()); 2564 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2565 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2566 CCValAssign &VA = RVLocs[i]; 2567 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2568 return false; 2569 } 2570 } 2571 2572 // If the calling conventions do not match, then we'd better make sure the 2573 // results are returned in the same way as what the caller expects. 2574 if (!CCMatch) { 2575 SmallVector<CCValAssign, 16> RVLocs1; 2576 CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), 2577 getTargetMachine(), RVLocs1, *DAG.getContext()); 2578 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2579 2580 SmallVector<CCValAssign, 16> RVLocs2; 2581 CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), 2582 getTargetMachine(), RVLocs2, *DAG.getContext()); 2583 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2584 2585 if (RVLocs1.size() != RVLocs2.size()) 2586 return false; 2587 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2588 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2589 return false; 2590 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2591 return false; 2592 if (RVLocs1[i].isRegLoc()) { 2593 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2594 return false; 2595 } else { 2596 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2597 return false; 2598 } 2599 } 2600 } 2601 2602 // If the callee takes no arguments then go on to check the results of the 2603 // call. 2604 if (!Outs.empty()) { 2605 // Check if stack adjustment is needed. For now, do not do this if any 2606 // argument is passed on the stack. 2607 SmallVector<CCValAssign, 16> ArgLocs; 2608 CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), 2609 getTargetMachine(), ArgLocs, *DAG.getContext()); 2610 2611 // Allocate shadow area for Win64 2612 if (Subtarget->isTargetWin64()) { 2613 CCInfo.AllocateStack(32, 8); 2614 } 2615 2616 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2617 if (CCInfo.getNextStackOffset()) { 2618 MachineFunction &MF = DAG.getMachineFunction(); 2619 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2620 return false; 2621 2622 // Check if the arguments are already laid out in the right way as 2623 // the caller's fixed stack objects. 
2624 MachineFrameInfo *MFI = MF.getFrameInfo(); 2625 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2626 const X86InstrInfo *TII = 2627 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2628 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2629 CCValAssign &VA = ArgLocs[i]; 2630 SDValue Arg = OutVals[i]; 2631 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2632 if (VA.getLocInfo() == CCValAssign::Indirect) 2633 return false; 2634 if (!VA.isRegLoc()) { 2635 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2636 MFI, MRI, TII)) 2637 return false; 2638 } 2639 } 2640 } 2641 2642 // If the tailcall address may be in a register, then make sure it's 2643 // possible to register allocate for it. In 32-bit, the call address can 2644 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2645 // callee-saved registers are restored. These happen to be the same 2646 // registers used to pass 'inreg' arguments so watch out for those. 2647 if (!Subtarget->is64Bit() && 2648 !isa<GlobalAddressSDNode>(Callee) && 2649 !isa<ExternalSymbolSDNode>(Callee)) { 2650 unsigned NumInRegs = 0; 2651 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2652 CCValAssign &VA = ArgLocs[i]; 2653 if (!VA.isRegLoc()) 2654 continue; 2655 unsigned Reg = VA.getLocReg(); 2656 switch (Reg) { 2657 default: break; 2658 case X86::EAX: case X86::EDX: case X86::ECX: 2659 if (++NumInRegs == 3) 2660 return false; 2661 break; 2662 } 2663 } 2664 } 2665 } 2666 2667 return true; 2668} 2669 2670FastISel * 2671X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2672 return X86::createFastISel(funcInfo); 2673} 2674 2675 2676//===----------------------------------------------------------------------===// 2677// Other Lowering Hooks 2678//===----------------------------------------------------------------------===// 2679 2680static bool MayFoldLoad(SDValue Op) { 2681 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2682} 2683 2684static bool MayFoldIntoStore(SDValue Op) { 2685 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2686} 2687 2688static bool isTargetShuffle(unsigned Opcode) { 2689 switch(Opcode) { 2690 default: return false; 2691 case X86ISD::PSHUFD: 2692 case X86ISD::PSHUFHW: 2693 case X86ISD::PSHUFLW: 2694 case X86ISD::SHUFPD: 2695 case X86ISD::PALIGN: 2696 case X86ISD::SHUFPS: 2697 case X86ISD::MOVLHPS: 2698 case X86ISD::MOVLHPD: 2699 case X86ISD::MOVHLPS: 2700 case X86ISD::MOVLPS: 2701 case X86ISD::MOVLPD: 2702 case X86ISD::MOVSHDUP: 2703 case X86ISD::MOVSLDUP: 2704 case X86ISD::MOVDDUP: 2705 case X86ISD::MOVSS: 2706 case X86ISD::MOVSD: 2707 case X86ISD::UNPCKLPS: 2708 case X86ISD::UNPCKLPD: 2709 case X86ISD::VUNPCKLPSY: 2710 case X86ISD::VUNPCKLPDY: 2711 case X86ISD::PUNPCKLWD: 2712 case X86ISD::PUNPCKLBW: 2713 case X86ISD::PUNPCKLDQ: 2714 case X86ISD::PUNPCKLQDQ: 2715 case X86ISD::UNPCKHPS: 2716 case X86ISD::UNPCKHPD: 2717 case X86ISD::VUNPCKHPSY: 2718 case X86ISD::VUNPCKHPDY: 2719 case X86ISD::PUNPCKHWD: 2720 case X86ISD::PUNPCKHBW: 2721 case X86ISD::PUNPCKHDQ: 2722 case X86ISD::PUNPCKHQDQ: 2723 case X86ISD::VPERMILPS: 2724 case X86ISD::VPERMILPSY: 2725 case X86ISD::VPERMILPD: 2726 case X86ISD::VPERMILPDY: 2727 return true; 2728 } 2729 return false; 2730} 2731 2732static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2733 SDValue V1, SelectionDAG &DAG) { 2734 switch(Opc) { 2735 default: llvm_unreachable("Unknown x86 shuffle node"); 2736 case X86ISD::MOVSHDUP: 2737 case X86ISD::MOVSLDUP: 2738 case X86ISD::MOVDDUP: 2739 
return DAG.getNode(Opc, dl, VT, V1); 2740 } 2741 2742 return SDValue(); 2743} 2744 2745static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2746 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2747 switch(Opc) { 2748 default: llvm_unreachable("Unknown x86 shuffle node"); 2749 case X86ISD::PSHUFD: 2750 case X86ISD::PSHUFHW: 2751 case X86ISD::PSHUFLW: 2752 case X86ISD::VPERMILPS: 2753 case X86ISD::VPERMILPSY: 2754 case X86ISD::VPERMILPD: 2755 case X86ISD::VPERMILPDY: 2756 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2757 } 2758 2759 return SDValue(); 2760} 2761 2762static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2763 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2764 switch(Opc) { 2765 default: llvm_unreachable("Unknown x86 shuffle node"); 2766 case X86ISD::PALIGN: 2767 case X86ISD::SHUFPD: 2768 case X86ISD::SHUFPS: 2769 return DAG.getNode(Opc, dl, VT, V1, V2, 2770 DAG.getConstant(TargetMask, MVT::i8)); 2771 } 2772 return SDValue(); 2773} 2774 2775static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2776 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2777 switch(Opc) { 2778 default: llvm_unreachable("Unknown x86 shuffle node"); 2779 case X86ISD::MOVLHPS: 2780 case X86ISD::MOVLHPD: 2781 case X86ISD::MOVHLPS: 2782 case X86ISD::MOVLPS: 2783 case X86ISD::MOVLPD: 2784 case X86ISD::MOVSS: 2785 case X86ISD::MOVSD: 2786 case X86ISD::UNPCKLPS: 2787 case X86ISD::UNPCKLPD: 2788 case X86ISD::VUNPCKLPSY: 2789 case X86ISD::VUNPCKLPDY: 2790 case X86ISD::PUNPCKLWD: 2791 case X86ISD::PUNPCKLBW: 2792 case X86ISD::PUNPCKLDQ: 2793 case X86ISD::PUNPCKLQDQ: 2794 case X86ISD::UNPCKHPS: 2795 case X86ISD::UNPCKHPD: 2796 case X86ISD::VUNPCKHPSY: 2797 case X86ISD::VUNPCKHPDY: 2798 case X86ISD::PUNPCKHWD: 2799 case X86ISD::PUNPCKHBW: 2800 case X86ISD::PUNPCKHDQ: 2801 case X86ISD::PUNPCKHQDQ: 2802 return DAG.getNode(Opc, dl, VT, V1, V2); 2803 } 2804 return SDValue(); 2805} 2806 2807SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2808 MachineFunction &MF = DAG.getMachineFunction(); 2809 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2810 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2811 2812 if (ReturnAddrIndex == 0) { 2813 // Set up a frame object for the return address. 2814 uint64_t SlotSize = TD->getPointerSize(); 2815 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2816 false); 2817 FuncInfo->setRAIndex(ReturnAddrIndex); 2818 } 2819 2820 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2821} 2822 2823 2824bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2825 bool hasSymbolicDisplacement) { 2826 // Offset should fit into 32 bit immediate field. 2827 if (!isInt<32>(Offset)) 2828 return false; 2829 2830 // If we don't have a symbolic displacement - we don't have any extra 2831 // restrictions. 2832 if (!hasSymbolicDisplacement) 2833 return true; 2834 2835 // FIXME: Some tweaks might be needed for medium code model. 2836 if (M != CodeModel::Small && M != CodeModel::Kernel) 2837 return false; 2838 2839 // For small code model we assume that latest object is 16MB before end of 31 2840 // bits boundary. We may also accept pretty large negative constants knowing 2841 // that all objects are in the positive half of address space. 
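// Editor's note (illustrative): e.g. with a symbolic displacement under the small code
// model, an addend of 4 MiB is accepted (4 MiB < 16 MiB) while an addend of 32 MiB is
// rejected, since the symbol itself may already sit within 16 MiB of the 2 GiB
// boundary; under the kernel code model only strictly positive offsets are accepted
// (see the check below).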
2842 if (M == CodeModel::Small && Offset < 16*1024*1024) 2843 return true; 2844 2845 // For the kernel code model we know that all objects reside in the negative half 2846 // of the 32-bit address space. We may not accept negative offsets, since they may 2847 // be just out of range, but we may accept pretty large positive ones. 2848 if (M == CodeModel::Kernel && Offset > 0) 2849 return true; 2850 2851 return false; 2852 } 2853 2854 /// isCalleePop - Determines whether the callee is required to pop its 2855 /// own arguments. Callee pop is necessary to support tail calls. 2856 bool X86::isCalleePop(CallingConv::ID CallingConv, 2857 bool is64Bit, bool IsVarArg, bool TailCallOpt) { 2858 if (IsVarArg) 2859 return false; 2860 2861 switch (CallingConv) { 2862 default: 2863 return false; 2864 case CallingConv::X86_StdCall: 2865 return !is64Bit; 2866 case CallingConv::X86_FastCall: 2867 return !is64Bit; 2868 case CallingConv::X86_ThisCall: 2869 return !is64Bit; 2870 case CallingConv::Fast: 2871 return TailCallOpt; 2872 case CallingConv::GHC: 2873 return TailCallOpt; 2874 } 2875 } 2876 2877 /// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the X86 2878 /// specific condition code, returning the condition code and the LHS/RHS of the 2879 /// comparison to make. 2880 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2881 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2882 if (!isFP) { 2883 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2884 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2885 // X > -1 -> X == 0, jump !sign. 2886 RHS = DAG.getConstant(0, RHS.getValueType()); 2887 return X86::COND_NS; 2888 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2889 // X < 0 -> X == 0, jump on sign. 2890 return X86::COND_S; 2891 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2892 // X < 1 -> X <= 0 2893 RHS = DAG.getConstant(0, RHS.getValueType()); 2894 return X86::COND_LE; 2895 } 2896 } 2897 2898 switch (SetCCOpcode) { 2899 default: llvm_unreachable("Invalid integer condition!"); 2900 case ISD::SETEQ: return X86::COND_E; 2901 case ISD::SETGT: return X86::COND_G; 2902 case ISD::SETGE: return X86::COND_GE; 2903 case ISD::SETLT: return X86::COND_L; 2904 case ISD::SETLE: return X86::COND_LE; 2905 case ISD::SETNE: return X86::COND_NE; 2906 case ISD::SETULT: return X86::COND_B; 2907 case ISD::SETUGT: return X86::COND_A; 2908 case ISD::SETULE: return X86::COND_BE; 2909 case ISD::SETUGE: return X86::COND_AE; 2910 } 2911 } 2912 2913 // First determine if it is required or is profitable to flip the operands. 2914 2915 // If LHS is a foldable load, but RHS is not, flip the condition.
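// Editor's note (illustrative): e.g. for a SETOLT comparison whose LHS is a plain load
// and whose RHS is not, the operands are swapped and the condcode becomes SETOGT; the
// load then sits on the RHS, which is the position where the x86 compare instructions
// (CMP/UCOMISS/UCOMISD) can fold a memory operand.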
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// hasFPCMov - is there a floating point cmov for the specific X86 condition
/// code. The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0 || Val == CmpVal)
    return true;
  return false;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
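/// For example, on v4i32 the mask <2, 1, 0, 3> is accepted, while <4, 1, 2, 3>
/// is not, since index 4 would select an element from the second operand.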
3010static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3011 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 3012 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 3013 if (VT == MVT::v2f64 || VT == MVT::v2i64) 3014 return (Mask[0] < 2 && Mask[1] < 2); 3015 return false; 3016} 3017 3018bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 3019 SmallVector<int, 8> M; 3020 N->getMask(M); 3021 return ::isPSHUFDMask(M, N->getValueType(0)); 3022} 3023 3024/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 3025/// is suitable for input to PSHUFHW. 3026static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3027 if (VT != MVT::v8i16) 3028 return false; 3029 3030 // Lower quadword copied in order or undef. 3031 for (int i = 0; i != 4; ++i) 3032 if (Mask[i] >= 0 && Mask[i] != i) 3033 return false; 3034 3035 // Upper quadword shuffled. 3036 for (int i = 4; i != 8; ++i) 3037 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 3038 return false; 3039 3040 return true; 3041} 3042 3043bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 3044 SmallVector<int, 8> M; 3045 N->getMask(M); 3046 return ::isPSHUFHWMask(M, N->getValueType(0)); 3047} 3048 3049/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 3050/// is suitable for input to PSHUFLW. 3051static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3052 if (VT != MVT::v8i16) 3053 return false; 3054 3055 // Upper quadword copied in order. 3056 for (int i = 4; i != 8; ++i) 3057 if (Mask[i] >= 0 && Mask[i] != i) 3058 return false; 3059 3060 // Lower quadword shuffled. 3061 for (int i = 0; i != 4; ++i) 3062 if (Mask[i] >= 4) 3063 return false; 3064 3065 return true; 3066} 3067 3068bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 3069 SmallVector<int, 8> M; 3070 N->getMask(M); 3071 return ::isPSHUFLWMask(M, N->getValueType(0)); 3072} 3073 3074/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 3075/// is suitable for input to PALIGNR. 3076static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 3077 bool hasSSSE3) { 3078 int i, e = VT.getVectorNumElements(); 3079 3080 // Do not handle v2i64 / v2f64 shuffles with palignr. 3081 if (e < 4 || !hasSSSE3) 3082 return false; 3083 3084 for (i = 0; i != e; ++i) 3085 if (Mask[i] >= 0) 3086 break; 3087 3088 // All undef, not a palignr. 3089 if (i == e) 3090 return false; 3091 3092 // Make sure we're shifting in the right direction. 3093 if (Mask[i] <= i) 3094 return false; 3095 3096 int s = Mask[i] - i; 3097 3098 // Check the rest of the elements to see if they are consecutive. 3099 for (++i; i != e; ++i) { 3100 int m = Mask[i]; 3101 if (m >= 0 && m != s+i) 3102 return false; 3103 } 3104 return true; 3105} 3106 3107bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 3108 SmallVector<int, 8> M; 3109 N->getMask(M); 3110 return ::isPALIGNRMask(M, N->getValueType(0), true); 3111} 3112 3113/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 3114/// specifies a shuffle of elements that is suitable for input to SHUFP*. 
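/// For example, on a 4-element vector the mask <2, 0, 7, 4> is accepted: the
/// low half of the result selects from the first operand and the high half
/// selects from the second.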
3115static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3116 int NumElems = VT.getVectorNumElements(); 3117 if (NumElems != 2 && NumElems != 4) 3118 return false; 3119 3120 int Half = NumElems / 2; 3121 for (int i = 0; i < Half; ++i) 3122 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3123 return false; 3124 for (int i = Half; i < NumElems; ++i) 3125 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3126 return false; 3127 3128 return true; 3129} 3130 3131bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 3132 SmallVector<int, 8> M; 3133 N->getMask(M); 3134 return ::isSHUFPMask(M, N->getValueType(0)); 3135} 3136 3137/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 3138/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 3139/// half elements to come from vector 1 (which would equal the dest.) and 3140/// the upper half to come from vector 2. 3141static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3142 int NumElems = VT.getVectorNumElements(); 3143 3144 if (NumElems != 2 && NumElems != 4) 3145 return false; 3146 3147 int Half = NumElems / 2; 3148 for (int i = 0; i < Half; ++i) 3149 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 3150 return false; 3151 for (int i = Half; i < NumElems; ++i) 3152 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 3153 return false; 3154 return true; 3155} 3156 3157static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 3158 SmallVector<int, 8> M; 3159 N->getMask(M); 3160 return isCommutedSHUFPMask(M, N->getValueType(0)); 3161} 3162 3163/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3164/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 3165bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 3166 if (N->getValueType(0).getVectorNumElements() != 4) 3167 return false; 3168 3169 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3170 return isUndefOrEqual(N->getMaskElt(0), 6) && 3171 isUndefOrEqual(N->getMaskElt(1), 7) && 3172 isUndefOrEqual(N->getMaskElt(2), 2) && 3173 isUndefOrEqual(N->getMaskElt(3), 3); 3174} 3175 3176/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3177/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3178/// <2, 3, 2, 3> 3179bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 3180 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3181 3182 if (NumElems != 4) 3183 return false; 3184 3185 return isUndefOrEqual(N->getMaskElt(0), 2) && 3186 isUndefOrEqual(N->getMaskElt(1), 3) && 3187 isUndefOrEqual(N->getMaskElt(2), 2) && 3188 isUndefOrEqual(N->getMaskElt(3), 3); 3189} 3190 3191/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3192/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3193bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3194 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3195 3196 if (NumElems != 2 && NumElems != 4) 3197 return false; 3198 3199 for (unsigned i = 0; i < NumElems/2; ++i) 3200 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3201 return false; 3202 3203 for (unsigned i = NumElems/2; i < NumElems; ++i) 3204 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3205 return false; 3206 3207 return true; 3208} 3209 3210/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3211/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 
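/// For a 4-element vector this corresponds to the mask <0, 1, 4, 5>, i.e. the
/// result is the low half of V1 followed by the low half of V2.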
3212bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3213 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3214 3215 if ((NumElems != 2 && NumElems != 4) 3216 || N->getValueType(0).getSizeInBits() > 128) 3217 return false; 3218 3219 for (unsigned i = 0; i < NumElems/2; ++i) 3220 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3221 return false; 3222 3223 for (unsigned i = 0; i < NumElems/2; ++i) 3224 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3225 return false; 3226 3227 return true; 3228} 3229 3230/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3231/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3232static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3233 bool V2IsSplat = false) { 3234 int NumElts = VT.getVectorNumElements(); 3235 3236 assert((VT.is128BitVector() || VT.is256BitVector()) && 3237 "Unsupported vector type for unpckh"); 3238 3239 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3240 return false; 3241 3242 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3243 // independently on 128-bit lanes. 3244 unsigned NumLanes = VT.getSizeInBits()/128; 3245 unsigned NumLaneElts = NumElts/NumLanes; 3246 3247 unsigned Start = 0; 3248 unsigned End = NumLaneElts; 3249 for (unsigned s = 0; s < NumLanes; ++s) { 3250 for (unsigned i = Start, j = s * NumLaneElts; 3251 i != End; 3252 i += 2, ++j) { 3253 int BitI = Mask[i]; 3254 int BitI1 = Mask[i+1]; 3255 if (!isUndefOrEqual(BitI, j)) 3256 return false; 3257 if (V2IsSplat) { 3258 if (!isUndefOrEqual(BitI1, NumElts)) 3259 return false; 3260 } else { 3261 if (!isUndefOrEqual(BitI1, j + NumElts)) 3262 return false; 3263 } 3264 } 3265 // Process the next 128 bits. 3266 Start += NumLaneElts; 3267 End += NumLaneElts; 3268 } 3269 3270 return true; 3271} 3272 3273bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3274 SmallVector<int, 8> M; 3275 N->getMask(M); 3276 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3277} 3278 3279/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3280/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3281static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3282 bool V2IsSplat = false) { 3283 int NumElts = VT.getVectorNumElements(); 3284 3285 assert((VT.is128BitVector() || VT.is256BitVector()) && 3286 "Unsupported vector type for unpckh"); 3287 3288 if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) 3289 return false; 3290 3291 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3292 // independently on 128-bit lanes. 3293 unsigned NumLanes = VT.getSizeInBits()/128; 3294 unsigned NumLaneElts = NumElts/NumLanes; 3295 3296 unsigned Start = 0; 3297 unsigned End = NumLaneElts; 3298 for (unsigned l = 0; l != NumLanes; ++l) { 3299 for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2; 3300 i != End; i += 2, ++j) { 3301 int BitI = Mask[i]; 3302 int BitI1 = Mask[i+1]; 3303 if (!isUndefOrEqual(BitI, j)) 3304 return false; 3305 if (V2IsSplat) { 3306 if (isUndefOrEqual(BitI1, NumElts)) 3307 return false; 3308 } else { 3309 if (!isUndefOrEqual(BitI1, j+NumElts)) 3310 return false; 3311 } 3312 } 3313 // Process the next 128 bits. 
3314 Start += NumLaneElts; 3315 End += NumLaneElts; 3316 } 3317 return true; 3318} 3319 3320bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3321 SmallVector<int, 8> M; 3322 N->getMask(M); 3323 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3324} 3325 3326/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3327/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3328/// <0, 0, 1, 1> 3329static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3330 int NumElems = VT.getVectorNumElements(); 3331 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3332 return false; 3333 3334 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate 3335 // independently on 128-bit lanes. 3336 unsigned NumLanes = VT.getSizeInBits() / 128; 3337 unsigned NumLaneElts = NumElems / NumLanes; 3338 3339 for (unsigned s = 0; s < NumLanes; ++s) { 3340 for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; 3341 i != NumLaneElts * (s + 1); 3342 i += 2, ++j) { 3343 int BitI = Mask[i]; 3344 int BitI1 = Mask[i+1]; 3345 3346 if (!isUndefOrEqual(BitI, j)) 3347 return false; 3348 if (!isUndefOrEqual(BitI1, j)) 3349 return false; 3350 } 3351 } 3352 3353 return true; 3354} 3355 3356bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3357 SmallVector<int, 8> M; 3358 N->getMask(M); 3359 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3360} 3361 3362/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3363/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3364/// <2, 2, 3, 3> 3365static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3366 int NumElems = VT.getVectorNumElements(); 3367 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3368 return false; 3369 3370 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3371 int BitI = Mask[i]; 3372 int BitI1 = Mask[i+1]; 3373 if (!isUndefOrEqual(BitI, j)) 3374 return false; 3375 if (!isUndefOrEqual(BitI1, j)) 3376 return false; 3377 } 3378 return true; 3379} 3380 3381bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3382 SmallVector<int, 8> M; 3383 N->getMask(M); 3384 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3385} 3386 3387/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3388/// specifies a shuffle of elements that is suitable for input to MOVSS, 3389/// MOVSD, and MOVD, i.e. setting the lowest element. 3390static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3391 if (VT.getVectorElementType().getSizeInBits() < 32) 3392 return false; 3393 3394 int NumElts = VT.getVectorNumElements(); 3395 3396 if (!isUndefOrEqual(Mask[0], NumElts)) 3397 return false; 3398 3399 for (int i = 1; i < NumElts; ++i) 3400 if (!isUndefOrEqual(Mask[i], i)) 3401 return false; 3402 3403 return true; 3404} 3405 3406bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3407 SmallVector<int, 8> M; 3408 N->getMask(M); 3409 return ::isMOVLMask(M, N->getValueType(0)); 3410} 3411 3412/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand 3413/// specifies a shuffle of elements that is suitable for input to VPERMILPD*. 3414/// Note that VPERMIL mask matching is different depending whether theunderlying 3415/// type is 32 or 64. In the VPERMILPS the high half of the mask should point 3416/// to the same elements of the low, but to the higher half of the source. 
/// In VPERMILPD the two lanes could be shuffled independently of each other
/// with the same restriction that lanes can't be crossed.
static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
                            const X86Subtarget *Subtarget) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits()/128;

  if (!Subtarget->hasAVX())
    return false;

  // Match any permutation of a 128-bit vector with 64-bit element types
  if (NumLanes == 1 && NumElts != 2)
    return false;

  // Only match 256-bit vectors with 64-bit element types
  if (VT.getSizeInBits() == 256 && NumElts != 4)
    return false;

  // The mask on the high lane is independent of the low. Both can match
  // any element inside its own lane, but can't cross.
  int LaneSize = NumElts/NumLanes;
  for (int l = 0; l < NumLanes; ++l)
    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
      int LaneStart = l*LaneSize;
      if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
        return false;
    }

  return true;
}

/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
/// Note that VPERMIL mask matching is different depending on whether the
/// underlying type is 32 or 64 bits wide. In VPERMILPS the high half of the
/// mask should point to the same elements as the low half, but in the higher
/// half of the source. In VPERMILPD the two lanes could be shuffled
/// independently of each other with the same restriction that lanes can't be
/// crossed.
static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
                            const X86Subtarget *Subtarget) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;

  if (!Subtarget->hasAVX())
    return false;

  // Match any permutation of a 128-bit vector with 32-bit element types
  if (NumLanes == 1 && NumElts != 4)
    return false;

  // Only match 256-bit vectors with 32-bit element types
  if (VT.getSizeInBits() == 256 && NumElts != 8)
    return false;

  // The mask on the high lane should be the same as the low. Actually,
  // they can differ if any of the corresponding indices in a lane are undef.
  int LaneSize = NumElts/NumLanes;
  for (int i = 0; i < LaneSize; ++i) {
    int HighElt = i+LaneSize;
    if (Mask[i] < 0 || Mask[HighElt] < 0)
      continue;
    if (Mask[HighElt]-Mask[i] != LaneSize)
      return false;
  }

  return true;
}

/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERMILPS* instructions.
static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VT = SVOp->getValueType(0);

  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits()/128;

  unsigned Mask = 0;
  for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i)
    Mask |= SVOp->getMaskElt(i) << (i*2);

  return Mask;
}

/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with VPERMILPD* instructions.
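/// For example, the v4f64 mask <1, 0, 3, 2> (swap the two elements within each
/// 128-bit lane) encodes as the immediate 0b0101.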
static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VT = SVOp->getValueType(0);

  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits()/128;

  unsigned Mask = 0;
  int LaneSize = NumElts/NumLanes;
  for (int l = 0; l < NumLanes; ++l)
    for (int i = l*LaneSize; i < LaneSize*(l+1); ++i)
      Mask |= (SVOp->getMaskElt(i)-l*LaneSize) << i;

  return Mask;
}

/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
/// of what x86 movss wants. X86 movss requires the lowest element to be the
/// lowest element of vector 2 and the other elements to come from vector 1
/// in order; the commuted form checked here takes the lowest element from
/// vector 1 and the remaining elements from vector 2 in order.
static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                               bool V2IsSplat = false, bool V2IsUndef = false) {
  int NumOps = VT.getVectorNumElements();
  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
    return false;

  if (!isUndefOrEqual(Mask[0], 0))
    return false;

  for (int i = 1; i < NumOps; ++i)
    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
      return false;

  return true;
}

static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
}

/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
                         const X86Subtarget *Subtarget) {
  if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
    return false;

  // The second vector must be undef
  if (N->getOperand(1).getOpcode() != ISD::UNDEF)
    return false;

  EVT VT = N->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
      (VT.getSizeInBits() == 256 && NumElems != 8))
    return false;

  // "i+1" is the value the indexed mask element must have
  for (unsigned i = 0; i < NumElems; i += 2)
    if (!isUndefOrEqual(N->getMaskElt(i), i+1) ||
        !isUndefOrEqual(N->getMaskElt(i+1), i+1))
      return false;

  return true;
}

/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3577/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> 3578bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, 3579 const X86Subtarget *Subtarget) { 3580 if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) 3581 return false; 3582 3583 // The second vector must be undef 3584 if (N->getOperand(1).getOpcode() != ISD::UNDEF) 3585 return false; 3586 3587 EVT VT = N->getValueType(0); 3588 unsigned NumElems = VT.getVectorNumElements(); 3589 3590 if ((VT.getSizeInBits() == 128 && NumElems != 4) || 3591 (VT.getSizeInBits() == 256 && NumElems != 8)) 3592 return false; 3593 3594 // "i" is the value the indexed mask element must have 3595 for (unsigned i = 0; i < NumElems; i += 2) 3596 if (!isUndefOrEqual(N->getMaskElt(i), i) || 3597 !isUndefOrEqual(N->getMaskElt(i+1), i)) 3598 return false; 3599 3600 return true; 3601} 3602 3603/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3604/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3605bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3606 int e = N->getValueType(0).getVectorNumElements() / 2; 3607 3608 for (int i = 0; i < e; ++i) 3609 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3610 return false; 3611 for (int i = 0; i < e; ++i) 3612 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3613 return false; 3614 return true; 3615} 3616 3617/// isVEXTRACTF128Index - Return true if the specified 3618/// EXTRACT_SUBVECTOR operand specifies a vector extract that is 3619/// suitable for input to VEXTRACTF128. 3620bool X86::isVEXTRACTF128Index(SDNode *N) { 3621 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3622 return false; 3623 3624 // The index should be aligned on a 128-bit boundary. 3625 uint64_t Index = 3626 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3627 3628 unsigned VL = N->getValueType(0).getVectorNumElements(); 3629 unsigned VBits = N->getValueType(0).getSizeInBits(); 3630 unsigned ElSize = VBits / VL; 3631 bool Result = (Index * ElSize) % 128 == 0; 3632 3633 return Result; 3634} 3635 3636/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR 3637/// operand specifies a subvector insert that is suitable for input to 3638/// VINSERTF128. 3639bool X86::isVINSERTF128Index(SDNode *N) { 3640 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3641 return false; 3642 3643 // The index should be aligned on a 128-bit boundary. 3644 uint64_t Index = 3645 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3646 3647 unsigned VL = N->getValueType(0).getVectorNumElements(); 3648 unsigned VBits = N->getValueType(0).getSizeInBits(); 3649 unsigned ElSize = VBits / VL; 3650 bool Result = (Index * ElSize) % 128 == 0; 3651 3652 return Result; 3653} 3654 3655/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3656/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3657unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3658 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3659 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3660 3661 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3662 unsigned Mask = 0; 3663 for (int i = 0; i < NumOperands; ++i) { 3664 int Val = SVOp->getMaskElt(NumOperands-i-1); 3665 if (Val < 0) Val = 0; 3666 if (Val >= NumOperands) Val -= NumOperands; 3667 Mask |= Val; 3668 if (i != NumOperands - 1) 3669 Mask <<= Shift; 3670 } 3671 return Mask; 3672} 3673 3674/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3675/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3676unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3677 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3678 unsigned Mask = 0; 3679 // 8 nodes, but we only care about the last 4. 3680 for (unsigned i = 7; i >= 4; --i) { 3681 int Val = SVOp->getMaskElt(i); 3682 if (Val >= 0) 3683 Mask |= (Val - 4); 3684 if (i != 4) 3685 Mask <<= 2; 3686 } 3687 return Mask; 3688} 3689 3690/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3691/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3692unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3693 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3694 unsigned Mask = 0; 3695 // 8 nodes, but we only care about the first 4. 3696 for (int i = 3; i >= 0; --i) { 3697 int Val = SVOp->getMaskElt(i); 3698 if (Val >= 0) 3699 Mask |= Val; 3700 if (i != 0) 3701 Mask <<= 2; 3702 } 3703 return Mask; 3704} 3705 3706/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3707/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3708unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3709 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3710 EVT VVT = N->getValueType(0); 3711 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3712 int Val = 0; 3713 3714 unsigned i, e; 3715 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3716 Val = SVOp->getMaskElt(i); 3717 if (Val >= 0) 3718 break; 3719 } 3720 assert(Val - i > 0 && "PALIGNR imm should be positive"); 3721 return (Val - i) * EltSize; 3722} 3723 3724/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate 3725/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 3726/// instructions. 3727unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { 3728 if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) 3729 llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); 3730 3731 uint64_t Index = 3732 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); 3733 3734 EVT VecVT = N->getOperand(0).getValueType(); 3735 EVT ElVT = VecVT.getVectorElementType(); 3736 3737 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3738 return Index / NumElemsPerChunk; 3739} 3740 3741/// getInsertVINSERTF128Immediate - Return the appropriate immediate 3742/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 3743/// instructions. 3744unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { 3745 if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) 3746 llvm_unreachable("Illegal insert subvector for VINSERTF128"); 3747 3748 uint64_t Index = 3749 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); 3750 3751 EVT VecVT = N->getValueType(0); 3752 EVT ElVT = VecVT.getVectorElementType(); 3753 3754 unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); 3755 return Index / NumElemsPerChunk; 3756} 3757 3758/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3759/// constant +0.0. 
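/// Note that a floating point -0.0 is not considered a zero node here, since
/// its bit pattern is nonzero.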
3760bool X86::isZeroNode(SDValue Elt) { 3761 return ((isa<ConstantSDNode>(Elt) && 3762 cast<ConstantSDNode>(Elt)->isNullValue()) || 3763 (isa<ConstantFPSDNode>(Elt) && 3764 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3765} 3766 3767/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3768/// their permute mask. 3769static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3770 SelectionDAG &DAG) { 3771 EVT VT = SVOp->getValueType(0); 3772 unsigned NumElems = VT.getVectorNumElements(); 3773 SmallVector<int, 8> MaskVec; 3774 3775 for (unsigned i = 0; i != NumElems; ++i) { 3776 int idx = SVOp->getMaskElt(i); 3777 if (idx < 0) 3778 MaskVec.push_back(idx); 3779 else if (idx < (int)NumElems) 3780 MaskVec.push_back(idx + NumElems); 3781 else 3782 MaskVec.push_back(idx - NumElems); 3783 } 3784 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3785 SVOp->getOperand(0), &MaskVec[0]); 3786} 3787 3788/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3789/// the two vector operands have swapped position. 3790static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3791 unsigned NumElems = VT.getVectorNumElements(); 3792 for (unsigned i = 0; i != NumElems; ++i) { 3793 int idx = Mask[i]; 3794 if (idx < 0) 3795 continue; 3796 else if (idx < (int)NumElems) 3797 Mask[i] = idx + NumElems; 3798 else 3799 Mask[i] = idx - NumElems; 3800 } 3801} 3802 3803/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3804/// match movhlps. The lower half elements should come from upper half of 3805/// V1 (and in order), and the upper half elements should come from the upper 3806/// half of V2 (and in order). 3807static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3808 if (Op->getValueType(0).getVectorNumElements() != 4) 3809 return false; 3810 for (unsigned i = 0, e = 2; i != e; ++i) 3811 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3812 return false; 3813 for (unsigned i = 2; i != 4; ++i) 3814 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3815 return false; 3816 return true; 3817} 3818 3819/// isScalarLoadToVector - Returns true if the node is a scalar load that 3820/// is promoted to a vector. It also returns the LoadSDNode by reference if 3821/// required. 3822static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3823 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3824 return false; 3825 N = N->getOperand(0).getNode(); 3826 if (!ISD::isNON_EXTLoad(N)) 3827 return false; 3828 if (LD) 3829 *LD = cast<LoadSDNode>(N); 3830 return true; 3831} 3832 3833/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3834/// match movlp{s|d}. The lower half elements should come from lower half of 3835/// V1 (and in order), and the upper half elements should come from the upper 3836/// half of V2 (and in order). And since V1 will become the source of the 3837/// MOVLP, it must be either a vector load or a scalar load to vector. 3838static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3839 ShuffleVectorSDNode *Op) { 3840 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3841 return false; 3842 // Is V2 is a vector load, don't do this transformation. We will try to use 3843 // load folding shufps op. 
3844 if (ISD::isNON_EXTLoad(V2)) 3845 return false; 3846 3847 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3848 3849 if (NumElems != 2 && NumElems != 4) 3850 return false; 3851 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3852 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3853 return false; 3854 for (unsigned i = NumElems/2; i != NumElems; ++i) 3855 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3856 return false; 3857 return true; 3858} 3859 3860/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3861/// all the same. 3862static bool isSplatVector(SDNode *N) { 3863 if (N->getOpcode() != ISD::BUILD_VECTOR) 3864 return false; 3865 3866 SDValue SplatValue = N->getOperand(0); 3867 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3868 if (N->getOperand(i) != SplatValue) 3869 return false; 3870 return true; 3871} 3872 3873/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3874/// to an zero vector. 3875/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3876static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3877 SDValue V1 = N->getOperand(0); 3878 SDValue V2 = N->getOperand(1); 3879 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3880 for (unsigned i = 0; i != NumElems; ++i) { 3881 int Idx = N->getMaskElt(i); 3882 if (Idx >= (int)NumElems) { 3883 unsigned Opc = V2.getOpcode(); 3884 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3885 continue; 3886 if (Opc != ISD::BUILD_VECTOR || 3887 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3888 return false; 3889 } else if (Idx >= 0) { 3890 unsigned Opc = V1.getOpcode(); 3891 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3892 continue; 3893 if (Opc != ISD::BUILD_VECTOR || 3894 !X86::isZeroNode(V1.getOperand(Idx))) 3895 return false; 3896 } 3897 } 3898 return true; 3899} 3900 3901/// getZeroVector - Returns a vector of specified type with all zero elements. 3902/// 3903static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3904 DebugLoc dl) { 3905 assert(VT.isVector() && "Expected a vector type"); 3906 3907 // Always build SSE zero vectors as <4 x i32> bitcasted 3908 // to their dest type. This ensures they get CSE'd. 3909 SDValue Vec; 3910 if (VT.getSizeInBits() == 128) { // SSE 3911 if (HasSSE2) { // SSE2 3912 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3913 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3914 } else { // SSE1 3915 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3916 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3917 } 3918 } else if (VT.getSizeInBits() == 256) { // AVX 3919 // 256-bit logic and arithmetic instructions in AVX are 3920 // all floating-point, no support for integer ops. Default 3921 // to emitting fp zeroed vectors then. 3922 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3923 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3924 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3925 } 3926 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3927} 3928 3929/// getOnesVector - Returns a vector of specified type with all bits set. 3930/// Always build ones vectors as <4 x i32>. For 256-bit types, use two 3931/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their 3932/// original type, ensuring they get CSE'd. 
3933static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3934 assert(VT.isVector() && "Expected a vector type"); 3935 assert((VT.is128BitVector() || VT.is256BitVector()) 3936 && "Expected a 128-bit or 256-bit vector type"); 3937 3938 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3939 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 3940 Cst, Cst, Cst, Cst); 3941 3942 if (VT.is256BitVector()) { 3943 SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), 3944 Vec, DAG.getConstant(0, MVT::i32), DAG, dl); 3945 Vec = Insert128BitVector(InsV, Vec, 3946 DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); 3947 } 3948 3949 return DAG.getNode(ISD::BITCAST, dl, VT, Vec); 3950} 3951 3952/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3953/// that point to V2 points to its first element. 3954static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3955 EVT VT = SVOp->getValueType(0); 3956 unsigned NumElems = VT.getVectorNumElements(); 3957 3958 bool Changed = false; 3959 SmallVector<int, 8> MaskVec; 3960 SVOp->getMask(MaskVec); 3961 3962 for (unsigned i = 0; i != NumElems; ++i) { 3963 if (MaskVec[i] > (int)NumElems) { 3964 MaskVec[i] = NumElems; 3965 Changed = true; 3966 } 3967 } 3968 if (Changed) 3969 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3970 SVOp->getOperand(1), &MaskVec[0]); 3971 return SDValue(SVOp, 0); 3972} 3973 3974/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3975/// operation of specified width. 3976static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3977 SDValue V2) { 3978 unsigned NumElems = VT.getVectorNumElements(); 3979 SmallVector<int, 8> Mask; 3980 Mask.push_back(NumElems); 3981 for (unsigned i = 1; i != NumElems; ++i) 3982 Mask.push_back(i); 3983 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3984} 3985 3986/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3987static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3988 SDValue V2) { 3989 unsigned NumElems = VT.getVectorNumElements(); 3990 SmallVector<int, 8> Mask; 3991 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3992 Mask.push_back(i); 3993 Mask.push_back(i + NumElems); 3994 } 3995 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3996} 3997 3998/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 3999static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 4000 SDValue V2) { 4001 unsigned NumElems = VT.getVectorNumElements(); 4002 unsigned Half = NumElems/2; 4003 SmallVector<int, 8> Mask; 4004 for (unsigned i = 0; i != Half; ++i) { 4005 Mask.push_back(i + Half); 4006 Mask.push_back(i + NumElems + Half); 4007 } 4008 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 4009} 4010 4011// PromoteSplatv8v16 - All i16 and i8 vector types can't be used directly by 4012// a generic shuffle instruction because the target has no such instructions. 4013// Generate shuffles which repeat i16 and i8 several times until they can be 4014// represented by v4f32 and then be manipulated by target suported shuffles. 
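// For example, to splat element 5 of a v8i16, a single unpackh produces
// <V4,V4,V5,V5,V6,V6,V7,V7>; the desired value now fills v4f32 element 1,
// which getLegalSplat can splat with a 32-bit shuffle.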
4015static SDValue PromoteSplatv8v16(SDValue V, SelectionDAG &DAG, int &EltNo) { 4016 EVT VT = V.getValueType(); 4017 int NumElems = VT.getVectorNumElements(); 4018 DebugLoc dl = V.getDebugLoc(); 4019 4020 while (NumElems > 4) { 4021 if (EltNo < NumElems/2) { 4022 V = getUnpackl(DAG, dl, VT, V, V); 4023 } else { 4024 V = getUnpackh(DAG, dl, VT, V, V); 4025 EltNo -= NumElems/2; 4026 } 4027 NumElems >>= 1; 4028 } 4029 return V; 4030} 4031 4032/// getLegalSplat - Generate a legal splat with supported x86 shuffles 4033static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { 4034 EVT VT = V.getValueType(); 4035 DebugLoc dl = V.getDebugLoc(); 4036 assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) 4037 && "Vector size not supported"); 4038 4039 bool Is128 = VT.getSizeInBits() == 128; 4040 EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32; 4041 V = DAG.getNode(ISD::BITCAST, dl, NVT, V); 4042 4043 if (Is128) { 4044 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 4045 V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]); 4046 } else { 4047 // The second half of indicies refer to the higher part, which is a 4048 // duplication of the lower one. This makes this shuffle a perfect match 4049 // for the VPERM instruction. 4050 int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, 4051 EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; 4052 V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]); 4053 } 4054 4055 return DAG.getNode(ISD::BITCAST, dl, VT, V); 4056} 4057 4058/// PromoteVectorToScalarSplat - Since there's no native support for 4059/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector + 4060/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the 4061/// shuffle before the insertion, this yields less instructions in the end. 4062static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV, 4063 SelectionDAG &DAG) { 4064 EVT SrcVT = SV->getValueType(0); 4065 SDValue V1 = SV->getOperand(0); 4066 DebugLoc dl = SV->getDebugLoc(); 4067 int NumElems = SrcVT.getVectorNumElements(); 4068 4069 assert(SrcVT.is256BitVector() && "unknown howto handle vector type"); 4070 4071 SmallVector<int, 4> Mask; 4072 for (int i = 0; i < NumElems/2; ++i) 4073 Mask.push_back(SV->getMaskElt(i)); 4074 4075 EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), 4076 NumElems/2); 4077 SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1), 4078 DAG.getUNDEF(SVT), &Mask[0]); 4079 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1, 4080 DAG.getConstant(0, MVT::i32), DAG, dl); 4081 4082 return Insert128BitVector(InsV, SV1, 4083 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4084} 4085 4086/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and 4087/// v8i32, v16i16 or v32i8 to v8f32. 4088static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 4089 EVT SrcVT = SV->getValueType(0); 4090 SDValue V1 = SV->getOperand(0); 4091 DebugLoc dl = SV->getDebugLoc(); 4092 4093 int EltNo = SV->getSplatIndex(); 4094 int NumElems = SrcVT.getVectorNumElements(); 4095 unsigned Size = SrcVT.getSizeInBits(); 4096 4097 // Extract the 128-bit part containing the splat element and update 4098 // the splat element index when it refers to the higher register. 4099 if (Size == 256) { 4100 unsigned Idx = (EltNo > NumElems/2) ? 
NumElems/2 : 0; 4101 V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); 4102 if (Idx > 0) 4103 EltNo -= NumElems/2; 4104 } 4105 4106 // Make this 128-bit vector duplicate i8 and i16 elements 4107 if (NumElems > 4) 4108 V1 = PromoteSplatv8v16(V1, DAG, EltNo); 4109 4110 // Recreate the 256-bit vector and place the same 128-bit vector 4111 // into the low and high part. This is necessary because we want 4112 // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles 4113 // inside each separate v4f32 lane. 4114 if (Size == 256) { 4115 SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, 4116 DAG.getConstant(0, MVT::i32), DAG, dl); 4117 V1 = Insert128BitVector(InsV, V1, 4118 DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); 4119 } 4120 4121 return getLegalSplat(DAG, V1, EltNo); 4122} 4123 4124/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 4125/// vector of zero or undef vector. This produces a shuffle where the low 4126/// element of V2 is swizzled into the zero/undef vector, landing at element 4127/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 4128static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 4129 bool isZero, bool HasSSE2, 4130 SelectionDAG &DAG) { 4131 EVT VT = V2.getValueType(); 4132 SDValue V1 = isZero 4133 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 4134 unsigned NumElems = VT.getVectorNumElements(); 4135 SmallVector<int, 16> MaskVec; 4136 for (unsigned i = 0; i != NumElems; ++i) 4137 // If this is the insertion idx, put the low elt of V2 here. 4138 MaskVec.push_back(i == Idx ? NumElems : i); 4139 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 4140} 4141 4142/// getShuffleScalarElt - Returns the scalar element that will make up the ith 4143/// element of the result of the vector shuffle. 4144static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 4145 unsigned Depth) { 4146 if (Depth == 6) 4147 return SDValue(); // Limit search depth. 4148 4149 SDValue V = SDValue(N, 0); 4150 EVT VT = V.getValueType(); 4151 unsigned Opcode = V.getOpcode(); 4152 4153 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 4154 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 4155 Index = SV->getMaskElt(Index); 4156 4157 if (Index < 0) 4158 return DAG.getUNDEF(VT.getVectorElementType()); 4159 4160 int NumElems = VT.getVectorNumElements(); 4161 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 4162 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 4163 } 4164 4165 // Recurse into target specific vector shuffles to find scalars. 
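  // Each target shuffle handled below is decoded into ShuffleMask (using its
  // immediate operand where one exists), and the search then continues into
  // whichever source operand supplies the requested element.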
  if (isTargetShuffle(Opcode)) {
    int NumElems = VT.getVectorNumElements();
    SmallVector<unsigned, 16> ShuffleMask;
    SDValue ImmN;

    switch(Opcode) {
    case X86ISD::SHUFPS:
    case X86ISD::SHUFPD:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeSHUFPSMask(NumElems,
                       cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       ShuffleMask);
      break;
    case X86ISD::PUNPCKHBW:
    case X86ISD::PUNPCKHWD:
    case X86ISD::PUNPCKHDQ:
    case X86ISD::PUNPCKHQDQ:
      DecodePUNPCKHMask(NumElems, ShuffleMask);
      break;
    case X86ISD::UNPCKHPS:
    case X86ISD::UNPCKHPD:
    case X86ISD::VUNPCKHPSY:
    case X86ISD::VUNPCKHPDY:
      DecodeUNPCKHPMask(NumElems, ShuffleMask);
      break;
    case X86ISD::PUNPCKLBW:
    case X86ISD::PUNPCKLWD:
    case X86ISD::PUNPCKLDQ:
    case X86ISD::PUNPCKLQDQ:
      DecodePUNPCKLMask(VT, ShuffleMask);
      break;
    case X86ISD::UNPCKLPS:
    case X86ISD::UNPCKLPD:
    case X86ISD::VUNPCKLPSY:
    case X86ISD::VUNPCKLPDY:
      DecodeUNPCKLPMask(VT, ShuffleMask);
      break;
    case X86ISD::MOVHLPS:
      DecodeMOVHLPSMask(NumElems, ShuffleMask);
      break;
    case X86ISD::MOVLHPS:
      DecodeMOVLHPSMask(NumElems, ShuffleMask);
      break;
    case X86ISD::PSHUFD:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFMask(NumElems,
                      cast<ConstantSDNode>(ImmN)->getZExtValue(),
                      ShuffleMask);
      break;
    case X86ISD::PSHUFHW:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::PSHUFLW:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::MOVSS:
    case X86ISD::MOVSD: {
      // The index 0 always comes from the first element of the second source;
      // this is why MOVSS and MOVSD are used in the first place. The other
      // elements come from the other positions of the first source vector.
      unsigned OpNum = (Index == 0) ? 1 : 0;
      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                 Depth+1);
    }
    case X86ISD::VPERMILPS:
    case X86ISD::VPERMILPSY:
      // FIXME: Implement the other types
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    default:
      // Target shuffle nodes not handled above cannot be decoded here yet;
      // treat the requested element as unknown.
      return SDValue();
    }

    Index = ShuffleMask[Index];
    if (Index < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
/// vector shuffle operation that come from a zero (or undef) value. The
/// search can start in two different directions, from left or right.
static
unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
                                  bool ZerosFromLeft, SelectionDAG &DAG) {
  int i = 0;

  while (i < NumElems) {
    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
    if (!(Elt.getNode() &&
         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
      break;
    ++i;
  }

  return i;
}

/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
/// MaskE correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also sets OpNum to the source vector operand
/// that was used.
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
                              int OpIdx, int NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indices
    if (Idx < 0)
      continue;

    if (Idx < NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
                                               false /* check zeros from right */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //               V1 = {X, A, B, C}     0
  //                         \  \  \    /
  //   vector_shuffle V1, V2 <1, 2, 3, X>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            0,                   // Mask Start Index
            NumElems-NumZeros-1, // Mask End Index
            NumZeros,            // Where to start looking in the src vector
            NumElems,            // Number of elements in vector
            OpSrc))              // Which source operand ?
    return false;

  isLeft = false;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
4358static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4359 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4360 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 4361 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 4362 true /* check zeros from left */, DAG); 4363 unsigned OpSrc; 4364 4365 if (!NumZeros) 4366 return false; 4367 4368 // Considering the elements in the mask that are not consecutive zeros, 4369 // check if they consecutively come from only one of the source vectors. 4370 // 4371 // 0 { A, B, X, X } = V2 4372 // / \ / / 4373 // vector_shuffle V1, V2 <X, X, 4, 5> 4374 // 4375 if (!isShuffleMaskConsecutive(SVOp, 4376 NumZeros, // Mask Start Index 4377 NumElems-1, // Mask End Index 4378 0, // Where to start looking in the src vector 4379 NumElems, // Number of elements in vector 4380 OpSrc)) // Which source operand ? 4381 return false; 4382 4383 isLeft = true; 4384 ShAmt = NumZeros; 4385 ShVal = SVOp->getOperand(OpSrc); 4386 return true; 4387} 4388 4389/// isVectorShift - Returns true if the shuffle can be implemented as a 4390/// logical left or right shift of a vector. 4391static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 4392 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 4393 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 4394 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 4395 return true; 4396 4397 return false; 4398} 4399 4400/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 4401/// 4402static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 4403 unsigned NumNonZero, unsigned NumZero, 4404 SelectionDAG &DAG, 4405 const TargetLowering &TLI) { 4406 if (NumNonZero > 8) 4407 return SDValue(); 4408 4409 DebugLoc dl = Op.getDebugLoc(); 4410 SDValue V(0, 0); 4411 bool First = true; 4412 for (unsigned i = 0; i < 16; ++i) { 4413 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 4414 if (ThisIsNonZero && First) { 4415 if (NumZero) 4416 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4417 else 4418 V = DAG.getUNDEF(MVT::v8i16); 4419 First = false; 4420 } 4421 4422 if ((i & 1) != 0) { 4423 SDValue ThisElt(0, 0), LastElt(0, 0); 4424 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 4425 if (LastIsNonZero) { 4426 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 4427 MVT::i16, Op.getOperand(i-1)); 4428 } 4429 if (ThisIsNonZero) { 4430 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 4431 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 4432 ThisElt, DAG.getConstant(8, MVT::i8)); 4433 if (LastIsNonZero) 4434 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 4435 } else 4436 ThisElt = LastElt; 4437 4438 if (ThisElt.getNode()) 4439 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 4440 DAG.getIntPtrConstant(i/2)); 4441 } 4442 } 4443 4444 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); 4445} 4446 4447/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
4448/// 4449static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 4450 unsigned NumNonZero, unsigned NumZero, 4451 SelectionDAG &DAG, 4452 const TargetLowering &TLI) { 4453 if (NumNonZero > 4) 4454 return SDValue(); 4455 4456 DebugLoc dl = Op.getDebugLoc(); 4457 SDValue V(0, 0); 4458 bool First = true; 4459 for (unsigned i = 0; i < 8; ++i) { 4460 bool isNonZero = (NonZeros & (1 << i)) != 0; 4461 if (isNonZero) { 4462 if (First) { 4463 if (NumZero) 4464 V = getZeroVector(MVT::v8i16, true, DAG, dl); 4465 else 4466 V = DAG.getUNDEF(MVT::v8i16); 4467 First = false; 4468 } 4469 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 4470 MVT::v8i16, V, Op.getOperand(i), 4471 DAG.getIntPtrConstant(i)); 4472 } 4473 } 4474 4475 return V; 4476} 4477 4478/// getVShift - Return a vector logical shift node. 4479/// 4480static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 4481 unsigned NumBits, SelectionDAG &DAG, 4482 const TargetLowering &TLI, DebugLoc dl) { 4483 EVT ShVT = MVT::v2i64; 4484 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 4485 SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); 4486 return DAG.getNode(ISD::BITCAST, dl, VT, 4487 DAG.getNode(Opc, dl, ShVT, SrcOp, 4488 DAG.getConstant(NumBits, 4489 TLI.getShiftAmountTy(SrcOp.getValueType())))); 4490} 4491 4492SDValue 4493X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 4494 SelectionDAG &DAG) const { 4495 4496 // Check if the scalar load can be widened into a vector load. And if 4497 // the address is "base + cst" see if the cst can be "absorbed" into 4498 // the shuffle mask. 4499 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 4500 SDValue Ptr = LD->getBasePtr(); 4501 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4502 return SDValue(); 4503 EVT PVT = LD->getValueType(0); 4504 if (PVT != MVT::i32 && PVT != MVT::f32) 4505 return SDValue(); 4506 4507 int FI = -1; 4508 int64_t Offset = 0; 4509 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4510 FI = FINode->getIndex(); 4511 Offset = 0; 4512 } else if (DAG.isBaseWithConstantOffset(Ptr) && 4513 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4514 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4515 Offset = Ptr.getConstantOperandVal(1); 4516 Ptr = Ptr.getOperand(0); 4517 } else { 4518 return SDValue(); 4519 } 4520 4521 SDValue Chain = LD->getChain(); 4522 // Make sure the stack object alignment is at least 16. 4523 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4524 if (DAG.InferPtrAlignment(Ptr) < 16) { 4525 if (MFI->isFixedObjectIndex(FI)) { 4526 // Can't change the alignment. FIXME: It's possible to compute 4527 // the exact stack offset and reference FI + adjust offset instead. 4528 // If someone *really* cares about this. That's the way to implement it. 4529 return SDValue(); 4530 } else { 4531 MFI->setObjectAlignment(FI, 16); 4532 } 4533 } 4534 4535 // (Offset % 16) must be multiple of 4. Then address is then 4536 // Ptr + (Offset & ~15). 4537 if (Offset < 0) 4538 return SDValue(); 4539 if ((Offset % 16) & 3) 4540 return SDValue(); 4541 int64_t StartOffset = Offset & ~15; 4542 if (StartOffset) 4543 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4544 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4545 4546 int EltNo = (Offset - StartOffset) >> 2; 4547 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 4548 EVT VT = (PVT == MVT::i32) ? 
MVT::v4i32 : MVT::v4f32; 4549 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, 4550 LD->getPointerInfo().getWithOffset(StartOffset), 4551 false, false, 0); 4552 // Canonicalize it to a v4i32 shuffle. 4553 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); 4554 return DAG.getNode(ISD::BITCAST, dl, VT, 4555 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 4556 DAG.getUNDEF(MVT::v4i32),&Mask[0])); 4557 } 4558 4559 return SDValue(); 4560} 4561 4562/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4563/// vector of type 'VT', see if the elements can be replaced by a single large 4564/// load which has the same value as a build_vector whose operands are 'elts'. 4565/// 4566/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4567/// 4568/// FIXME: we'd also like to handle the case where the last elements are zero 4569/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4570/// There's even a handy isZeroNode for that purpose. 4571static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4572 DebugLoc &DL, SelectionDAG &DAG) { 4573 EVT EltVT = VT.getVectorElementType(); 4574 unsigned NumElems = Elts.size(); 4575 4576 LoadSDNode *LDBase = NULL; 4577 unsigned LastLoadedElt = -1U; 4578 4579 // For each element in the initializer, see if we've found a load or an undef. 4580 // If we don't find an initial load element, or later load elements are 4581 // non-consecutive, bail out. 4582 for (unsigned i = 0; i < NumElems; ++i) { 4583 SDValue Elt = Elts[i]; 4584 4585 if (!Elt.getNode() || 4586 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4587 return SDValue(); 4588 if (!LDBase) { 4589 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4590 return SDValue(); 4591 LDBase = cast<LoadSDNode>(Elt.getNode()); 4592 LastLoadedElt = i; 4593 continue; 4594 } 4595 if (Elt.getOpcode() == ISD::UNDEF) 4596 continue; 4597 4598 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4599 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4600 return SDValue(); 4601 LastLoadedElt = i; 4602 } 4603 4604 // If we have found an entire vector of loads and undefs, then return a large 4605 // load of the entire vector width starting at the base pointer. If we found 4606 // consecutive loads for the low half, generate a vzext_load node. 
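  // For example (a sketch of the two cases handled below, using a hypothetical
  // pointer %a): a v4i32 build_vector of loads from %a, %a+4, %a+8 and %a+12
  // becomes a single 16-byte load of %a, while
  //   (v4i32 build_vector (load %a), (load %a+4), undef, undef)
  // becomes (bitcast (v2i64 X86ISD::VZEXT_LOAD %a)), which loads only the low
  // 64 bits and zeroes the rest.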
4607 if (LastLoadedElt == NumElems - 1) { 4608 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4609 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4610 LDBase->getPointerInfo(), 4611 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4612 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4613 LDBase->getPointerInfo(), 4614 LDBase->isVolatile(), LDBase->isNonTemporal(), 4615 LDBase->getAlignment()); 4616 } else if (NumElems == 4 && LastLoadedElt == 1 && 4617 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { 4618 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4619 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4620 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4621 Ops, 2, MVT::i32, 4622 LDBase->getMemOperand()); 4623 return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); 4624 } 4625 return SDValue(); 4626} 4627 4628SDValue 4629X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4630 DebugLoc dl = Op.getDebugLoc(); 4631 4632 EVT VT = Op.getValueType(); 4633 EVT ExtVT = VT.getVectorElementType(); 4634 unsigned NumElems = Op.getNumOperands(); 4635 4636 // All zero's: 4637 // - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX) 4638 // All one's: 4639 // - pcmpeqd (SSE2 and 128 AVX), fallback to constant pools (256 AVX) 4640 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 4641 ISD::isBuildVectorAllOnes(Op.getNode())) { 4642 // Canonicalize this to <4 x i32> or <8 x 32> (SSE) to 4643 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 4644 // eliminated on x86-32 hosts. 4645 if (Op.getValueType() == MVT::v4i32 || 4646 Op.getValueType() == MVT::v8i32) 4647 return Op; 4648 4649 if (ISD::isBuildVectorAllOnes(Op.getNode())) 4650 return getOnesVector(Op.getValueType(), DAG, dl); 4651 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4652 } 4653 4654 unsigned EVTBits = ExtVT.getSizeInBits(); 4655 4656 unsigned NumZero = 0; 4657 unsigned NumNonZero = 0; 4658 unsigned NonZeros = 0; 4659 bool IsAllConstants = true; 4660 SmallSet<SDValue, 8> Values; 4661 for (unsigned i = 0; i < NumElems; ++i) { 4662 SDValue Elt = Op.getOperand(i); 4663 if (Elt.getOpcode() == ISD::UNDEF) 4664 continue; 4665 Values.insert(Elt); 4666 if (Elt.getOpcode() != ISD::Constant && 4667 Elt.getOpcode() != ISD::ConstantFP) 4668 IsAllConstants = false; 4669 if (X86::isZeroNode(Elt)) 4670 NumZero++; 4671 else { 4672 NonZeros |= (1 << i); 4673 NumNonZero++; 4674 } 4675 } 4676 4677 // All undef vector. Return an UNDEF. All zero vectors were handled above. 4678 if (NumNonZero == 0) 4679 return DAG.getUNDEF(VT); 4680 4681 // Special case for single non-zero, non-undef, element. 4682 if (NumNonZero == 1) { 4683 unsigned Idx = CountTrailingZeros_32(NonZeros); 4684 SDValue Item = Op.getOperand(Idx); 4685 4686 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4687 // the value are obviously zero, truncate the value to i32 and do the 4688 // insertion that way. Only do this if the value is non-constant or if the 4689 // value is a constant being inserted into element 0. It is cheaper to do 4690 // a constant pool load than it is to do a movd + shuffle. 4691 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4692 (!IsAllConstants || Idx == 0)) { 4693 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4694 // Handle SSE only. 
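        // For example (illustrative only): with Idx == 0, the code below turns
        //   (v2i64 build_vector (i64 zext (i32 %x)), 0)
        // into a zero-extended v4i32 SCALAR_TO_VECTOR of the truncated value,
        // bitcast back to v2i64, which matches a single movd instead of
        // building the i64 out of two 32-bit halves first.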
4695 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4696 EVT VecVT = MVT::v4i32; 4697 unsigned VecElts = 4; 4698 4699 // Truncate the value (which may itself be a constant) to i32, and 4700 // convert it to a vector with movd (S2V+shuffle to zero extend). 4701 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4702 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4703 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4704 Subtarget->hasSSE2(), DAG); 4705 4706 // Now we have our 32-bit value zero extended in the low element of 4707 // a vector. If Idx != 0, swizzle it into place. 4708 if (Idx != 0) { 4709 SmallVector<int, 4> Mask; 4710 Mask.push_back(Idx); 4711 for (unsigned i = 1; i != VecElts; ++i) 4712 Mask.push_back(i); 4713 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4714 DAG.getUNDEF(Item.getValueType()), 4715 &Mask[0]); 4716 } 4717 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); 4718 } 4719 } 4720 4721 // If we have a constant or non-constant insertion into the low element of 4722 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4723 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4724 // depending on what the source datatype is. 4725 if (Idx == 0) { 4726 if (NumZero == 0) { 4727 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4728 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4729 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4730 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4731 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 4732 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4733 DAG); 4734 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4735 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4736 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4737 EVT MiddleVT = MVT::v4i32; 4738 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4739 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4740 Subtarget->hasSSE2(), DAG); 4741 return DAG.getNode(ISD::BITCAST, dl, VT, Item); 4742 } 4743 } 4744 4745 // Is it a vector logical left shift? 4746 if (NumElems == 2 && Idx == 1 && 4747 X86::isZeroNode(Op.getOperand(0)) && 4748 !X86::isZeroNode(Op.getOperand(1))) { 4749 unsigned NumBits = VT.getSizeInBits(); 4750 return getVShift(true, VT, 4751 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4752 VT, Op.getOperand(1)), 4753 NumBits/2, DAG, *this, dl); 4754 } 4755 4756 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 4757 return SDValue(); 4758 4759 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4760 // is a non-constant being inserted into an element other than the low one, 4761 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4762 // movd/movss) to move this into the low element, then shuffle it into 4763 // place. 4764 if (EVTBits == 32) { 4765 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4766 4767 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4768 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4769 Subtarget->hasSSE2(), DAG); 4770 SmallVector<int, 8> MaskVec; 4771 for (unsigned i = 0; i < NumElems; i++) 4772 MaskVec.push_back(i == Idx ? 0 : 1); 4773 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4774 } 4775 } 4776 4777 // Splat is obviously ok. Let legalizer expand it to a shuffle. 
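  // A splat whose single defined value is itself a load may still be turned
  // into a widened vector load below (LowerAsSplatVectorLoad); that is only
  // worthwhile when this build_vector is the sole user of the loaded value,
  // since otherwise the scalar load has to stay around anyway.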
4778 if (Values.size() == 1) { 4779 if (EVTBits == 32) { 4780 // Instead of a shuffle like this: 4781 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4782 // Check if it's possible to issue this instead. 4783 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4784 unsigned Idx = CountTrailingZeros_32(NonZeros); 4785 SDValue Item = Op.getOperand(Idx); 4786 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4787 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4788 } 4789 return SDValue(); 4790 } 4791 4792 // A vector full of immediates; various special cases are already 4793 // handled, so this is best done with a single constant-pool load. 4794 if (IsAllConstants) 4795 return SDValue(); 4796 4797 // For AVX-length vectors, build the individual 128-bit pieces and use 4798 // shuffles to put them in place. 4799 if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { 4800 SmallVector<SDValue, 32> V; 4801 for (unsigned i = 0; i < NumElems; ++i) 4802 V.push_back(Op.getOperand(i)); 4803 4804 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); 4805 4806 // Build both the lower and upper subvector. 4807 SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); 4808 SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], 4809 NumElems/2); 4810 4811 // Recreate the wider vector with the lower and upper part. 4812 SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, 4813 DAG.getConstant(0, MVT::i32), DAG, dl); 4814 return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), 4815 DAG, dl); 4816 } 4817 4818 // Let legalizer expand 2-wide build_vectors. 4819 if (EVTBits == 64) { 4820 if (NumNonZero == 1) { 4821 // One half is zero or undef. 4822 unsigned Idx = CountTrailingZeros_32(NonZeros); 4823 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4824 Op.getOperand(Idx)); 4825 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4826 Subtarget->hasSSE2(), DAG); 4827 } 4828 return SDValue(); 4829 } 4830 4831 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4832 if (EVTBits == 8 && NumElems == 16) { 4833 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4834 *this); 4835 if (V.getNode()) return V; 4836 } 4837 4838 if (EVTBits == 16 && NumElems == 8) { 4839 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4840 *this); 4841 if (V.getNode()) return V; 4842 } 4843 4844 // If element VT is == 32 bits, turn it into a number of shuffles. 4845 SmallVector<SDValue, 8> V; 4846 V.resize(NumElems); 4847 if (NumElems == 4 && NumZero > 0) { 4848 for (unsigned i = 0; i < 4; ++i) { 4849 bool isZero = !(NonZeros & (1 << i)); 4850 if (isZero) 4851 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4852 else 4853 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4854 } 4855 4856 for (unsigned i = 0; i < 2; ++i) { 4857 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4858 default: break; 4859 case 0: 4860 V[i] = V[i*2]; // Must be a zero vector. 4861 break; 4862 case 1: 4863 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4864 break; 4865 case 2: 4866 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4867 break; 4868 case 3: 4869 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4870 break; 4871 } 4872 } 4873 4874 SmallVector<int, 8> MaskVec; 4875 bool Reverse = (NonZeros & 0x3) == 2; 4876 for (unsigned i = 0; i < 2; ++i) 4877 MaskVec.push_back(Reverse ? 
1-i : i); 4878 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4879 for (unsigned i = 0; i < 2; ++i) 4880 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4881 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4882 } 4883 4884 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4885 // Check for a build vector of consecutive loads. 4886 for (unsigned i = 0; i < NumElems; ++i) 4887 V[i] = Op.getOperand(i); 4888 4889 // Check for elements which are consecutive loads. 4890 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4891 if (LD.getNode()) 4892 return LD; 4893 4894 // For SSE 4.1, use insertps to put the high elements into the low element. 4895 if (getSubtarget()->hasSSE41()) { 4896 SDValue Result; 4897 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 4898 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 4899 else 4900 Result = DAG.getUNDEF(VT); 4901 4902 for (unsigned i = 1; i < NumElems; ++i) { 4903 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 4904 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 4905 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4906 } 4907 return Result; 4908 } 4909 4910 // Otherwise, expand into a number of unpckl*, start by extending each of 4911 // our (non-undef) elements to the full vector width with the element in the 4912 // bottom slot of the vector (which generates no code for SSE). 4913 for (unsigned i = 0; i < NumElems; ++i) { 4914 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4915 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4916 else 4917 V[i] = DAG.getUNDEF(VT); 4918 } 4919 4920 // Next, we iteratively mix elements, e.g. for v4f32: 4921 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4922 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4923 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4924 unsigned EltStride = NumElems >> 1; 4925 while (EltStride != 0) { 4926 for (unsigned i = 0; i < EltStride; ++i) { 4927 // If V[i+EltStride] is undef and this is the first round of mixing, 4928 // then it is safe to just drop this shuffle: V[i] is already in the 4929 // right place, the one element (since it's the first round) being 4930 // inserted as undef can be dropped. This isn't safe for successive 4931 // rounds because they will permute elements within both vectors. 4932 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 4933 EltStride == NumElems/2) 4934 continue; 4935 4936 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 4937 } 4938 EltStride >>= 1; 4939 } 4940 return V[0]; 4941 } 4942 return SDValue(); 4943} 4944 4945SDValue 4946X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4947 // We support concatenate two MMX registers and place them in a MMX 4948 // register. This is better than doing a stack convert. 
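  // For example (illustrative only), (v2i64 concat_vectors %a, %b) with two
  // 64-bit MMX inputs becomes two MOVQ2DQ transfers into XMM registers
  // followed by a <0,2> shuffle, rather than a store of both halves to the
  // stack and a 128-bit reload.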
4949 DebugLoc dl = Op.getDebugLoc(); 4950 EVT ResVT = Op.getValueType(); 4951 assert(Op.getNumOperands() == 2); 4952 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4953 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4954 int Mask[2]; 4955 SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); 4956 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4957 InVec = Op.getOperand(1); 4958 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4959 unsigned NumElts = ResVT.getVectorNumElements(); 4960 VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4961 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4962 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4963 } else { 4964 InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); 4965 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4966 Mask[0] = 0; Mask[1] = 2; 4967 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4968 } 4969 return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); 4970} 4971 4972// v8i16 shuffles - Prefer shuffles in the following order: 4973// 1. [all] pshuflw, pshufhw, optional move 4974// 2. [ssse3] 1 x pshufb 4975// 3. [ssse3] 2 x pshufb + 1 x por 4976// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4977SDValue 4978X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4979 SelectionDAG &DAG) const { 4980 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4981 SDValue V1 = SVOp->getOperand(0); 4982 SDValue V2 = SVOp->getOperand(1); 4983 DebugLoc dl = SVOp->getDebugLoc(); 4984 SmallVector<int, 8> MaskVals; 4985 4986 // Determine if more than 1 of the words in each of the low and high quadwords 4987 // of the result come from the same quadword of one of the two inputs. Undef 4988 // mask values count as coming from any quadword, for better codegen. 4989 SmallVector<unsigned, 4> LoQuad(4); 4990 SmallVector<unsigned, 4> HiQuad(4); 4991 BitVector InputQuads(4); 4992 for (unsigned i = 0; i < 8; ++i) { 4993 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4994 int EltIdx = SVOp->getMaskElt(i); 4995 MaskVals.push_back(EltIdx); 4996 if (EltIdx < 0) { 4997 ++Quad[0]; 4998 ++Quad[1]; 4999 ++Quad[2]; 5000 ++Quad[3]; 5001 continue; 5002 } 5003 ++Quad[EltIdx / 4]; 5004 InputQuads.set(EltIdx / 4); 5005 } 5006 5007 int BestLoQuad = -1; 5008 unsigned MaxQuad = 1; 5009 for (unsigned i = 0; i < 4; ++i) { 5010 if (LoQuad[i] > MaxQuad) { 5011 BestLoQuad = i; 5012 MaxQuad = LoQuad[i]; 5013 } 5014 } 5015 5016 int BestHiQuad = -1; 5017 MaxQuad = 1; 5018 for (unsigned i = 0; i < 4; ++i) { 5019 if (HiQuad[i] > MaxQuad) { 5020 BestHiQuad = i; 5021 MaxQuad = HiQuad[i]; 5022 } 5023 } 5024 5025 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 5026 // of the two input vectors, shuffle them into one input vector so only a 5027 // single pshufb instruction is necessary. If There are more than 2 input 5028 // quads, disable the next transformation since it does not help SSSE3. 5029 bool V1Used = InputQuads[0] || InputQuads[1]; 5030 bool V2Used = InputQuads[2] || InputQuads[3]; 5031 if (Subtarget->hasSSSE3()) { 5032 if (InputQuads.count() == 2 && V1Used && V2Used) { 5033 BestLoQuad = InputQuads.find_first(); 5034 BestHiQuad = InputQuads.find_next(BestLoQuad); 5035 } 5036 if (InputQuads.count() > 2) { 5037 BestLoQuad = -1; 5038 BestHiQuad = -1; 5039 } 5040 } 5041 5042 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 5043 // the shuffle mask. 
If a quad is scored as -1, that means that it contains 5044 // words from all 4 input quadwords. 5045 SDValue NewV; 5046 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 5047 SmallVector<int, 8> MaskV; 5048 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 5049 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 5050 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 5051 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), 5052 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); 5053 NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); 5054 5055 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 5056 // source words for the shuffle, to aid later transformations. 5057 bool AllWordsInNewV = true; 5058 bool InOrder[2] = { true, true }; 5059 for (unsigned i = 0; i != 8; ++i) { 5060 int idx = MaskVals[i]; 5061 if (idx != (int)i) 5062 InOrder[i/4] = false; 5063 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 5064 continue; 5065 AllWordsInNewV = false; 5066 break; 5067 } 5068 5069 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 5070 if (AllWordsInNewV) { 5071 for (int i = 0; i != 8; ++i) { 5072 int idx = MaskVals[i]; 5073 if (idx < 0) 5074 continue; 5075 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 5076 if ((idx != i) && idx < 4) 5077 pshufhw = false; 5078 if ((idx != i) && idx > 3) 5079 pshuflw = false; 5080 } 5081 V1 = NewV; 5082 V2Used = false; 5083 BestLoQuad = 0; 5084 BestHiQuad = 1; 5085 } 5086 5087 // If we've eliminated the use of V2, and the new mask is a pshuflw or 5088 // pshufhw, that's as cheap as it gets. Return the new shuffle. 5089 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 5090 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 5091 unsigned TargetMask = 0; 5092 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 5093 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 5094 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 5095 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 5096 V1 = NewV.getOperand(0); 5097 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 5098 } 5099 } 5100 5101 // If we have SSSE3, and all words of the result are from 1 input vector, 5102 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 5103 // is present, fall back to case 4. 5104 if (Subtarget->hasSSSE3()) { 5105 SmallVector<SDValue,16> pshufbMask; 5106 5107 // If we have elements from both input vectors, set the high bit of the 5108 // shuffle mask element to zero out elements that come from V2 in the V1 5109 // mask, and elements that come from V1 in the V2 mask, so that the two 5110 // results can be OR'd together. 5111 bool TwoInputs = V1Used && V2Used; 5112 for (unsigned i = 0; i != 8; ++i) { 5113 int EltIdx = MaskVals[i] * 2; 5114 if (TwoInputs && (EltIdx >= 16)) { 5115 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5116 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5117 continue; 5118 } 5119 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5120 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 5121 } 5122 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); 5123 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5124 DAG.getNode(ISD::BUILD_VECTOR, dl, 5125 MVT::v16i8, &pshufbMask[0], 16)); 5126 if (!TwoInputs) 5127 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5128 5129 // Calculate the shuffle mask for the second input, shuffle it, and 5130 // OR it with the first shuffled input. 
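    // For example (illustrative only), for MaskVals = <0,9,2,11,4,13,6,15> the
    // first pshufb mask built above is
    //   {0,1, 0x80,0x80, 4,5, 0x80,0x80, 8,9, 0x80,0x80, 12,13, 0x80,0x80}
    // and the second one built below is
    //   {0x80,0x80, 2,3, 0x80,0x80, 6,7, 0x80,0x80, 10,11, 0x80,0x80, 14,15},
    // where 0x80 zeroes a result byte, so the two shuffled inputs can simply
    // be OR'd together.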
5131 pshufbMask.clear(); 5132 for (unsigned i = 0; i != 8; ++i) { 5133 int EltIdx = MaskVals[i] * 2; 5134 if (EltIdx < 16) { 5135 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5136 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5137 continue; 5138 } 5139 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5140 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 5141 } 5142 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); 5143 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5144 DAG.getNode(ISD::BUILD_VECTOR, dl, 5145 MVT::v16i8, &pshufbMask[0], 16)); 5146 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5147 return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5148 } 5149 5150 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 5151 // and update MaskVals with new element order. 5152 BitVector InOrder(8); 5153 if (BestLoQuad >= 0) { 5154 SmallVector<int, 8> MaskV; 5155 for (int i = 0; i != 4; ++i) { 5156 int idx = MaskVals[i]; 5157 if (idx < 0) { 5158 MaskV.push_back(-1); 5159 InOrder.set(i); 5160 } else if ((idx / 4) == BestLoQuad) { 5161 MaskV.push_back(idx & 3); 5162 InOrder.set(i); 5163 } else { 5164 MaskV.push_back(-1); 5165 } 5166 } 5167 for (unsigned i = 4; i != 8; ++i) 5168 MaskV.push_back(i); 5169 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5170 &MaskV[0]); 5171 5172 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5173 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 5174 NewV.getOperand(0), 5175 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 5176 DAG); 5177 } 5178 5179 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 5180 // and update MaskVals with the new element order. 5181 if (BestHiQuad >= 0) { 5182 SmallVector<int, 8> MaskV; 5183 for (unsigned i = 0; i != 4; ++i) 5184 MaskV.push_back(i); 5185 for (unsigned i = 4; i != 8; ++i) { 5186 int idx = MaskVals[i]; 5187 if (idx < 0) { 5188 MaskV.push_back(-1); 5189 InOrder.set(i); 5190 } else if ((idx / 4) == BestHiQuad) { 5191 MaskV.push_back((idx & 3) + 4); 5192 InOrder.set(i); 5193 } else { 5194 MaskV.push_back(-1); 5195 } 5196 } 5197 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 5198 &MaskV[0]); 5199 5200 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 5201 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 5202 NewV.getOperand(0), 5203 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 5204 DAG); 5205 } 5206 5207 // In case BestHi & BestLo were both -1, which means each quadword has a word 5208 // from each of the four input quadwords, calculate the InOrder bitvector now 5209 // before falling through to the insert/extract cleanup. 5210 if (BestLoQuad == -1 && BestHiQuad == -1) { 5211 NewV = V1; 5212 for (int i = 0; i != 8; ++i) 5213 if (MaskVals[i] < 0 || MaskVals[i] == i) 5214 InOrder.set(i); 5215 } 5216 5217 // The other elements are put in the right place using pextrw and pinsrw. 5218 for (unsigned i = 0; i != 8; ++i) { 5219 if (InOrder[i]) 5220 continue; 5221 int EltIdx = MaskVals[i]; 5222 if (EltIdx < 0) 5223 continue; 5224 SDValue ExtOp = (EltIdx < 8) 5225 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 5226 DAG.getIntPtrConstant(EltIdx)) 5227 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 5228 DAG.getIntPtrConstant(EltIdx - 8)); 5229 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 5230 DAG.getIntPtrConstant(i)); 5231 } 5232 return NewV; 5233} 5234 5235// v16i8 shuffles - Prefer shuffles in the following order: 5236// 1. [ssse3] 1 x pshufb 5237// 2. [ssse3] 2 x pshufb + 1 x por 5238// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 5239static 5240SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 5241 SelectionDAG &DAG, 5242 const X86TargetLowering &TLI) { 5243 SDValue V1 = SVOp->getOperand(0); 5244 SDValue V2 = SVOp->getOperand(1); 5245 DebugLoc dl = SVOp->getDebugLoc(); 5246 SmallVector<int, 16> MaskVals; 5247 SVOp->getMask(MaskVals); 5248 5249 // If we have SSSE3, case 1 is generated when all result bytes come from 5250 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 5251 // present, fall back to case 3. 5252 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 5253 bool V1Only = true; 5254 bool V2Only = true; 5255 for (unsigned i = 0; i < 16; ++i) { 5256 int EltIdx = MaskVals[i]; 5257 if (EltIdx < 0) 5258 continue; 5259 if (EltIdx < 16) 5260 V2Only = false; 5261 else 5262 V1Only = false; 5263 } 5264 5265 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 5266 if (TLI.getSubtarget()->hasSSSE3()) { 5267 SmallVector<SDValue,16> pshufbMask; 5268 5269 // If all result elements are from one input vector, then only translate 5270 // undef mask values to 0x80 (zero out result) in the pshufb mask. 5271 // 5272 // Otherwise, we have elements from both input vectors, and must zero out 5273 // elements that come from V2 in the first mask, and V1 in the second mask 5274 // so that we can OR them together. 5275 bool TwoInputs = !(V1Only || V2Only); 5276 for (unsigned i = 0; i != 16; ++i) { 5277 int EltIdx = MaskVals[i]; 5278 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 5279 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5280 continue; 5281 } 5282 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 5283 } 5284 // If all the elements are from V2, assign it to V1 and return after 5285 // building the first pshufb. 5286 if (V2Only) 5287 V1 = V2; 5288 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 5289 DAG.getNode(ISD::BUILD_VECTOR, dl, 5290 MVT::v16i8, &pshufbMask[0], 16)); 5291 if (!TwoInputs) 5292 return V1; 5293 5294 // Calculate the shuffle mask for the second input, shuffle it, and 5295 // OR it with the first shuffled input. 5296 pshufbMask.clear(); 5297 for (unsigned i = 0; i != 16; ++i) { 5298 int EltIdx = MaskVals[i]; 5299 if (EltIdx < 16) { 5300 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 5301 continue; 5302 } 5303 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 5304 } 5305 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 5306 DAG.getNode(ISD::BUILD_VECTOR, dl, 5307 MVT::v16i8, &pshufbMask[0], 16)); 5308 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 5309 } 5310 5311 // No SSSE3 - Calculate in place words and then fix all out of place words 5312 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 5313 // the 16 different words that comprise the two doublequadword input vectors. 5314 V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); 5315 V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); 5316 SDValue NewV = V2Only ? 
V2 : V1; 5317 for (int i = 0; i != 8; ++i) { 5318 int Elt0 = MaskVals[i*2]; 5319 int Elt1 = MaskVals[i*2+1]; 5320 5321 // This word of the result is all undef, skip it. 5322 if (Elt0 < 0 && Elt1 < 0) 5323 continue; 5324 5325 // This word of the result is already in the correct place, skip it. 5326 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 5327 continue; 5328 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 5329 continue; 5330 5331 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 5332 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 5333 SDValue InsElt; 5334 5335 // If Elt0 and Elt1 are defined, are consecutive, and can be load 5336 // using a single extract together, load it and store it. 5337 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 5338 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5339 DAG.getIntPtrConstant(Elt1 / 2)); 5340 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5341 DAG.getIntPtrConstant(i)); 5342 continue; 5343 } 5344 5345 // If Elt1 is defined, extract it from the appropriate source. If the 5346 // source byte is not also odd, shift the extracted word left 8 bits 5347 // otherwise clear the bottom 8 bits if we need to do an or. 5348 if (Elt1 >= 0) { 5349 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 5350 DAG.getIntPtrConstant(Elt1 / 2)); 5351 if ((Elt1 & 1) == 0) 5352 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 5353 DAG.getConstant(8, 5354 TLI.getShiftAmountTy(InsElt.getValueType()))); 5355 else if (Elt0 >= 0) 5356 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 5357 DAG.getConstant(0xFF00, MVT::i16)); 5358 } 5359 // If Elt0 is defined, extract it from the appropriate source. If the 5360 // source byte is not also even, shift the extracted word right 8 bits. If 5361 // Elt1 was also defined, OR the extracted values together before 5362 // inserting them in the result. 5363 if (Elt0 >= 0) { 5364 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 5365 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 5366 if ((Elt0 & 1) != 0) 5367 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 5368 DAG.getConstant(8, 5369 TLI.getShiftAmountTy(InsElt0.getValueType()))); 5370 else if (Elt1 >= 0) 5371 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 5372 DAG.getConstant(0x00FF, MVT::i16)); 5373 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 5374 : InsElt0; 5375 } 5376 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 5377 DAG.getIntPtrConstant(i)); 5378 } 5379 return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); 5380} 5381 5382/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 5383/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 5384/// done when every pair / quad of shuffle mask elements point to elements in 5385/// the right sequence. e.g. 5386/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 5387static 5388SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 5389 SelectionDAG &DAG, DebugLoc dl) { 5390 EVT VT = SVOp->getValueType(0); 5391 SDValue V1 = SVOp->getOperand(0); 5392 SDValue V2 = SVOp->getOperand(1); 5393 unsigned NumElems = VT.getVectorNumElements(); 5394 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 5395 EVT NewVT; 5396 switch (VT.getSimpleVT().SimpleTy) { 5397 default: assert(false && "Unexpected!"); 5398 case MVT::v4f32: NewVT = MVT::v2f64; break; 5399 case MVT::v4i32: NewVT = MVT::v2i64; break; 5400 case MVT::v8i16: NewVT = MVT::v4i32; break; 5401 case MVT::v16i8: NewVT = MVT::v4i32; break; 5402 } 5403 5404 int Scale = NumElems / NewWidth; 5405 SmallVector<int, 8> MaskVec; 5406 for (unsigned i = 0; i < NumElems; i += Scale) { 5407 int StartIdx = -1; 5408 for (int j = 0; j < Scale; ++j) { 5409 int EltIdx = SVOp->getMaskElt(i+j); 5410 if (EltIdx < 0) 5411 continue; 5412 if (StartIdx == -1) 5413 StartIdx = EltIdx - (EltIdx % Scale); 5414 if (EltIdx != StartIdx + j) 5415 return SDValue(); 5416 } 5417 if (StartIdx == -1) 5418 MaskVec.push_back(-1); 5419 else 5420 MaskVec.push_back(StartIdx / Scale); 5421 } 5422 5423 V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); 5424 V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); 5425 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 5426} 5427 5428/// getVZextMovL - Return a zero-extending vector move low node. 5429/// 5430static SDValue getVZextMovL(EVT VT, EVT OpVT, 5431 SDValue SrcOp, SelectionDAG &DAG, 5432 const X86Subtarget *Subtarget, DebugLoc dl) { 5433 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 5434 LoadSDNode *LD = NULL; 5435 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 5436 LD = dyn_cast<LoadSDNode>(SrcOp); 5437 if (!LD) { 5438 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 5439 // instead. 5440 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 5441 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 5442 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 5443 SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && 5444 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 5445 // PR2108 5446 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 5447 return DAG.getNode(ISD::BITCAST, dl, VT, 5448 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5449 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5450 OpVT, 5451 SrcOp.getOperand(0) 5452 .getOperand(0)))); 5453 } 5454 } 5455 } 5456 5457 return DAG.getNode(ISD::BITCAST, dl, VT, 5458 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 5459 DAG.getNode(ISD::BITCAST, dl, 5460 OpVT, SrcOp))); 5461} 5462 5463/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles 5464/// which could not be matched by any known target speficic shuffle 5465static SDValue 5466LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5467 return SDValue(); 5468} 5469 5470/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with 5471/// 4 elements, and match them with several different shuffle types. 
5472static SDValue 5473LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 5474 SDValue V1 = SVOp->getOperand(0); 5475 SDValue V2 = SVOp->getOperand(1); 5476 DebugLoc dl = SVOp->getDebugLoc(); 5477 EVT VT = SVOp->getValueType(0); 5478 5479 assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); 5480 5481 SmallVector<std::pair<int, int>, 8> Locs; 5482 Locs.resize(4); 5483 SmallVector<int, 8> Mask1(4U, -1); 5484 SmallVector<int, 8> PermMask; 5485 SVOp->getMask(PermMask); 5486 5487 unsigned NumHi = 0; 5488 unsigned NumLo = 0; 5489 for (unsigned i = 0; i != 4; ++i) { 5490 int Idx = PermMask[i]; 5491 if (Idx < 0) { 5492 Locs[i] = std::make_pair(-1, -1); 5493 } else { 5494 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 5495 if (Idx < 4) { 5496 Locs[i] = std::make_pair(0, NumLo); 5497 Mask1[NumLo] = Idx; 5498 NumLo++; 5499 } else { 5500 Locs[i] = std::make_pair(1, NumHi); 5501 if (2+NumHi < 4) 5502 Mask1[2+NumHi] = Idx; 5503 NumHi++; 5504 } 5505 } 5506 } 5507 5508 if (NumLo <= 2 && NumHi <= 2) { 5509 // If no more than two elements come from either vector. This can be 5510 // implemented with two shuffles. First shuffle gather the elements. 5511 // The second shuffle, which takes the first shuffle as both of its 5512 // vector operands, put the elements into the right order. 5513 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5514 5515 SmallVector<int, 8> Mask2(4U, -1); 5516 5517 for (unsigned i = 0; i != 4; ++i) { 5518 if (Locs[i].first == -1) 5519 continue; 5520 else { 5521 unsigned Idx = (i < 2) ? 0 : 4; 5522 Idx += Locs[i].first * 2 + Locs[i].second; 5523 Mask2[i] = Idx; 5524 } 5525 } 5526 5527 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 5528 } else if (NumLo == 3 || NumHi == 3) { 5529 // Otherwise, we must have three elements from one vector, call it X, and 5530 // one element from the other, call it Y. First, use a shufps to build an 5531 // intermediate vector with the one element from Y and the element from X 5532 // that will be in the same half in the final destination (the indexes don't 5533 // matter). Then, use a shufps to build the final vector, taking the half 5534 // containing the element from Y from the intermediate, and the other half 5535 // from X. 5536 if (NumHi == 3) { 5537 // Normalize it so the 3 elements come from V1. 5538 CommuteVectorShuffleMask(PermMask, VT); 5539 std::swap(V1, V2); 5540 } 5541 5542 // Find the element from V2. 5543 unsigned HiIndex; 5544 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 5545 int Val = PermMask[HiIndex]; 5546 if (Val < 0) 5547 continue; 5548 if (Val >= 4) 5549 break; 5550 } 5551 5552 Mask1[0] = PermMask[HiIndex]; 5553 Mask1[1] = -1; 5554 Mask1[2] = PermMask[HiIndex^1]; 5555 Mask1[3] = -1; 5556 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5557 5558 if (HiIndex >= 2) { 5559 Mask1[0] = PermMask[0]; 5560 Mask1[1] = PermMask[1]; 5561 Mask1[2] = HiIndex & 1 ? 6 : 4; 5562 Mask1[3] = HiIndex & 1 ? 4 : 6; 5563 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5564 } else { 5565 Mask1[0] = HiIndex & 1 ? 2 : 0; 5566 Mask1[1] = HiIndex & 1 ? 0 : 2; 5567 Mask1[2] = PermMask[2]; 5568 Mask1[3] = PermMask[3]; 5569 if (Mask1[2] >= 0) 5570 Mask1[2] += 4; 5571 if (Mask1[3] >= 0) 5572 Mask1[3] += 4; 5573 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5574 } 5575 } 5576 5577 // Break it into (shuffle shuffle_hi, shuffle_lo). 
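  // For example (illustrative only), PermMask <1,0,3,2>, where all four
  // defined elements come from V1 so neither case above applies, is split
  // into LoMask <1,0,u,u> and HiMask <3,2,u,u>; the two partial shuffles are
  // then recombined with the final mask <0,1,4,5>.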
5578 Locs.clear(); 5579 Locs.resize(4); 5580 SmallVector<int,8> LoMask(4U, -1); 5581 SmallVector<int,8> HiMask(4U, -1); 5582 5583 SmallVector<int,8> *MaskPtr = &LoMask; 5584 unsigned MaskIdx = 0; 5585 unsigned LoIdx = 0; 5586 unsigned HiIdx = 2; 5587 for (unsigned i = 0; i != 4; ++i) { 5588 if (i == 2) { 5589 MaskPtr = &HiMask; 5590 MaskIdx = 1; 5591 LoIdx = 0; 5592 HiIdx = 2; 5593 } 5594 int Idx = PermMask[i]; 5595 if (Idx < 0) { 5596 Locs[i] = std::make_pair(-1, -1); 5597 } else if (Idx < 4) { 5598 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5599 (*MaskPtr)[LoIdx] = Idx; 5600 LoIdx++; 5601 } else { 5602 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5603 (*MaskPtr)[HiIdx] = Idx; 5604 HiIdx++; 5605 } 5606 } 5607 5608 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5609 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5610 SmallVector<int, 8> MaskOps; 5611 for (unsigned i = 0; i != 4; ++i) { 5612 if (Locs[i].first == -1) { 5613 MaskOps.push_back(-1); 5614 } else { 5615 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5616 MaskOps.push_back(Idx); 5617 } 5618 } 5619 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5620} 5621 5622static bool MayFoldVectorLoad(SDValue V) { 5623 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5624 V = V.getOperand(0); 5625 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5626 V = V.getOperand(0); 5627 if (MayFoldLoad(V)) 5628 return true; 5629 return false; 5630} 5631 5632// FIXME: the version above should always be used. Since there's 5633// a bug where several vector shuffles can't be folded because the 5634// DAG is not updated during lowering and a node claims to have two 5635// uses while it only has one, use this version, and let isel match 5636// another instruction if the load really happens to have more than 5637// one use. Remove this version after this bug get fixed. 5638// rdar://8434668, PR8156 5639static bool RelaxedMayFoldVectorLoad(SDValue V) { 5640 if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) 5641 V = V.getOperand(0); 5642 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5643 V = V.getOperand(0); 5644 if (ISD::isNormalLoad(V.getNode())) 5645 return true; 5646 return false; 5647} 5648 5649/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 5650/// a vector extract, and if both can be later optimized into a single load. 5651/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5652/// here because otherwise a target specific shuffle node is going to be 5653/// emitted for this shuffle, and the optimization not done. 5654/// FIXME: This is probably not the best approach, but fix the problem 5655/// until the right path is decided. 5656static 5657bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5658 const TargetLowering &TLI) { 5659 EVT VT = V.getValueType(); 5660 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5661 5662 // Be sure that the vector shuffle is present in a pattern like this: 5663 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5664 if (!V.hasOneUse()) 5665 return false; 5666 5667 SDNode *N = *V.getNode()->use_begin(); 5668 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5669 return false; 5670 5671 SDValue EltNo = N->getOperand(1); 5672 if (!isa<ConstantSDNode>(EltNo)) 5673 return false; 5674 5675 // If the bit convert changed the number of elements, it is unsafe 5676 // to examine the mask. 
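  // For example, extracting i64 element 0 of (v2i64 bitcast (v4i32 shuffle ...))
  // reads two of the shuffled i32 lanes at once, so no single shuffle mask
  // entry describes which original element is being extracted.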
5677  bool HasShuffleIntoBitcast = false;
5678  if (V.getOpcode() == ISD::BITCAST) {
5679    EVT SrcVT = V.getOperand(0).getValueType();
5680    if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
5681      return false;
5682    V = V.getOperand(0);
5683    HasShuffleIntoBitcast = true;
5684  }
5685
5686  // Select the input vector, guarding against an out-of-range extract index.
5687  unsigned NumElems = VT.getVectorNumElements();
5688  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
5689  int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
5690  V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
5691
5692  // Skip one more bit_convert if necessary.
5693  if (V.getOpcode() == ISD::BITCAST)
5694    V = V.getOperand(0);
5695
5696  if (ISD::isNormalLoad(V.getNode())) {
5697    // Is the original load suitable?
5698    LoadSDNode *LN0 = cast<LoadSDNode>(V);
5699
5700    // FIXME: avoid the multi-use bug that is preventing lots of
5701    // foldings from being detected. This is still wrong of course, but it
5702    // gives the temporarily desired behavior, and if it happens that
5703    // the load really does have more uses, during isel it will not fold,
5704    // and will generate poor code.
5705    if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
5706      return false;
5707
5708    if (!HasShuffleIntoBitcast)
5709      return true;
5710
5711    // If there's a bitcast before the shuffle, check if the load type and
5712    // alignment are valid.
5713    unsigned Align = LN0->getAlignment();
5714    unsigned NewAlign =
5715      TLI.getTargetData()->getABITypeAlignment(
5716        VT.getTypeForEVT(*DAG.getContext()));
5717
5718    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
5719      return false;
5720  }
5721
5722  return true;
5723}
5724
5725static
5726SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
5727  EVT VT = Op.getValueType();
5728
5729  // Canonicalize to v2f64.
5730  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
5731  return DAG.getNode(ISD::BITCAST, dl, VT,
5732                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
5733                                          V1, DAG));
5734}
5735
5736static
5737SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
5738                        bool HasSSE2) {
5739  SDValue V1 = Op.getOperand(0);
5740  SDValue V2 = Op.getOperand(1);
5741  EVT VT = Op.getValueType();
5742
5743  assert(VT != MVT::v2i64 && "unsupported shuffle type");
5744
5745  if (HasSSE2 && VT == MVT::v2f64)
5746    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
5747
5748  // v4f32 or v4i32
5749  return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
5750}
5751
5752static
5753SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
5754  SDValue V1 = Op.getOperand(0);
5755  SDValue V2 = Op.getOperand(1);
5756  EVT VT = Op.getValueType();
5757
5758  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
5759         "unsupported shuffle type");
5760
5761  if (V2.getOpcode() == ISD::UNDEF)
5762    V2 = V1;
5763
5764  // v4i32 or v4f32
5765  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
5766}
5767
5768static
5769SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
5770  SDValue V1 = Op.getOperand(0);
5771  SDValue V2 = Op.getOperand(1);
5772  EVT VT = Op.getValueType();
5773  unsigned NumElems = VT.getVectorNumElements();
5774
5775  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
5776  // operand of these instructions is only memory, so check if there's a
5777  // potential load folding here; otherwise use SHUFPS or MOVSD to match the
5778  // same masks.
5779 bool CanFoldLoad = false; 5780 5781 // Trivial case, when V2 comes from a load. 5782 if (MayFoldVectorLoad(V2)) 5783 CanFoldLoad = true; 5784 5785 // When V1 is a load, it can be folded later into a store in isel, example: 5786 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 5787 // turns into: 5788 // (MOVLPSmr addr:$src1, VR128:$src2) 5789 // So, recognize this potential and also use MOVLPS or MOVLPD 5790 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 5791 CanFoldLoad = true; 5792 5793 // Both of them can't be memory operations though. 5794 if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) 5795 CanFoldLoad = false; 5796 5797 if (CanFoldLoad) { 5798 if (HasSSE2 && NumElems == 2) 5799 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 5800 5801 if (NumElems == 4) 5802 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 5803 } 5804 5805 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5806 // movl and movlp will both match v2i64, but v2i64 is never matched by 5807 // movl earlier because we make it strict to avoid messing with the movlp load 5808 // folding logic (see the code above getMOVLP call). Match it here then, 5809 // this is horrible, but will stay like this until we move all shuffle 5810 // matching to x86 specific nodes. Note that for the 1st condition all 5811 // types are matched with movsd. 5812 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 5813 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5814 else if (HasSSE2) 5815 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5816 5817 5818 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 5819 5820 // Invert the operand order and use SHUFPS to match it. 5821 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 5822 X86::getShuffleSHUFImmediate(SVOp), DAG); 5823} 5824 5825static inline unsigned getUNPCKLOpcode(EVT VT) { 5826 switch(VT.getSimpleVT().SimpleTy) { 5827 case MVT::v4i32: return X86ISD::PUNPCKLDQ; 5828 case MVT::v2i64: return X86ISD::PUNPCKLQDQ; 5829 case MVT::v4f32: return X86ISD::UNPCKLPS; 5830 case MVT::v2f64: return X86ISD::UNPCKLPD; 5831 case MVT::v8f32: return X86ISD::VUNPCKLPSY; 5832 case MVT::v4f64: return X86ISD::VUNPCKLPDY; 5833 case MVT::v16i8: return X86ISD::PUNPCKLBW; 5834 case MVT::v8i16: return X86ISD::PUNPCKLWD; 5835 default: 5836 llvm_unreachable("Unknown type for unpckl"); 5837 } 5838 return 0; 5839} 5840 5841static inline unsigned getUNPCKHOpcode(EVT VT) { 5842 switch(VT.getSimpleVT().SimpleTy) { 5843 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 5844 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 5845 case MVT::v4f32: return X86ISD::UNPCKHPS; 5846 case MVT::v2f64: return X86ISD::UNPCKHPD; 5847 case MVT::v8f32: return X86ISD::VUNPCKHPSY; 5848 case MVT::v4f64: return X86ISD::VUNPCKHPDY; 5849 case MVT::v16i8: return X86ISD::PUNPCKHBW; 5850 case MVT::v8i16: return X86ISD::PUNPCKHWD; 5851 default: 5852 llvm_unreachable("Unknown type for unpckh"); 5853 } 5854 return 0; 5855} 5856 5857static inline unsigned getVPERMILOpcode(EVT VT) { 5858 switch(VT.getSimpleVT().SimpleTy) { 5859 case MVT::v4i32: 5860 case MVT::v4f32: return X86ISD::VPERMILPS; 5861 case MVT::v2i64: 5862 case MVT::v2f64: return X86ISD::VPERMILPD; 5863 case MVT::v8i32: 5864 case MVT::v8f32: return X86ISD::VPERMILPSY; 5865 case MVT::v4i64: 5866 case MVT::v4f64: return X86ISD::VPERMILPDY; 5867 default: 5868 llvm_unreachable("Unknown type for vpermil"); 5869 } 5870 return 0; 5871} 5872 5873static 5874SDValue 
NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
5875                       const TargetLowering &TLI,
5876                       const X86Subtarget *Subtarget) {
5877  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5878  EVT VT = Op.getValueType();
5879  DebugLoc dl = Op.getDebugLoc();
5880  SDValue V1 = Op.getOperand(0);
5881  SDValue V2 = Op.getOperand(1);
5882
5883  if (isZeroShuffle(SVOp))
5884    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
5885
5886  // Handle splat operations.
5887  if (SVOp->isSplat()) {
5888    unsigned NumElem = VT.getVectorNumElements();
5889    // Special case: this is the only place now where it's allowed to return
5890    // a vector_shuffle operation without using a target specific node, because
5891    // *hopefully* it will be optimized away by the dag combiner. FIXME: should
5892    // this be moved to DAGCombine instead?
5893    if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
5894      return Op;
5895
5896    // Since there's no native support for scalar_to_vector for 256-bit AVX, a
5897    // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
5898    // idiom and do the shuffle before the insertion; this yields fewer
5899    // instructions in the end.
5900    if (VT.is256BitVector() &&
5901        V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
5902        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
5903        V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
5904      return PromoteVectorToScalarSplat(SVOp, DAG);
5905
5906    // Handle splats by matching through known shuffle masks.
5907    if ((VT.is128BitVector() && NumElem <= 4) ||
5908        (VT.is256BitVector() && NumElem <= 8))
5909      return SDValue();
5910
5911    // i16 and i8 vector types can't be used directly by a generic shuffle
5912    // instruction because the target has no such instruction. Generate shuffles
5913    // which repeat the i16 and i8 elements until they fit in i32, and which can
5914    // then be manipulated by target supported shuffles. After the necessary
5915    // shuffles are inserted, the result is bitcast back to v4f32 or v8f32.
5916    return PromoteSplat(SVOp, DAG);
5917  }
5918
5919  // If the shuffle can be profitably rewritten as a narrower shuffle, then
5920  // do it!
5921  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
5922    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
5923    if (NewOp.getNode())
5924      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
5925  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
5926    // FIXME: Figure out a cleaner way to do this.
5927    // Try to make use of movq to zero out the top part.
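    // For example (illustrative only),
    //   (v4i32 shuffle %x, zeroinitializer, <0,1,4,5>)
    // narrows to a v2i64 shuffle with mask <0,2>, which is recognized below as
    // a (commuted) MOVL of %x and the zero vector and emitted as a
    // zero-extending move of the low 64 bits of %x, i.e. a movq.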
5928    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
5929      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
5930      if (NewOp.getNode()) {
5931        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
5932          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
5933                              DAG, Subtarget, dl);
5934      }
5935    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
5936      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
5937      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
5938        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
5939                            DAG, Subtarget, dl);
5940    }
5941  }
5942  return SDValue();
5943}
5944
5945SDValue
5946X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
5947  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5948  SDValue V1 = Op.getOperand(0);
5949  SDValue V2 = Op.getOperand(1);
5950  EVT VT = Op.getValueType();
5951  DebugLoc dl = Op.getDebugLoc();
5952  unsigned NumElems = VT.getVectorNumElements();
5953  bool isMMX = VT.getSizeInBits() == 64;
5954  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
5955  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
5956  bool V1IsSplat = false;
5957  bool V2IsSplat = false;
5958  bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
5959  bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
5960  bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
5961  MachineFunction &MF = DAG.getMachineFunction();
5962  bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
5963
5964  // Shuffle operations on MMX are not supported.
5965  if (isMMX)
5966    return Op;
5967
5968  // Vector shuffle lowering takes 3 steps:
5969  //
5970  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
5971  //    narrowing and commutation of operands should be handled.
5972  // 2) Matching of shuffles with known shuffle masks to x86 target specific
5973  //    shuffle nodes.
5974  // 3) Rewriting of unmatched masks into new generic shuffle operations,
5975  //    so the shuffle can be broken into other shuffles and the legalizer can
5976  //    try the lowering again.
5977  //
5978  // The general idea is that no vector_shuffle operation should be left to
5979  // be matched during isel; all of them must be converted to a target specific
5980  // node here.
5981
5982  // Normalize the input vectors. Here splats, zeroed vectors, profitable
5983  // narrowing and commutation of operands should be handled. The actual code
5984  // doesn't include all of those yet; work in progress...
5985  SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget);
5986  if (NewOp.getNode())
5987    return NewOp;
5988
5989  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
5990  // unpckh_undef). Only use pshufd if speed is more important than size.
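  // For example, the v4i32 unpckl_undef mask <0,0,1,1> could also be emitted
  // as pshufd $0x50, but punpckldq needs no immediate byte, so the unpck
  // forms below are preferred when optimizing for size.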
5991 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 5992 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5993 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 5994 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5995 5996 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 5997 RelaxedMayFoldVectorLoad(V1)) 5998 return getMOVDDup(Op, dl, V1, DAG); 5999 6000 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 6001 return getMOVHighToLow(Op, dl, DAG); 6002 6003 // Use to match splats 6004 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 6005 (VT == MVT::v2f64 || VT == MVT::v2i64)) 6006 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6007 6008 if (X86::isPSHUFDMask(SVOp)) { 6009 // The actual implementation will match the mask in the if above and then 6010 // during isel it can match several different instructions, not only pshufd 6011 // as its name says, sad but true, emulate the behavior for now... 6012 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 6013 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 6014 6015 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 6016 6017 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 6018 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 6019 6020 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6021 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 6022 TargetMask, DAG); 6023 6024 if (VT == MVT::v4f32) 6025 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 6026 TargetMask, DAG); 6027 } 6028 6029 // Check if this can be converted into a logical shift. 6030 bool isLeft = false; 6031 unsigned ShAmt = 0; 6032 SDValue ShVal; 6033 bool isShift = getSubtarget()->hasSSE2() && 6034 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 6035 if (isShift && ShVal.hasOneUse()) { 6036 // If the shifted value has multiple uses, it may be cheaper to use 6037 // v_set0 + movlhps or movhlps, etc. 6038 EVT EltVT = VT.getVectorElementType(); 6039 ShAmt *= EltVT.getSizeInBits(); 6040 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6041 } 6042 6043 if (X86::isMOVLMask(SVOp)) { 6044 if (V1IsUndef) 6045 return V2; 6046 if (ISD::isBuildVectorAllZeros(V1.getNode())) 6047 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 6048 if (!X86::isMOVLPMask(SVOp)) { 6049 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 6050 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 6051 6052 if (VT == MVT::v4i32 || VT == MVT::v4f32) 6053 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 6054 } 6055 } 6056 6057 // FIXME: fold these into legal mask. 6058 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 6059 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 6060 6061 if (X86::isMOVHLPSMask(SVOp)) 6062 return getMOVHighToLow(Op, dl, DAG); 6063 6064 if (X86::isMOVSHDUPMask(SVOp, Subtarget)) 6065 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 6066 6067 if (X86::isMOVSLDUPMask(SVOp, Subtarget)) 6068 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 6069 6070 if (X86::isMOVLPMask(SVOp)) 6071 return getMOVLP(Op, dl, DAG, HasSSE2); 6072 6073 if (ShouldXformToMOVHLPS(SVOp) || 6074 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 6075 return CommuteVectorShuffle(SVOp, DAG); 6076 6077 if (isShift) { 6078 // No better options. Use a vshl / vsrl. 
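    // For example (illustrative only), (v4i32 shuffle %x, zeroinitializer,
    // <4,0,1,2>) has one leading zero element followed by elements 0..2 of %x,
    // so it is lowered as a whole-register left shift by one 32-bit element
    // (ShAmt is converted to bits below).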
6079 EVT EltVT = VT.getVectorElementType(); 6080 ShAmt *= EltVT.getSizeInBits(); 6081 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 6082 } 6083 6084 bool Commuted = false; 6085 // FIXME: This should also accept a bitcast of a splat? Be careful, not 6086 // 1,1,1,1 -> v8i16 though. 6087 V1IsSplat = isSplatVector(V1.getNode()); 6088 V2IsSplat = isSplatVector(V2.getNode()); 6089 6090 // Canonicalize the splat or undef, if present, to be on the RHS. 6091 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 6092 Op = CommuteVectorShuffle(SVOp, DAG); 6093 SVOp = cast<ShuffleVectorSDNode>(Op); 6094 V1 = SVOp->getOperand(0); 6095 V2 = SVOp->getOperand(1); 6096 std::swap(V1IsSplat, V2IsSplat); 6097 std::swap(V1IsUndef, V2IsUndef); 6098 Commuted = true; 6099 } 6100 6101 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 6102 // Shuffling low element of v1 into undef, just return v1. 6103 if (V2IsUndef) 6104 return V1; 6105 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 6106 // the instruction selector will not match, so get a canonical MOVL with 6107 // swapped operands to undo the commute. 6108 return getMOVL(DAG, dl, VT, V2, V1); 6109 } 6110 6111 if (X86::isUNPCKLMask(SVOp)) 6112 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); 6113 6114 if (X86::isUNPCKHMask(SVOp)) 6115 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 6116 6117 if (V2IsSplat) { 6118 // Normalize mask so all entries that point to V2 points to its first 6119 // element then try to match unpck{h|l} again. If match, return a 6120 // new vector_shuffle with the corrected mask. 6121 SDValue NewMask = NormalizeMask(SVOp, DAG); 6122 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 6123 if (NSVOp != SVOp) { 6124 if (X86::isUNPCKLMask(NSVOp, true)) { 6125 return NewMask; 6126 } else if (X86::isUNPCKHMask(NSVOp, true)) { 6127 return NewMask; 6128 } 6129 } 6130 } 6131 6132 if (Commuted) { 6133 // Commute is back and try unpck* again. 6134 // FIXME: this seems wrong. 6135 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 6136 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 6137 6138 if (X86::isUNPCKLMask(NewSVOp)) 6139 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); 6140 6141 if (X86::isUNPCKHMask(NewSVOp)) 6142 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 6143 } 6144 6145 // Normalize the node to match x86 shuffle ops if needed 6146 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 6147 return CommuteVectorShuffle(SVOp, DAG); 6148 6149 // The checks below are all present in isShuffleMaskLegal, but they are 6150 // inlined here right now to enable us to directly emit target specific 6151 // nodes, and remove one by one until they don't return Op anymore. 
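  // A quick reminder of the immediates involved in the checks below:
  // PALIGNR concatenates its two inputs and takes a byte-shifted 16-byte
  // window, so its immediate is a byte count; PSHUFLW/PSHUFHW reshuffle only
  // the low/high four words and pass the other half through unchanged;
  // SHUFPS/SHUFPD build the low part of the result from the first operand and
  // the high part from the second, using packed 2-bit (1-bit for PD) element
  // selectors in the immediate.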
6152 SmallVector<int, 16> M; 6153 SVOp->getMask(M); 6154 6155 if (isPALIGNRMask(M, VT, HasSSSE3)) 6156 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 6157 X86::getShufflePALIGNRImmediate(SVOp), 6158 DAG); 6159 6160 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 6161 SVOp->getSplatIndex() == 0 && V2IsUndef) { 6162 if (VT == MVT::v2f64) 6163 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 6164 if (VT == MVT::v2i64) 6165 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 6166 } 6167 6168 if (isPSHUFHWMask(M, VT)) 6169 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 6170 X86::getShufflePSHUFHWImmediate(SVOp), 6171 DAG); 6172 6173 if (isPSHUFLWMask(M, VT)) 6174 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 6175 X86::getShufflePSHUFLWImmediate(SVOp), 6176 DAG); 6177 6178 if (isSHUFPMask(M, VT)) { 6179 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 6180 if (VT == MVT::v4f32 || VT == MVT::v4i32) 6181 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 6182 TargetMask, DAG); 6183 if (VT == MVT::v2f64 || VT == MVT::v2i64) 6184 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 6185 TargetMask, DAG); 6186 } 6187 6188 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 6189 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 6190 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 6191 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 6192 6193 //===--------------------------------------------------------------------===// 6194 // Generate target specific nodes for 128 or 256-bit shuffles only 6195 // supported in the AVX instruction set. 6196 // 6197 6198 // Handle VPERMILPS* permutations 6199 if (isVPERMILPSMask(M, VT, Subtarget)) 6200 return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, 6201 getShuffleVPERMILPSImmediate(SVOp), DAG); 6202 6203 // Handle VPERMILPD* permutations 6204 if (isVPERMILPDMask(M, VT, Subtarget)) 6205 return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, 6206 getShuffleVPERMILPDImmediate(SVOp), DAG); 6207 6208 //===--------------------------------------------------------------------===// 6209 // Since no target specific shuffle was selected for this generic one, 6210 // lower it into other known shuffles. FIXME: this isn't true yet, but 6211 // this is the plan. 6212 // 6213 6214 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 6215 if (VT == MVT::v8i16) { 6216 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 6217 if (NewOp.getNode()) 6218 return NewOp; 6219 } 6220 6221 if (VT == MVT::v16i8) { 6222 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 6223 if (NewOp.getNode()) 6224 return NewOp; 6225 } 6226 6227 // Handle all 128-bit wide vectors with 4 elements, and match them with 6228 // several different shuffle types. 
6229 if (NumElems == 4 && VT.getSizeInBits() == 128) 6230 return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); 6231 6232 // Handle general 256-bit shuffles 6233 if (VT.is256BitVector()) 6234 return LowerVECTOR_SHUFFLE_256(SVOp, DAG); 6235 6236 return SDValue(); 6237} 6238 6239SDValue 6240X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 6241 SelectionDAG &DAG) const { 6242 EVT VT = Op.getValueType(); 6243 DebugLoc dl = Op.getDebugLoc(); 6244 if (VT.getSizeInBits() == 8) { 6245 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 6246 Op.getOperand(0), Op.getOperand(1)); 6247 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6248 DAG.getValueType(VT)); 6249 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6250 } else if (VT.getSizeInBits() == 16) { 6251 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6252 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 6253 if (Idx == 0) 6254 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6255 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6256 DAG.getNode(ISD::BITCAST, dl, 6257 MVT::v4i32, 6258 Op.getOperand(0)), 6259 Op.getOperand(1))); 6260 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 6261 Op.getOperand(0), Op.getOperand(1)); 6262 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 6263 DAG.getValueType(VT)); 6264 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6265 } else if (VT == MVT::f32) { 6266 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 6267 // the result back to FR32 register. It's only worth matching if the 6268 // result has a single use which is a store or a bitcast to i32. And in 6269 // the case of a store, it's not worth it if the index is a constant 0, 6270 // because a MOVSSmr can be used instead, which is smaller and faster. 6271 if (!Op.hasOneUse()) 6272 return SDValue(); 6273 SDNode *User = *Op.getNode()->use_begin(); 6274 if ((User->getOpcode() != ISD::STORE || 6275 (isa<ConstantSDNode>(Op.getOperand(1)) && 6276 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 6277 (User->getOpcode() != ISD::BITCAST || 6278 User->getValueType(0) != MVT::i32)) 6279 return SDValue(); 6280 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6281 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, 6282 Op.getOperand(0)), 6283 Op.getOperand(1)); 6284 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); 6285 } else if (VT == MVT::i32) { 6286 // ExtractPS works with constant index. 6287 if (isa<ConstantSDNode>(Op.getOperand(1))) 6288 return Op; 6289 } 6290 return SDValue(); 6291} 6292 6293 6294SDValue 6295X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 6296 SelectionDAG &DAG) const { 6297 if (!isa<ConstantSDNode>(Op.getOperand(1))) 6298 return SDValue(); 6299 6300 SDValue Vec = Op.getOperand(0); 6301 EVT VecVT = Vec.getValueType(); 6302 6303 // If this is a 256-bit vector result, first extract the 128-bit 6304 // vector and then extract from the 128-bit vector. 6305 if (VecVT.getSizeInBits() > 128) { 6306 DebugLoc dl = Op.getNode()->getDebugLoc(); 6307 unsigned NumElems = VecVT.getVectorNumElements(); 6308 SDValue Idx = Op.getOperand(1); 6309 6310 if (!isa<ConstantSDNode>(Idx)) 6311 return SDValue(); 6312 6313 unsigned ExtractNumElems = NumElems / (VecVT.getSizeInBits() / 128); 6314 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); 6315 6316 // Get the 128-bit vector. 
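    // For example, extracting element 5 of a v8f32: ExtractNumElems is 4, so
    // the upper 128-bit half is extracted and the index is rebased to
    // 5 - 4 = 1 for the narrower EXTRACT_VECTOR_ELT below.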
6317 bool Upper = IdxVal >= ExtractNumElems; 6318 Vec = Extract128BitVector(Vec, Idx, DAG, dl); 6319 6320 // Extract from it. 6321 SDValue ScaledIdx = Idx; 6322 if (Upper) 6323 ScaledIdx = DAG.getNode(ISD::SUB, dl, Idx.getValueType(), Idx, 6324 DAG.getConstant(ExtractNumElems, 6325 Idx.getValueType())); 6326 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, 6327 ScaledIdx); 6328 } 6329 6330 assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); 6331 6332 if (Subtarget->hasSSE41()) { 6333 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 6334 if (Res.getNode()) 6335 return Res; 6336 } 6337 6338 EVT VT = Op.getValueType(); 6339 DebugLoc dl = Op.getDebugLoc(); 6340 // TODO: handle v16i8. 6341 if (VT.getSizeInBits() == 16) { 6342 SDValue Vec = Op.getOperand(0); 6343 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6344 if (Idx == 0) 6345 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 6346 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 6347 DAG.getNode(ISD::BITCAST, dl, 6348 MVT::v4i32, Vec), 6349 Op.getOperand(1))); 6350 // Transform it so it match pextrw which produces a 32-bit result. 6351 EVT EltVT = MVT::i32; 6352 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 6353 Op.getOperand(0), Op.getOperand(1)); 6354 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 6355 DAG.getValueType(VT)); 6356 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 6357 } else if (VT.getSizeInBits() == 32) { 6358 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6359 if (Idx == 0) 6360 return Op; 6361 6362 // SHUFPS the element to the lowest double word, then movss. 6363 int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; 6364 EVT VVT = Op.getOperand(0).getValueType(); 6365 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6366 DAG.getUNDEF(VVT), Mask); 6367 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6368 DAG.getIntPtrConstant(0)); 6369 } else if (VT.getSizeInBits() == 64) { 6370 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 6371 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 6372 // to match extract_elt for f64. 6373 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 6374 if (Idx == 0) 6375 return Op; 6376 6377 // UNPCKHPD the element to the lowest double word, then movsd. 6378 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 6379 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 
6380 int Mask[2] = { 1, -1 }; 6381 EVT VVT = Op.getOperand(0).getValueType(); 6382 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 6383 DAG.getUNDEF(VVT), Mask); 6384 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 6385 DAG.getIntPtrConstant(0)); 6386 } 6387 6388 return SDValue(); 6389} 6390 6391SDValue 6392X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 6393 SelectionDAG &DAG) const { 6394 EVT VT = Op.getValueType(); 6395 EVT EltVT = VT.getVectorElementType(); 6396 DebugLoc dl = Op.getDebugLoc(); 6397 6398 SDValue N0 = Op.getOperand(0); 6399 SDValue N1 = Op.getOperand(1); 6400 SDValue N2 = Op.getOperand(2); 6401 6402 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 6403 isa<ConstantSDNode>(N2)) { 6404 unsigned Opc; 6405 if (VT == MVT::v8i16) 6406 Opc = X86ISD::PINSRW; 6407 else if (VT == MVT::v16i8) 6408 Opc = X86ISD::PINSRB; 6409 else 6410 Opc = X86ISD::PINSRB; 6411 6412 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 6413 // argument. 6414 if (N1.getValueType() != MVT::i32) 6415 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6416 if (N2.getValueType() != MVT::i32) 6417 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6418 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 6419 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 6420 // Bits [7:6] of the constant are the source select. This will always be 6421 // zero here. The DAG Combiner may combine an extract_elt index into these 6422 // bits. For example (insert (extract, 3), 2) could be matched by putting 6423 // the '3' into bits [7:6] of X86ISD::INSERTPS. 6424 // Bits [5:4] of the constant are the destination select. This is the 6425 // value of the incoming immediate. 6426 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 6427 // combine either bitwise AND or insert of float 0.0 to set these bits. 6428 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 6429 // Create this as a scalar to vector.. 6430 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 6431 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 6432 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 6433 // PINSR* works with constant index. 6434 return Op; 6435 } 6436 return SDValue(); 6437} 6438 6439SDValue 6440X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 6441 EVT VT = Op.getValueType(); 6442 EVT EltVT = VT.getVectorElementType(); 6443 6444 DebugLoc dl = Op.getDebugLoc(); 6445 SDValue N0 = Op.getOperand(0); 6446 SDValue N1 = Op.getOperand(1); 6447 SDValue N2 = Op.getOperand(2); 6448 6449 // If this is a 256-bit vector result, first insert into a 128-bit 6450 // vector and then insert into the 256-bit vector. 6451 if (VT.getSizeInBits() > 128) { 6452 if (!isa<ConstantSDNode>(N2)) 6453 return SDValue(); 6454 6455 // Get the 128-bit vector. 6456 unsigned NumElems = VT.getVectorNumElements(); 6457 unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); 6458 bool Upper = IdxVal >= NumElems / 2; 6459 6460 SDValue SubN0 = Extract128BitVector(N0, N2, DAG, dl); 6461 6462 // Insert into it. 6463 SDValue ScaledN2 = N2; 6464 if (Upper) 6465 ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2, 6466 DAG.getConstant(NumElems / 6467 (VT.getSizeInBits() / 128), 6468 N2.getValueType())); 6469 Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0, 6470 N1, ScaledN2); 6471 6472 // Insert the 128-bit vector 6473 // FIXME: Why UNDEF? 
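    // For example, inserting into element 9 of a v16i16: the upper 128-bit
    // half is extracted, the element is inserted at the rebased index
    // 9 - 8 = 1, and Insert128BitVector below writes the updated half back
    // into place (it re-normalizes N2 to select the correct half).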
6474 return Insert128BitVector(N0, Op, N2, DAG, dl); 6475 } 6476 6477 if (Subtarget->hasSSE41()) 6478 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 6479 6480 if (EltVT == MVT::i8) 6481 return SDValue(); 6482 6483 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 6484 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 6485 // as its second argument. 6486 if (N1.getValueType() != MVT::i32) 6487 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 6488 if (N2.getValueType() != MVT::i32) 6489 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 6490 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 6491 } 6492 return SDValue(); 6493} 6494 6495SDValue 6496X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 6497 LLVMContext *Context = DAG.getContext(); 6498 DebugLoc dl = Op.getDebugLoc(); 6499 EVT OpVT = Op.getValueType(); 6500 6501 // If this is a 256-bit vector result, first insert into a 128-bit 6502 // vector and then insert into the 256-bit vector. 6503 if (OpVT.getSizeInBits() > 128) { 6504 // Insert into a 128-bit vector. 6505 EVT VT128 = EVT::getVectorVT(*Context, 6506 OpVT.getVectorElementType(), 6507 OpVT.getVectorNumElements() / 2); 6508 6509 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); 6510 6511 // Insert the 128-bit vector. 6512 return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, 6513 DAG.getConstant(0, MVT::i32), 6514 DAG, dl); 6515 } 6516 6517 if (Op.getValueType() == MVT::v1i64 && 6518 Op.getOperand(0).getValueType() == MVT::i64) 6519 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 6520 6521 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 6522 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 6523 "Expected an SSE type!"); 6524 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), 6525 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 6526} 6527 6528// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in 6529// a simple subregister reference or explicit instructions to grab 6530// upper bits of a vector. 6531SDValue 6532X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6533 if (Subtarget->hasAVX()) { 6534 DebugLoc dl = Op.getNode()->getDebugLoc(); 6535 SDValue Vec = Op.getNode()->getOperand(0); 6536 SDValue Idx = Op.getNode()->getOperand(1); 6537 6538 if (Op.getNode()->getValueType(0).getSizeInBits() == 128 6539 && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { 6540 return Extract128BitVector(Vec, Idx, DAG, dl); 6541 } 6542 } 6543 return SDValue(); 6544} 6545 6546// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a 6547// simple superregister reference or explicit instructions to insert 6548// the upper bits of a vector. 
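// For a 256-bit destination the index effectively selects a half: inserting
// 128 bits at element 0 is just a subregister write, while inserting at the
// first element of the upper half corresponds to VINSERTF128 with immediate 1.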
6549SDValue 6550X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { 6551 if (Subtarget->hasAVX()) { 6552 DebugLoc dl = Op.getNode()->getDebugLoc(); 6553 SDValue Vec = Op.getNode()->getOperand(0); 6554 SDValue SubVec = Op.getNode()->getOperand(1); 6555 SDValue Idx = Op.getNode()->getOperand(2); 6556 6557 if (Op.getNode()->getValueType(0).getSizeInBits() == 256 6558 && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { 6559 return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); 6560 } 6561 } 6562 return SDValue(); 6563} 6564 6565// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 6566// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 6567// one of the above mentioned nodes. It has to be wrapped because otherwise 6568// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 6569// be used to form addressing mode. These wrapped nodes will be selected 6570// into MOV32ri. 6571SDValue 6572X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 6573 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 6574 6575 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6576 // global base reg. 6577 unsigned char OpFlag = 0; 6578 unsigned WrapperKind = X86ISD::Wrapper; 6579 CodeModel::Model M = getTargetMachine().getCodeModel(); 6580 6581 if (Subtarget->isPICStyleRIPRel() && 6582 (M == CodeModel::Small || M == CodeModel::Kernel)) 6583 WrapperKind = X86ISD::WrapperRIP; 6584 else if (Subtarget->isPICStyleGOT()) 6585 OpFlag = X86II::MO_GOTOFF; 6586 else if (Subtarget->isPICStyleStubPIC()) 6587 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6588 6589 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 6590 CP->getAlignment(), 6591 CP->getOffset(), OpFlag); 6592 DebugLoc DL = CP->getDebugLoc(); 6593 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6594 // With PIC, the address is actually $g + Offset. 6595 if (OpFlag) { 6596 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6597 DAG.getNode(X86ISD::GlobalBaseReg, 6598 DebugLoc(), getPointerTy()), 6599 Result); 6600 } 6601 6602 return Result; 6603} 6604 6605SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 6606 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 6607 6608 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6609 // global base reg. 6610 unsigned char OpFlag = 0; 6611 unsigned WrapperKind = X86ISD::Wrapper; 6612 CodeModel::Model M = getTargetMachine().getCodeModel(); 6613 6614 if (Subtarget->isPICStyleRIPRel() && 6615 (M == CodeModel::Small || M == CodeModel::Kernel)) 6616 WrapperKind = X86ISD::WrapperRIP; 6617 else if (Subtarget->isPICStyleGOT()) 6618 OpFlag = X86II::MO_GOTOFF; 6619 else if (Subtarget->isPICStyleStubPIC()) 6620 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6621 6622 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 6623 OpFlag); 6624 DebugLoc DL = JT->getDebugLoc(); 6625 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6626 6627 // With PIC, the address is actually $g + Offset. 
6628 if (OpFlag) 6629 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6630 DAG.getNode(X86ISD::GlobalBaseReg, 6631 DebugLoc(), getPointerTy()), 6632 Result); 6633 6634 return Result; 6635} 6636 6637SDValue 6638X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 6639 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 6640 6641 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6642 // global base reg. 6643 unsigned char OpFlag = 0; 6644 unsigned WrapperKind = X86ISD::Wrapper; 6645 CodeModel::Model M = getTargetMachine().getCodeModel(); 6646 6647 if (Subtarget->isPICStyleRIPRel() && 6648 (M == CodeModel::Small || M == CodeModel::Kernel)) 6649 WrapperKind = X86ISD::WrapperRIP; 6650 else if (Subtarget->isPICStyleGOT()) 6651 OpFlag = X86II::MO_GOTOFF; 6652 else if (Subtarget->isPICStyleStubPIC()) 6653 OpFlag = X86II::MO_PIC_BASE_OFFSET; 6654 6655 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 6656 6657 DebugLoc DL = Op.getDebugLoc(); 6658 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6659 6660 6661 // With PIC, the address is actually $g + Offset. 6662 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 6663 !Subtarget->is64Bit()) { 6664 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6665 DAG.getNode(X86ISD::GlobalBaseReg, 6666 DebugLoc(), getPointerTy()), 6667 Result); 6668 } 6669 6670 return Result; 6671} 6672 6673SDValue 6674X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 6675 // Create the TargetBlockAddressAddress node. 6676 unsigned char OpFlags = 6677 Subtarget->ClassifyBlockAddressReference(); 6678 CodeModel::Model M = getTargetMachine().getCodeModel(); 6679 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 6680 DebugLoc dl = Op.getDebugLoc(); 6681 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 6682 /*isTarget=*/true, OpFlags); 6683 6684 if (Subtarget->isPICStyleRIPRel() && 6685 (M == CodeModel::Small || M == CodeModel::Kernel)) 6686 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6687 else 6688 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6689 6690 // With PIC, the address is actually $g + Offset. 6691 if (isGlobalRelativeToPICBase(OpFlags)) { 6692 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6693 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6694 Result); 6695 } 6696 6697 return Result; 6698} 6699 6700SDValue 6701X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 6702 int64_t Offset, 6703 SelectionDAG &DAG) const { 6704 // Create the TargetGlobalAddress node, folding in the constant 6705 // offset if it is legal. 6706 unsigned char OpFlags = 6707 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 6708 CodeModel::Model M = getTargetMachine().getCodeModel(); 6709 SDValue Result; 6710 if (OpFlags == X86II::MO_NO_FLAG && 6711 X86::isOffsetSuitableForCodeModel(Offset, M)) { 6712 // A direct static reference to a global. 
6713 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 6714 Offset = 0; 6715 } else { 6716 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 6717 } 6718 6719 if (Subtarget->isPICStyleRIPRel() && 6720 (M == CodeModel::Small || M == CodeModel::Kernel)) 6721 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6722 else 6723 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6724 6725 // With PIC, the address is actually $g + Offset. 6726 if (isGlobalRelativeToPICBase(OpFlags)) { 6727 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6728 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6729 Result); 6730 } 6731 6732 // For globals that require a load from a stub to get the address, emit the 6733 // load. 6734 if (isGlobalStubReference(OpFlags)) 6735 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 6736 MachinePointerInfo::getGOT(), false, false, 0); 6737 6738 // If there was a non-zero offset that we didn't fold, create an explicit 6739 // addition for it. 6740 if (Offset != 0) 6741 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 6742 DAG.getConstant(Offset, getPointerTy())); 6743 6744 return Result; 6745} 6746 6747SDValue 6748X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 6749 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6750 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 6751 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 6752} 6753 6754static SDValue 6755GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 6756 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 6757 unsigned char OperandFlags) { 6758 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6759 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6760 DebugLoc dl = GA->getDebugLoc(); 6761 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6762 GA->getValueType(0), 6763 GA->getOffset(), 6764 OperandFlags); 6765 if (InFlag) { 6766 SDValue Ops[] = { Chain, TGA, *InFlag }; 6767 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 6768 } else { 6769 SDValue Ops[] = { Chain, TGA }; 6770 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 6771 } 6772 6773 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 6774 MFI->setAdjustsStack(true); 6775 6776 SDValue Flag = Chain.getValue(1); 6777 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 6778} 6779 6780// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 6781static SDValue 6782LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6783 const EVT PtrVT) { 6784 SDValue InFlag; 6785 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 6786 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 6787 DAG.getNode(X86ISD::GlobalBaseReg, 6788 DebugLoc(), PtrVT), InFlag); 6789 InFlag = Chain.getValue(1); 6790 6791 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 6792} 6793 6794// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 6795static SDValue 6796LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6797 const EVT PtrVT) { 6798 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 6799 X86::RAX, X86II::MO_TLSGD); 6800} 6801 6802// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 6803// "local exec" model. 
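// In the local exec model the variable's offset from the thread pointer
// (%gs:0 on x86-32, %fs:0 on x86-64) is a link-time constant that is folded
// straight into the address (@ntpoff / @tpoff). In the initial exec model the
// offset is only fixed at load time, so it is loaded from a GOT entry
// (@indntpoff / @gottpoff) and then added to the thread pointer.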
6804static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6805 const EVT PtrVT, TLSModel::Model model, 6806 bool is64Bit) { 6807 DebugLoc dl = GA->getDebugLoc(); 6808 6809 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 6810 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 6811 is64Bit ? 257 : 256)); 6812 6813 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 6814 DAG.getIntPtrConstant(0), 6815 MachinePointerInfo(Ptr), false, false, 0); 6816 6817 unsigned char OperandFlags = 0; 6818 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 6819 // initialexec. 6820 unsigned WrapperKind = X86ISD::Wrapper; 6821 if (model == TLSModel::LocalExec) { 6822 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 6823 } else if (is64Bit) { 6824 assert(model == TLSModel::InitialExec); 6825 OperandFlags = X86II::MO_GOTTPOFF; 6826 WrapperKind = X86ISD::WrapperRIP; 6827 } else { 6828 assert(model == TLSModel::InitialExec); 6829 OperandFlags = X86II::MO_INDNTPOFF; 6830 } 6831 6832 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 6833 // exec) 6834 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6835 GA->getValueType(0), 6836 GA->getOffset(), OperandFlags); 6837 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 6838 6839 if (model == TLSModel::InitialExec) 6840 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 6841 MachinePointerInfo::getGOT(), false, false, 0); 6842 6843 // The address of the thread local variable is the add of the thread 6844 // pointer with the offset of the variable. 6845 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 6846} 6847 6848SDValue 6849X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 6850 6851 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6852 const GlobalValue *GV = GA->getGlobal(); 6853 6854 if (Subtarget->isTargetELF()) { 6855 // TODO: implement the "local dynamic" model 6856 // TODO: implement the "initial exec"model for pic executables 6857 6858 // If GV is an alias then use the aliasee for determining 6859 // thread-localness. 6860 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 6861 GV = GA->resolveAliasedGlobal(false); 6862 6863 TLSModel::Model model 6864 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 6865 6866 switch (model) { 6867 case TLSModel::GeneralDynamic: 6868 case TLSModel::LocalDynamic: // not implemented 6869 if (Subtarget->is64Bit()) 6870 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 6871 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 6872 6873 case TLSModel::InitialExec: 6874 case TLSModel::LocalExec: 6875 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 6876 Subtarget->is64Bit()); 6877 } 6878 } else if (Subtarget->isTargetDarwin()) { 6879 // Darwin only has one model of TLS. Lower to that. 6880 unsigned char OpFlag = 0; 6881 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 6882 X86ISD::WrapperRIP : X86ISD::Wrapper; 6883 6884 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6885 // global base reg. 
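    // Roughly: the TLVP reference resolves to a thread-local variable
    // descriptor whose first field is an accessor function; the X86ISD::TLSCALL
    // emitted below calls through that slot and the variable's address comes
    // back in the normal return register, which is copied out at the end.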
6886 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 6887 !Subtarget->is64Bit(); 6888 if (PIC32) 6889 OpFlag = X86II::MO_TLVP_PIC_BASE; 6890 else 6891 OpFlag = X86II::MO_TLVP; 6892 DebugLoc DL = Op.getDebugLoc(); 6893 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 6894 GA->getValueType(0), 6895 GA->getOffset(), OpFlag); 6896 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6897 6898 // With PIC32, the address is actually $g + Offset. 6899 if (PIC32) 6900 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6901 DAG.getNode(X86ISD::GlobalBaseReg, 6902 DebugLoc(), getPointerTy()), 6903 Offset); 6904 6905 // Lowering the machine isd will make sure everything is in the right 6906 // location. 6907 SDValue Chain = DAG.getEntryNode(); 6908 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 6909 SDValue Args[] = { Chain, Offset }; 6910 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); 6911 6912 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 6913 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6914 MFI->setAdjustsStack(true); 6915 6916 // And our return value (tls address) is in the standard call return value 6917 // location. 6918 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 6919 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 6920 } 6921 6922 assert(false && 6923 "TLS not implemented for this target."); 6924 6925 llvm_unreachable("Unreachable"); 6926 return SDValue(); 6927} 6928 6929 6930/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and 6931/// take a 2 x i32 value to shift plus a shift amount. 6932SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { 6933 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6934 EVT VT = Op.getValueType(); 6935 unsigned VTBits = VT.getSizeInBits(); 6936 DebugLoc dl = Op.getDebugLoc(); 6937 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 6938 SDValue ShOpLo = Op.getOperand(0); 6939 SDValue ShOpHi = Op.getOperand(1); 6940 SDValue ShAmt = Op.getOperand(2); 6941 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 6942 DAG.getConstant(VTBits - 1, MVT::i8)) 6943 : DAG.getConstant(0, VT); 6944 6945 SDValue Tmp2, Tmp3; 6946 if (Op.getOpcode() == ISD::SHL_PARTS) { 6947 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 6948 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6949 } else { 6950 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 6951 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 6952 } 6953 6954 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 6955 DAG.getConstant(VTBits, MVT::i8)); 6956 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6957 AndNode, DAG.getConstant(0, MVT::i8)); 6958 6959 SDValue Hi, Lo; 6960 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6961 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 6962 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 6963 6964 if (Op.getOpcode() == ISD::SHL_PARTS) { 6965 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6966 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6967 } else { 6968 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6969 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6970 } 6971 6972 SDValue Ops[2] = { Lo, Hi }; 6973 return DAG.getMergeValues(Ops, 2, dl); 6974} 6975 6976SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 6977 SelectionDAG &DAG) const { 6978 EVT SrcVT = Op.getOperand(0).getValueType(); 6979 6980 if (SrcVT.isVector()) 6981 return SDValue(); 6982 6983 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 6984 "Unknown SINT_TO_FP to lower!"); 6985 6986 // These are really Legal; return the operand so the caller accepts it as 6987 // Legal. 6988 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 6989 return Op; 6990 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 6991 Subtarget->is64Bit()) { 6992 return Op; 6993 } 6994 6995 DebugLoc dl = Op.getDebugLoc(); 6996 unsigned Size = SrcVT.getSizeInBits()/8; 6997 MachineFunction &MF = DAG.getMachineFunction(); 6998 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 6999 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7000 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7001 StackSlot, 7002 MachinePointerInfo::getFixedStack(SSFI), 7003 false, false, 0); 7004 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 7005} 7006 7007SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 7008 SDValue StackSlot, 7009 SelectionDAG &DAG) const { 7010 // Build the FILD 7011 DebugLoc DL = Op.getDebugLoc(); 7012 SDVTList Tys; 7013 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 7014 if (useSSE) 7015 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); 7016 else 7017 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 7018 7019 unsigned ByteSize = SrcVT.getSizeInBits()/8; 7020 7021 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); 7022 MachineMemOperand *MMO; 7023 if (FI) { 7024 int SSFI = FI->getIndex(); 7025 MMO = 7026 DAG.getMachineFunction() 7027 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7028 MachineMemOperand::MOLoad, ByteSize, ByteSize); 7029 } else { 7030 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); 7031 StackSlot = StackSlot.getOperand(1); 7032 } 7033 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 7034 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 7035 X86ISD::FILD, DL, 7036 Tys, Ops, array_lengthof(Ops), 7037 SrcVT, MMO); 7038 7039 if (useSSE) { 7040 Chain = Result.getValue(1); 7041 SDValue InFlag = Result.getValue(2); 7042 7043 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 7044 // shouldn't be necessary except that RFP cannot be live across 7045 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
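    // That is: FILD leaves its result on the x87 stack and there is no direct
    // x87 <-> XMM register move, so to hand the value to SSE code it is stored
    // out with FST below and immediately reloaded from the same stack slot.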
7046 MachineFunction &MF = DAG.getMachineFunction(); 7047 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 7048 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 7049 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7050 Tys = DAG.getVTList(MVT::Other); 7051 SDValue Ops[] = { 7052 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 7053 }; 7054 MachineMemOperand *MMO = 7055 DAG.getMachineFunction() 7056 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7057 MachineMemOperand::MOStore, SSFISize, SSFISize); 7058 7059 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 7060 Ops, array_lengthof(Ops), 7061 Op.getValueType(), MMO); 7062 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 7063 MachinePointerInfo::getFixedStack(SSFI), 7064 false, false, 0); 7065 } 7066 7067 return Result; 7068} 7069 7070// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 7071SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 7072 SelectionDAG &DAG) const { 7073 // This algorithm is not obvious. Here it is in C code, more or less: 7074 /* 7075 double uint64_to_double( uint32_t hi, uint32_t lo ) { 7076 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 7077 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 7078 7079 // Copy ints to xmm registers. 7080 __m128i xh = _mm_cvtsi32_si128( hi ); 7081 __m128i xl = _mm_cvtsi32_si128( lo ); 7082 7083 // Combine into low half of a single xmm register. 7084 __m128i x = _mm_unpacklo_epi32( xh, xl ); 7085 __m128d d; 7086 double sd; 7087 7088 // Merge in appropriate exponents to give the integer bits the right 7089 // magnitude. 7090 x = _mm_unpacklo_epi32( x, exp ); 7091 7092 // Subtract away the biases to deal with the IEEE-754 double precision 7093 // implicit 1. 7094 d = _mm_sub_pd( (__m128d) x, bias ); 7095 7096 // All conversions up to here are exact. The correctly rounded result is 7097 // calculated using the current rounding mode using the following 7098 // horizontal add. 7099 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 7100 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 7101 // store doesn't really need to be here (except 7102 // maybe to zero the other double) 7103 return sd; 7104 } 7105 */ 7106 7107 DebugLoc dl = Op.getDebugLoc(); 7108 LLVMContext *Context = DAG.getContext(); 7109 7110 // Build some magic constants. 
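  // CV0 supplies the exponent words: 0x43300000 is the high half of 2^52 and
  // 0x45300000 the high half of 2^84. Unpacking them above the two 32-bit
  // halves of the input yields the doubles 2^84 + hi*2^32 and 2^52 + lo, and
  // CV1 is the bias pair { 2^84, 2^52 } that the FSUB below removes again
  // (CV0 plays the role of 'exp' and CV1 of 'bias' in the C sketch above).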
7111 std::vector<Constant*> CV0; 7112 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 7113 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 7114 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7115 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 7116 Constant *C0 = ConstantVector::get(CV0); 7117 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 7118 7119 std::vector<Constant*> CV1; 7120 CV1.push_back( 7121 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 7122 CV1.push_back( 7123 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 7124 Constant *C1 = ConstantVector::get(CV1); 7125 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 7126 7127 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7128 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7129 Op.getOperand(0), 7130 DAG.getIntPtrConstant(1))); 7131 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7132 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7133 Op.getOperand(0), 7134 DAG.getIntPtrConstant(0))); 7135 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 7136 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 7137 MachinePointerInfo::getConstantPool(), 7138 false, false, 16); 7139 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 7140 SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); 7141 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 7142 MachinePointerInfo::getConstantPool(), 7143 false, false, 16); 7144 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 7145 7146 // Add the halves; easiest way is to swap them into another reg first. 7147 int ShufMask[2] = { 1, -1 }; 7148 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 7149 DAG.getUNDEF(MVT::v2f64), ShufMask); 7150 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 7151 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 7152 DAG.getIntPtrConstant(0)); 7153} 7154 7155// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 7156SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 7157 SelectionDAG &DAG) const { 7158 DebugLoc dl = Op.getDebugLoc(); 7159 // FP constant to bias correct the final result. 7160 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 7161 MVT::f64); 7162 7163 // Load the 32-bit value into an XMM register. 7164 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 7165 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7166 Op.getOperand(0), 7167 DAG.getIntPtrConstant(0))); 7168 7169 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7170 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), 7171 DAG.getIntPtrConstant(0)); 7172 7173 // Or the load with the bias. 7174 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 7175 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7176 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7177 MVT::v2f64, Load)), 7178 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7179 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 7180 MVT::v2f64, Bias))); 7181 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 7182 DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), 7183 DAG.getIntPtrConstant(0)); 7184 7185 // Subtract the bias. 7186 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 7187 7188 // Handle final rounding. 
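  // The bias trick above already produced the exact f64 value of the 32-bit
  // input (any u32 fits in the 52-bit mantissa), so the only possibly inexact
  // step is the FP_ROUND below when the destination type is f32.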
7189 EVT DestVT = Op.getValueType(); 7190 7191 if (DestVT.bitsLT(MVT::f64)) { 7192 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 7193 DAG.getIntPtrConstant(0)); 7194 } else if (DestVT.bitsGT(MVT::f64)) { 7195 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 7196 } 7197 7198 // Handle final rounding. 7199 return Sub; 7200} 7201 7202SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 7203 SelectionDAG &DAG) const { 7204 SDValue N0 = Op.getOperand(0); 7205 DebugLoc dl = Op.getDebugLoc(); 7206 7207 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 7208 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 7209 // the optimization here. 7210 if (DAG.SignBitIsZero(N0)) 7211 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 7212 7213 EVT SrcVT = N0.getValueType(); 7214 EVT DstVT = Op.getValueType(); 7215 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 7216 return LowerUINT_TO_FP_i64(Op, DAG); 7217 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 7218 return LowerUINT_TO_FP_i32(Op, DAG); 7219 7220 // Make a 64-bit buffer, and use it to build an FILD. 7221 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 7222 if (SrcVT == MVT::i32) { 7223 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 7224 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 7225 getPointerTy(), StackSlot, WordOff); 7226 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7227 StackSlot, MachinePointerInfo(), 7228 false, false, 0); 7229 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 7230 OffsetSlot, MachinePointerInfo(), 7231 false, false, 0); 7232 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 7233 return Fild; 7234 } 7235 7236 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 7237 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 7238 StackSlot, MachinePointerInfo(), 7239 false, false, 0); 7240 // For i64 source, we need to add the appropriate power of 2 if the input 7241 // was negative. This is the same as the optimization in 7242 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 7243 // we must be careful to do the computation in x87 extended precision, not 7244 // in SSE. (The generic code can't know it's OK to do this, or how to.) 7245 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 7246 MachineMemOperand *MMO = 7247 DAG.getMachineFunction() 7248 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7249 MachineMemOperand::MOLoad, 8, 8); 7250 7251 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 7252 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 7253 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 7254 MVT::i64, MMO); 7255 7256 APInt FF(32, 0x5F800000ULL); 7257 7258 // Check whether the sign bit is set. 7259 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 7260 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 7261 ISD::SETLT); 7262 7263 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 7264 SDValue FudgePtr = DAG.getConstantPool( 7265 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 7266 getPointerTy()); 7267 7268 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 
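  // FF is 0x5F800000, the IEEE single-precision encoding of 2^64. FILD above
  // read the buffer as a *signed* i64, so if the original unsigned value had
  // its top bit set the result is exactly 2^64 too small; the select below
  // picks either this fudge factor or 0.0f to add back.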
7269 SDValue Zero = DAG.getIntPtrConstant(0); 7270 SDValue Four = DAG.getIntPtrConstant(4); 7271 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 7272 Zero, Four); 7273 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 7274 7275 // Load the value out, extending it from f32 to f80. 7276 // FIXME: Avoid the extend by constructing the right constant pool? 7277 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 7278 FudgePtr, MachinePointerInfo::getConstantPool(), 7279 MVT::f32, false, false, 4); 7280 // Extend everything to 80 bits to force it to be done on x87. 7281 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 7282 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 7283} 7284 7285std::pair<SDValue,SDValue> X86TargetLowering:: 7286FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 7287 DebugLoc DL = Op.getDebugLoc(); 7288 7289 EVT DstTy = Op.getValueType(); 7290 7291 if (!IsSigned) { 7292 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 7293 DstTy = MVT::i64; 7294 } 7295 7296 assert(DstTy.getSimpleVT() <= MVT::i64 && 7297 DstTy.getSimpleVT() >= MVT::i16 && 7298 "Unknown FP_TO_SINT to lower!"); 7299 7300 // These are really Legal. 7301 if (DstTy == MVT::i32 && 7302 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7303 return std::make_pair(SDValue(), SDValue()); 7304 if (Subtarget->is64Bit() && 7305 DstTy == MVT::i64 && 7306 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 7307 return std::make_pair(SDValue(), SDValue()); 7308 7309 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 7310 // stack slot. 7311 MachineFunction &MF = DAG.getMachineFunction(); 7312 unsigned MemSize = DstTy.getSizeInBits()/8; 7313 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7314 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7315 7316 7317 7318 unsigned Opc; 7319 switch (DstTy.getSimpleVT().SimpleTy) { 7320 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 7321 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 7322 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 7323 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 7324 } 7325 7326 SDValue Chain = DAG.getEntryNode(); 7327 SDValue Value = Op.getOperand(0); 7328 EVT TheVT = Op.getOperand(0).getValueType(); 7329 if (isScalarFPTypeInSSEReg(TheVT)) { 7330 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 7331 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 7332 MachinePointerInfo::getFixedStack(SSFI), 7333 false, false, 0); 7334 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 7335 SDValue Ops[] = { 7336 Chain, StackSlot, DAG.getValueType(TheVT) 7337 }; 7338 7339 MachineMemOperand *MMO = 7340 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7341 MachineMemOperand::MOLoad, MemSize, MemSize); 7342 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 7343 DstTy, MMO); 7344 Chain = Value.getValue(1); 7345 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 7346 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7347 } 7348 7349 MachineMemOperand *MMO = 7350 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 7351 MachineMemOperand::MOStore, MemSize, MemSize); 7352 7353 // Build the FP_TO_INT*_IN_MEM 7354 SDValue Ops[] = { Chain, Value, StackSlot }; 7355 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, 
DAG.getVTList(MVT::Other), 7356 Ops, 3, DstTy, MMO); 7357 7358 return std::make_pair(FIST, StackSlot); 7359} 7360 7361SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 7362 SelectionDAG &DAG) const { 7363 if (Op.getValueType().isVector()) 7364 return SDValue(); 7365 7366 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 7367 SDValue FIST = Vals.first, StackSlot = Vals.second; 7368 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 7369 if (FIST.getNode() == 0) return Op; 7370 7371 // Load the result. 7372 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7373 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7374} 7375 7376SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 7377 SelectionDAG &DAG) const { 7378 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 7379 SDValue FIST = Vals.first, StackSlot = Vals.second; 7380 assert(FIST.getNode() && "Unexpected failure"); 7381 7382 // Load the result. 7383 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 7384 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 7385} 7386 7387SDValue X86TargetLowering::LowerFABS(SDValue Op, 7388 SelectionDAG &DAG) const { 7389 LLVMContext *Context = DAG.getContext(); 7390 DebugLoc dl = Op.getDebugLoc(); 7391 EVT VT = Op.getValueType(); 7392 EVT EltVT = VT; 7393 if (VT.isVector()) 7394 EltVT = VT.getVectorElementType(); 7395 std::vector<Constant*> CV; 7396 if (EltVT == MVT::f64) { 7397 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 7398 CV.push_back(C); 7399 CV.push_back(C); 7400 } else { 7401 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 7402 CV.push_back(C); 7403 CV.push_back(C); 7404 CV.push_back(C); 7405 CV.push_back(C); 7406 } 7407 Constant *C = ConstantVector::get(CV); 7408 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7409 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7410 MachinePointerInfo::getConstantPool(), 7411 false, false, 16); 7412 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 7413} 7414 7415SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 7416 LLVMContext *Context = DAG.getContext(); 7417 DebugLoc dl = Op.getDebugLoc(); 7418 EVT VT = Op.getValueType(); 7419 EVT EltVT = VT; 7420 if (VT.isVector()) 7421 EltVT = VT.getVectorElementType(); 7422 std::vector<Constant*> CV; 7423 if (EltVT == MVT::f64) { 7424 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 7425 CV.push_back(C); 7426 CV.push_back(C); 7427 } else { 7428 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 7429 CV.push_back(C); 7430 CV.push_back(C); 7431 CV.push_back(C); 7432 CV.push_back(C); 7433 } 7434 Constant *C = ConstantVector::get(CV); 7435 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7436 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7437 MachinePointerInfo::getConstantPool(), 7438 false, false, 16); 7439 if (VT.isVector()) { 7440 return DAG.getNode(ISD::BITCAST, dl, VT, 7441 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 7442 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, 7443 Op.getOperand(0)), 7444 DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); 7445 } else { 7446 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 7447 } 7448} 7449 7450SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 7451 LLVMContext *Context = DAG.getContext(); 7452 SDValue Op0 = Op.getOperand(0); 7453 SDValue Op1 = Op.getOperand(1); 
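  // The lowering is plain bit arithmetic: keep only the sign bit of Op1,
  // clear the sign bit of Op0, and OR the two together (with an extra shift
  // further down when the operand types differ); the constant-pool vectors
  // built below provide the sign-bit and inverted sign-bit masks.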
7454 DebugLoc dl = Op.getDebugLoc(); 7455 EVT VT = Op.getValueType(); 7456 EVT SrcVT = Op1.getValueType(); 7457 7458 // If second operand is smaller, extend it first. 7459 if (SrcVT.bitsLT(VT)) { 7460 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 7461 SrcVT = VT; 7462 } 7463 // And if it is bigger, shrink it first. 7464 if (SrcVT.bitsGT(VT)) { 7465 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 7466 SrcVT = VT; 7467 } 7468 7469 // At this point the operands and the result should have the same 7470 // type, and that won't be f80 since that is not custom lowered. 7471 7472 // First get the sign bit of second operand. 7473 std::vector<Constant*> CV; 7474 if (SrcVT == MVT::f64) { 7475 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 7476 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7477 } else { 7478 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 7479 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7480 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7481 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7482 } 7483 Constant *C = ConstantVector::get(CV); 7484 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7485 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 7486 MachinePointerInfo::getConstantPool(), 7487 false, false, 16); 7488 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 7489 7490 // Shift sign bit right or left if the two operands have different types. 7491 if (SrcVT.bitsGT(VT)) { 7492 // Op0 is MVT::f32, Op1 is MVT::f64. 7493 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 7494 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 7495 DAG.getConstant(32, MVT::i32)); 7496 SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); 7497 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 7498 DAG.getIntPtrConstant(0)); 7499 } 7500 7501 // Clear first operand sign bit. 7502 CV.clear(); 7503 if (VT == MVT::f64) { 7504 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 7505 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 7506 } else { 7507 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 7508 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7509 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7510 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 7511 } 7512 C = ConstantVector::get(CV); 7513 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7514 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7515 MachinePointerInfo::getConstantPool(), 7516 false, false, 16); 7517 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 7518 7519 // Or the value with the sign bit. 7520 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 7521} 7522 7523SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { 7524 SDValue N0 = Op.getOperand(0); 7525 DebugLoc dl = Op.getDebugLoc(); 7526 EVT VT = Op.getValueType(); 7527 7528 // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 7529 SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, 7530 DAG.getConstant(1, VT)); 7531 return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); 7532} 7533 7534/// Emit nodes that will be selected as "test Op0,Op0", or something 7535/// equivalent. 
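/// If Op is already an arithmetic node whose EFLAGS result can stand in for
/// the comparison (an ADD that can become INC/DEC, or a SUB/AND/OR/XOR), this
/// reuses or rewrites that node and returns its flag result instead of
/// emitting a separate compare against zero.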
7536 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
7537                                     SelectionDAG &DAG) const {
7538   DebugLoc dl = Op.getDebugLoc();
7539 
7540   // CF and OF aren't always set the way we want. Determine which
7541   // of these we need.
7542   bool NeedCF = false;
7543   bool NeedOF = false;
7544   switch (X86CC) {
7545   default: break;
7546   case X86::COND_A: case X86::COND_AE:
7547   case X86::COND_B: case X86::COND_BE:
7548     NeedCF = true;
7549     break;
7550   case X86::COND_G: case X86::COND_GE:
7551   case X86::COND_L: case X86::COND_LE:
7552   case X86::COND_O: case X86::COND_NO:
7553     NeedOF = true;
7554     break;
7555   }
7556 
7557   // See if we can use the EFLAGS value from the operand instead of
7558   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
7559   // we prove that the arithmetic won't overflow, we can't use OF or CF.
7560   if (Op.getResNo() != 0 || NeedOF || NeedCF)
7561     // Emit a CMP with 0, which is the TEST pattern.
7562     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
7563                        DAG.getConstant(0, Op.getValueType()));
7564 
7565   unsigned Opcode = 0;
7566   unsigned NumOperands = 0;
7567   switch (Op.getNode()->getOpcode()) {
7568   case ISD::ADD:
7569     // Due to an isel shortcoming, be conservative if this add is likely to be
7570     // selected as part of a load-modify-store instruction. When the root node
7571     // in a match is a store, isel doesn't know how to remap non-chain non-flag
7572     // uses of other nodes in the match, such as the ADD in this case. This
7573     // leads to the ADD being left around and reselected, with the result being
7574     // two adds in the output. Alas, even if none of our users are stores, that
7575     // doesn't prove we're O.K. Ergo, if we have any parents that aren't
7576     // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
7577     // climbing the DAG back to the root, and it doesn't seem to be worth the
7578     // effort.
7579     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
7580            UE = Op.getNode()->use_end(); UI != UE; ++UI)
7581       if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
7582         goto default_case;
7583 
7584     if (ConstantSDNode *C =
7585         dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
7586       // An add of one will be selected as an INC.
7587       if (C->getAPIntValue() == 1) {
7588         Opcode = X86ISD::INC;
7589         NumOperands = 1;
7590         break;
7591       }
7592 
7593       // An add of negative one (subtract of one) will be selected as a DEC.
7594       if (C->getAPIntValue().isAllOnesValue()) {
7595         Opcode = X86ISD::DEC;
7596         NumOperands = 1;
7597         break;
7598       }
7599     }
7600 
7601     // Otherwise use a regular EFLAGS-setting add.
7602     Opcode = X86ISD::ADD;
7603     NumOperands = 2;
7604     break;
7605   case ISD::AND: {
7606     // If the result of the 'and' is not used apart from the flags, don't
7607     // bother using X86ISD::AND, because a TEST instruction will be better.
7608     bool NonFlagUse = false;
7609     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
7610            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
7611       SDNode *User = *UI;
7612       unsigned UOpNo = UI.getOperandNo();
7613       if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
7614         // Look past the truncate.
7615 UOpNo = User->use_begin().getOperandNo(); 7616 User = *User->use_begin(); 7617 } 7618 7619 if (User->getOpcode() != ISD::BRCOND && 7620 User->getOpcode() != ISD::SETCC && 7621 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 7622 NonFlagUse = true; 7623 break; 7624 } 7625 } 7626 7627 if (!NonFlagUse) 7628 break; 7629 } 7630 // FALL THROUGH 7631 case ISD::SUB: 7632 case ISD::OR: 7633 case ISD::XOR: 7634 // Due to the ISEL shortcoming noted above, be conservative if this op is 7635 // likely to be selected as part of a load-modify-store instruction. 7636 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 7637 UE = Op.getNode()->use_end(); UI != UE; ++UI) 7638 if (UI->getOpcode() == ISD::STORE) 7639 goto default_case; 7640 7641 // Otherwise use a regular EFLAGS-setting instruction. 7642 switch (Op.getNode()->getOpcode()) { 7643 default: llvm_unreachable("unexpected operator!"); 7644 case ISD::SUB: Opcode = X86ISD::SUB; break; 7645 case ISD::OR: Opcode = X86ISD::OR; break; 7646 case ISD::XOR: Opcode = X86ISD::XOR; break; 7647 case ISD::AND: Opcode = X86ISD::AND; break; 7648 } 7649 7650 NumOperands = 2; 7651 break; 7652 case X86ISD::ADD: 7653 case X86ISD::SUB: 7654 case X86ISD::INC: 7655 case X86ISD::DEC: 7656 case X86ISD::OR: 7657 case X86ISD::XOR: 7658 case X86ISD::AND: 7659 return SDValue(Op.getNode(), 1); 7660 default: 7661 default_case: 7662 break; 7663 } 7664 7665 if (Opcode == 0) 7666 // Emit a CMP with 0, which is the TEST pattern. 7667 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 7668 DAG.getConstant(0, Op.getValueType())); 7669 7670 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 7671 SmallVector<SDValue, 4> Ops; 7672 for (unsigned i = 0; i != NumOperands; ++i) 7673 Ops.push_back(Op.getOperand(i)); 7674 7675 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 7676 DAG.ReplaceAllUsesWith(Op, New); 7677 return SDValue(New.getNode(), 1); 7678} 7679 7680/// Emit nodes that will be selected as "cmp Op0,Op1", or something 7681/// equivalent. 7682SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 7683 SelectionDAG &DAG) const { 7684 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 7685 if (C->getAPIntValue() == 0) 7686 return EmitTest(Op0, X86CC, DAG); 7687 7688 DebugLoc dl = Op0.getDebugLoc(); 7689 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 7690} 7691 7692/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 7693/// if it's possible. 7694SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 7695 DebugLoc dl, SelectionDAG &DAG) const { 7696 SDValue Op0 = And.getOperand(0); 7697 SDValue Op1 = And.getOperand(1); 7698 if (Op0.getOpcode() == ISD::TRUNCATE) 7699 Op0 = Op0.getOperand(0); 7700 if (Op1.getOpcode() == ISD::TRUNCATE) 7701 Op1 = Op1.getOperand(0); 7702 7703 SDValue LHS, RHS; 7704 if (Op1.getOpcode() == ISD::SHL) 7705 std::swap(Op0, Op1); 7706 if (Op0.getOpcode() == ISD::SHL) { 7707 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 7708 if (And00C->getZExtValue() == 1) { 7709 // If we looked past a truncate, check that it's only truncating away 7710 // known zeros. 
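        // e.g. if an i64 (shl 1, N) was truncated to feed an i32 AND and N
        // could be >= 32, the AND always compares equal to zero while
        // BT(X, N) might still find bit N set in X, so only proceed when the
        // high bits dropped by the truncate are known zero.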
7711 unsigned BitWidth = Op0.getValueSizeInBits(); 7712 unsigned AndBitWidth = And.getValueSizeInBits(); 7713 if (BitWidth > AndBitWidth) { 7714 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 7715 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 7716 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 7717 return SDValue(); 7718 } 7719 LHS = Op1; 7720 RHS = Op0.getOperand(1); 7721 } 7722 } else if (Op1.getOpcode() == ISD::Constant) { 7723 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 7724 SDValue AndLHS = Op0; 7725 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 7726 LHS = AndLHS.getOperand(0); 7727 RHS = AndLHS.getOperand(1); 7728 } 7729 } 7730 7731 if (LHS.getNode()) { 7732 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 7733 // instruction. Since the shift amount is in-range-or-undefined, we know 7734 // that doing a bittest on the i32 value is ok. We extend to i32 because 7735 // the encoding for the i16 version is larger than the i32 version. 7736 // Also promote i16 to i32 for performance / code size reason. 7737 if (LHS.getValueType() == MVT::i8 || 7738 LHS.getValueType() == MVT::i16) 7739 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 7740 7741 // If the operand types disagree, extend the shift amount to match. Since 7742 // BT ignores high bits (like shifts) we can use anyextend. 7743 if (LHS.getValueType() != RHS.getValueType()) 7744 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 7745 7746 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 7747 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 7748 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7749 DAG.getConstant(Cond, MVT::i8), BT); 7750 } 7751 7752 return SDValue(); 7753} 7754 7755SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 7756 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 7757 SDValue Op0 = Op.getOperand(0); 7758 SDValue Op1 = Op.getOperand(1); 7759 DebugLoc dl = Op.getDebugLoc(); 7760 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7761 7762 // Optimize to BT if possible. 7763 // Lower (X & (1 << N)) == 0 to BT(X, N). 7764 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 7765 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 7766 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && 7767 Op1.getOpcode() == ISD::Constant && 7768 cast<ConstantSDNode>(Op1)->isNullValue() && 7769 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7770 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 7771 if (NewSetCC.getNode()) 7772 return NewSetCC; 7773 } 7774 7775 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of 7776 // these. 7777 if (Op1.getOpcode() == ISD::Constant && 7778 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 7779 cast<ConstantSDNode>(Op1)->isNullValue()) && 7780 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7781 7782 // If the input is a setcc, then reuse the input setcc or use a new one with 7783 // the inverted condition. 
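    // For example, if Op0 is (x86setcc E, flags):
    //   (setcc Op0 ne 0) and (setcc Op0 eq 1)  ->  Op0 unchanged
    //   (setcc Op0 eq 0) and (setcc Op0 ne 1)  ->  (x86setcc NE, flags)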
7784 if (Op0.getOpcode() == X86ISD::SETCC) { 7785 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 7786 bool Invert = (CC == ISD::SETNE) ^ 7787 cast<ConstantSDNode>(Op1)->isNullValue(); 7788 if (!Invert) return Op0; 7789 7790 CCode = X86::GetOppositeBranchCondition(CCode); 7791 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7792 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 7793 } 7794 } 7795 7796 bool isFP = Op1.getValueType().isFloatingPoint(); 7797 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 7798 if (X86CC == X86::COND_INVALID) 7799 return SDValue(); 7800 7801 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); 7802 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7803 DAG.getConstant(X86CC, MVT::i8), EFLAGS); 7804} 7805 7806SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 7807 SDValue Cond; 7808 SDValue Op0 = Op.getOperand(0); 7809 SDValue Op1 = Op.getOperand(1); 7810 SDValue CC = Op.getOperand(2); 7811 EVT VT = Op.getValueType(); 7812 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 7813 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 7814 DebugLoc dl = Op.getDebugLoc(); 7815 7816 if (isFP) { 7817 unsigned SSECC = 8; 7818 EVT VT0 = Op0.getValueType(); 7819 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 7820 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 7821 bool Swap = false; 7822 7823 switch (SetCCOpcode) { 7824 default: break; 7825 case ISD::SETOEQ: 7826 case ISD::SETEQ: SSECC = 0; break; 7827 case ISD::SETOGT: 7828 case ISD::SETGT: Swap = true; // Fallthrough 7829 case ISD::SETLT: 7830 case ISD::SETOLT: SSECC = 1; break; 7831 case ISD::SETOGE: 7832 case ISD::SETGE: Swap = true; // Fallthrough 7833 case ISD::SETLE: 7834 case ISD::SETOLE: SSECC = 2; break; 7835 case ISD::SETUO: SSECC = 3; break; 7836 case ISD::SETUNE: 7837 case ISD::SETNE: SSECC = 4; break; 7838 case ISD::SETULE: Swap = true; 7839 case ISD::SETUGE: SSECC = 5; break; 7840 case ISD::SETULT: Swap = true; 7841 case ISD::SETUGT: SSECC = 6; break; 7842 case ISD::SETO: SSECC = 7; break; 7843 } 7844 if (Swap) 7845 std::swap(Op0, Op1); 7846 7847 // In the two special cases we can't handle, emit two comparisons. 7848 if (SSECC == 8) { 7849 if (SetCCOpcode == ISD::SETUEQ) { 7850 SDValue UNORD, EQ; 7851 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 7852 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 7853 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 7854 } 7855 else if (SetCCOpcode == ISD::SETONE) { 7856 SDValue ORD, NEQ; 7857 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 7858 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 7859 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 7860 } 7861 llvm_unreachable("Illegal FP comparison"); 7862 } 7863 // Handle all other FP comparisons here. 7864 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 7865 } 7866 7867 // We are handling one of the integer comparisons here. Since SSE only has 7868 // GT and EQ comparisons for integer, swapping operands and multiple 7869 // operations may be required for some comparisons. 
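  // For example, for v4i32:
  //   setle  a, b  ->  NOT (PCMPGTD a, b)
  //   setult a, b  ->  PCMPGTD (b ^ 0x80000000), (a ^ 0x80000000)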
7870 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 7871 bool Swap = false, Invert = false, FlipSigns = false; 7872 7873 switch (VT.getSimpleVT().SimpleTy) { 7874 default: break; 7875 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 7876 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 7877 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 7878 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 7879 } 7880 7881 switch (SetCCOpcode) { 7882 default: break; 7883 case ISD::SETNE: Invert = true; 7884 case ISD::SETEQ: Opc = EQOpc; break; 7885 case ISD::SETLT: Swap = true; 7886 case ISD::SETGT: Opc = GTOpc; break; 7887 case ISD::SETGE: Swap = true; 7888 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 7889 case ISD::SETULT: Swap = true; 7890 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 7891 case ISD::SETUGE: Swap = true; 7892 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 7893 } 7894 if (Swap) 7895 std::swap(Op0, Op1); 7896 7897 // Since SSE has no unsigned integer comparisons, we need to flip the sign 7898 // bits of the inputs before performing those operations. 7899 if (FlipSigns) { 7900 EVT EltVT = VT.getVectorElementType(); 7901 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 7902 EltVT); 7903 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 7904 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 7905 SignBits.size()); 7906 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 7907 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 7908 } 7909 7910 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 7911 7912 // If the logical-not of the result is required, perform that now. 7913 if (Invert) 7914 Result = DAG.getNOT(dl, Result, VT); 7915 7916 return Result; 7917} 7918 7919// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
7920static bool isX86LogicalCmp(SDValue Op) { 7921 unsigned Opc = Op.getNode()->getOpcode(); 7922 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 7923 return true; 7924 if (Op.getResNo() == 1 && 7925 (Opc == X86ISD::ADD || 7926 Opc == X86ISD::SUB || 7927 Opc == X86ISD::ADC || 7928 Opc == X86ISD::SBB || 7929 Opc == X86ISD::SMUL || 7930 Opc == X86ISD::UMUL || 7931 Opc == X86ISD::INC || 7932 Opc == X86ISD::DEC || 7933 Opc == X86ISD::OR || 7934 Opc == X86ISD::XOR || 7935 Opc == X86ISD::AND)) 7936 return true; 7937 7938 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) 7939 return true; 7940 7941 return false; 7942} 7943 7944static bool isZero(SDValue V) { 7945 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7946 return C && C->isNullValue(); 7947} 7948 7949static bool isAllOnes(SDValue V) { 7950 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 7951 return C && C->isAllOnesValue(); 7952} 7953 7954SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 7955 bool addTest = true; 7956 SDValue Cond = Op.getOperand(0); 7957 SDValue Op1 = Op.getOperand(1); 7958 SDValue Op2 = Op.getOperand(2); 7959 DebugLoc DL = Op.getDebugLoc(); 7960 SDValue CC; 7961 7962 if (Cond.getOpcode() == ISD::SETCC) { 7963 SDValue NewCond = LowerSETCC(Cond, DAG); 7964 if (NewCond.getNode()) 7965 Cond = NewCond; 7966 } 7967 7968 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y 7969 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y 7970 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y 7971 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y 7972 if (Cond.getOpcode() == X86ISD::SETCC && 7973 Cond.getOperand(1).getOpcode() == X86ISD::CMP && 7974 isZero(Cond.getOperand(1).getOperand(1))) { 7975 SDValue Cmp = Cond.getOperand(1); 7976 7977 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); 7978 7979 if ((isAllOnes(Op1) || isAllOnes(Op2)) && 7980 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { 7981 SDValue Y = isAllOnes(Op2) ? Op1 : Op2; 7982 7983 SDValue CmpOp0 = Cmp.getOperand(0); 7984 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, 7985 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 7986 7987 SDValue Res = // Res = 0 or -1. 7988 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 7989 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 7990 7991 if (isAllOnes(Op1) != (CondCode == X86::COND_E)) 7992 Res = DAG.getNOT(DL, Res, Res.getValueType()); 7993 7994 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 7995 if (N2C == 0 || !N2C->isNullValue()) 7996 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); 7997 return Res; 7998 } 7999 } 8000 8001 // Look past (and (setcc_carry (cmp ...)), 1). 8002 if (Cond.getOpcode() == ISD::AND && 8003 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8004 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8005 if (C && C->getAPIntValue() == 1) 8006 Cond = Cond.getOperand(0); 8007 } 8008 8009 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8010 // setting operand in place of the X86ISD::SETCC. 8011 if (Cond.getOpcode() == X86ISD::SETCC || 8012 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 8013 CC = Cond.getOperand(0); 8014 8015 SDValue Cmp = Cond.getOperand(1); 8016 unsigned Opc = Cmp.getOpcode(); 8017 EVT VT = Op.getValueType(); 8018 8019 bool IllegalFPCMov = false; 8020 if (VT.isFloatingPoint() && !VT.isVector() && 8021 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
8022 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 8023 8024 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 8025 Opc == X86ISD::BT) { // FIXME 8026 Cond = Cmp; 8027 addTest = false; 8028 } 8029 } 8030 8031 if (addTest) { 8032 // Look pass the truncate. 8033 if (Cond.getOpcode() == ISD::TRUNCATE) 8034 Cond = Cond.getOperand(0); 8035 8036 // We know the result of AND is compared against zero. Try to match 8037 // it to BT. 8038 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8039 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); 8040 if (NewSetCC.getNode()) { 8041 CC = NewSetCC.getOperand(0); 8042 Cond = NewSetCC.getOperand(1); 8043 addTest = false; 8044 } 8045 } 8046 } 8047 8048 if (addTest) { 8049 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8050 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8051 } 8052 8053 // a < b ? -1 : 0 -> RES = ~setcc_carry 8054 // a < b ? 0 : -1 -> RES = setcc_carry 8055 // a >= b ? -1 : 0 -> RES = setcc_carry 8056 // a >= b ? 0 : -1 -> RES = ~setcc_carry 8057 if (Cond.getOpcode() == X86ISD::CMP) { 8058 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); 8059 8060 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && 8061 (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { 8062 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), 8063 DAG.getConstant(X86::COND_B, MVT::i8), Cond); 8064 if (isAllOnes(Op1) != (CondCode == X86::COND_B)) 8065 return DAG.getNOT(DL, Res, Res.getValueType()); 8066 return Res; 8067 } 8068 } 8069 8070 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 8071 // condition is true. 8072 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); 8073 SDValue Ops[] = { Op2, Op1, CC, Cond }; 8074 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); 8075} 8076 8077// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 8078// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 8079// from the AND / OR. 8080static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 8081 Opc = Op.getOpcode(); 8082 if (Opc != ISD::OR && Opc != ISD::AND) 8083 return false; 8084 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8085 Op.getOperand(0).hasOneUse() && 8086 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 8087 Op.getOperand(1).hasOneUse()); 8088} 8089 8090// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 8091// 1 and that the SETCC node has a single use. 8092static bool isXor1OfSetCC(SDValue Op) { 8093 if (Op.getOpcode() != ISD::XOR) 8094 return false; 8095 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 8096 if (N1C && N1C->getAPIntValue() == 1) { 8097 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 8098 Op.getOperand(0).hasOneUse(); 8099 } 8100 return false; 8101} 8102 8103SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 8104 bool addTest = true; 8105 SDValue Chain = Op.getOperand(0); 8106 SDValue Cond = Op.getOperand(1); 8107 SDValue Dest = Op.getOperand(2); 8108 DebugLoc dl = Op.getDebugLoc(); 8109 SDValue CC; 8110 8111 if (Cond.getOpcode() == ISD::SETCC) { 8112 SDValue NewCond = LowerSETCC(Cond, DAG); 8113 if (NewCond.getNode()) 8114 Cond = NewCond; 8115 } 8116#if 0 8117 // FIXME: LowerXALUO doesn't handle these!! 
8118 else if (Cond.getOpcode() == X86ISD::ADD || 8119 Cond.getOpcode() == X86ISD::SUB || 8120 Cond.getOpcode() == X86ISD::SMUL || 8121 Cond.getOpcode() == X86ISD::UMUL) 8122 Cond = LowerXALUO(Cond, DAG); 8123#endif 8124 8125 // Look pass (and (setcc_carry (cmp ...)), 1). 8126 if (Cond.getOpcode() == ISD::AND && 8127 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 8128 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 8129 if (C && C->getAPIntValue() == 1) 8130 Cond = Cond.getOperand(0); 8131 } 8132 8133 // If condition flag is set by a X86ISD::CMP, then use it as the condition 8134 // setting operand in place of the X86ISD::SETCC. 8135 if (Cond.getOpcode() == X86ISD::SETCC || 8136 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 8137 CC = Cond.getOperand(0); 8138 8139 SDValue Cmp = Cond.getOperand(1); 8140 unsigned Opc = Cmp.getOpcode(); 8141 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 8142 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 8143 Cond = Cmp; 8144 addTest = false; 8145 } else { 8146 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 8147 default: break; 8148 case X86::COND_O: 8149 case X86::COND_B: 8150 // These can only come from an arithmetic instruction with overflow, 8151 // e.g. SADDO, UADDO. 8152 Cond = Cond.getNode()->getOperand(1); 8153 addTest = false; 8154 break; 8155 } 8156 } 8157 } else { 8158 unsigned CondOpc; 8159 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 8160 SDValue Cmp = Cond.getOperand(0).getOperand(1); 8161 if (CondOpc == ISD::OR) { 8162 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 8163 // two branches instead of an explicit OR instruction with a 8164 // separate test. 8165 if (Cmp == Cond.getOperand(1).getOperand(1) && 8166 isX86LogicalCmp(Cmp)) { 8167 CC = Cond.getOperand(0).getOperand(0); 8168 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8169 Chain, Dest, CC, Cmp); 8170 CC = Cond.getOperand(1).getOperand(0); 8171 Cond = Cmp; 8172 addTest = false; 8173 } 8174 } else { // ISD::AND 8175 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 8176 // two branches instead of an explicit AND instruction with a 8177 // separate test. However, we only do this if this block doesn't 8178 // have a fall-through edge, because this requires an explicit 8179 // jmp when the condition is false. 8180 if (Cmp == Cond.getOperand(1).getOperand(1) && 8181 isX86LogicalCmp(Cmp) && 8182 Op.getNode()->hasOneUse()) { 8183 X86::CondCode CCode = 8184 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8185 CCode = X86::GetOppositeBranchCondition(CCode); 8186 CC = DAG.getConstant(CCode, MVT::i8); 8187 SDNode *User = *Op.getNode()->use_begin(); 8188 // Look for an unconditional branch following this conditional branch. 8189 // We need this because we need to reverse the successors in order 8190 // to implement FCMP_OEQ. 
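          // e.g. "brcond (and cc1, cc2), TrueBB" followed by "br FalseBB"
          // becomes: jcc !cc1, FalseBB; jcc !cc2, FalseBB; jmp TrueBB.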
8191 if (User->getOpcode() == ISD::BR) { 8192 SDValue FalseBB = User->getOperand(1); 8193 SDNode *NewBR = 8194 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 8195 assert(NewBR == User); 8196 (void)NewBR; 8197 Dest = FalseBB; 8198 8199 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8200 Chain, Dest, CC, Cmp); 8201 X86::CondCode CCode = 8202 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 8203 CCode = X86::GetOppositeBranchCondition(CCode); 8204 CC = DAG.getConstant(CCode, MVT::i8); 8205 Cond = Cmp; 8206 addTest = false; 8207 } 8208 } 8209 } 8210 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 8211 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 8212 // It should be transformed during dag combiner except when the condition 8213 // is set by a arithmetics with overflow node. 8214 X86::CondCode CCode = 8215 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 8216 CCode = X86::GetOppositeBranchCondition(CCode); 8217 CC = DAG.getConstant(CCode, MVT::i8); 8218 Cond = Cond.getOperand(0).getOperand(1); 8219 addTest = false; 8220 } 8221 } 8222 8223 if (addTest) { 8224 // Look pass the truncate. 8225 if (Cond.getOpcode() == ISD::TRUNCATE) 8226 Cond = Cond.getOperand(0); 8227 8228 // We know the result of AND is compared against zero. Try to match 8229 // it to BT. 8230 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 8231 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 8232 if (NewSetCC.getNode()) { 8233 CC = NewSetCC.getOperand(0); 8234 Cond = NewSetCC.getOperand(1); 8235 addTest = false; 8236 } 8237 } 8238 } 8239 8240 if (addTest) { 8241 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 8242 Cond = EmitTest(Cond, X86::COND_NE, DAG); 8243 } 8244 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 8245 Chain, Dest, CC, Cond); 8246} 8247 8248 8249// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 8250// Calls to _alloca is needed to probe the stack when allocating more than 4k 8251// bytes in one go. Touching the stack at 4K increments is necessary to ensure 8252// that the guard pages used by the OS virtual memory manager are allocated in 8253// correct sequence. 8254SDValue 8255X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 8256 SelectionDAG &DAG) const { 8257 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && 8258 "This should be used only on Windows targets"); 8259 assert(!Subtarget->isTargetEnvMacho()); 8260 DebugLoc dl = Op.getDebugLoc(); 8261 8262 // Get the inputs. 8263 SDValue Chain = Op.getOperand(0); 8264 SDValue Size = Op.getOperand(1); 8265 // FIXME: Ensure alignment here 8266 8267 SDValue Flag; 8268 8269 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 8270 unsigned Reg = (Subtarget->is64Bit() ? 
X86::RAX : X86::EAX); 8271 8272 Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); 8273 Flag = Chain.getValue(1); 8274 8275 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 8276 8277 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 8278 Flag = Chain.getValue(1); 8279 8280 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 8281 8282 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 8283 return DAG.getMergeValues(Ops1, 2, dl); 8284} 8285 8286SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 8287 MachineFunction &MF = DAG.getMachineFunction(); 8288 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 8289 8290 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8291 DebugLoc DL = Op.getDebugLoc(); 8292 8293 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 8294 // vastart just stores the address of the VarArgsFrameIndex slot into the 8295 // memory location argument. 8296 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8297 getPointerTy()); 8298 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 8299 MachinePointerInfo(SV), false, false, 0); 8300 } 8301 8302 // __va_list_tag: 8303 // gp_offset (0 - 6 * 8) 8304 // fp_offset (48 - 48 + 8 * 16) 8305 // overflow_arg_area (point to parameters coming in memory). 8306 // reg_save_area 8307 SmallVector<SDValue, 8> MemOps; 8308 SDValue FIN = Op.getOperand(1); 8309 // Store gp_offset 8310 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 8311 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 8312 MVT::i32), 8313 FIN, MachinePointerInfo(SV), false, false, 0); 8314 MemOps.push_back(Store); 8315 8316 // Store fp_offset 8317 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8318 FIN, DAG.getIntPtrConstant(4)); 8319 Store = DAG.getStore(Op.getOperand(0), DL, 8320 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 8321 MVT::i32), 8322 FIN, MachinePointerInfo(SV, 4), false, false, 0); 8323 MemOps.push_back(Store); 8324 8325 // Store ptr to overflow_arg_area 8326 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8327 FIN, DAG.getIntPtrConstant(4)); 8328 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 8329 getPointerTy()); 8330 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 8331 MachinePointerInfo(SV, 8), 8332 false, false, 0); 8333 MemOps.push_back(Store); 8334 8335 // Store ptr to reg_save_area. 
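  // reg_save_area lives at offset 16 of __va_list_tag (gp_offset at 0,
  // fp_offset at 4, the i8* overflow_arg_area at 8), so advance FIN, which
  // currently points at offset 8, by another 8 bytes.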
8336 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 8337 FIN, DAG.getIntPtrConstant(8)); 8338 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 8339 getPointerTy()); 8340 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 8341 MachinePointerInfo(SV, 16), false, false, 0); 8342 MemOps.push_back(Store); 8343 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 8344 &MemOps[0], MemOps.size()); 8345} 8346 8347SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 8348 assert(Subtarget->is64Bit() && 8349 "LowerVAARG only handles 64-bit va_arg!"); 8350 assert((Subtarget->isTargetLinux() || 8351 Subtarget->isTargetDarwin()) && 8352 "Unhandled target in LowerVAARG"); 8353 assert(Op.getNode()->getNumOperands() == 4); 8354 SDValue Chain = Op.getOperand(0); 8355 SDValue SrcPtr = Op.getOperand(1); 8356 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 8357 unsigned Align = Op.getConstantOperandVal(3); 8358 DebugLoc dl = Op.getDebugLoc(); 8359 8360 EVT ArgVT = Op.getNode()->getValueType(0); 8361 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 8362 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 8363 uint8_t ArgMode; 8364 8365 // Decide which area this value should be read from. 8366 // TODO: Implement the AMD64 ABI in its entirety. This simple 8367 // selection mechanism works only for the basic types. 8368 if (ArgVT == MVT::f80) { 8369 llvm_unreachable("va_arg for f80 not yet implemented"); 8370 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 8371 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 8372 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 8373 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 8374 } else { 8375 llvm_unreachable("Unhandled argument type in LowerVAARG"); 8376 } 8377 8378 if (ArgMode == 2) { 8379 // Sanity Check: Make sure using fp_offset makes sense. 8380 assert(!UseSoftFloat && 8381 !(DAG.getMachineFunction() 8382 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 8383 Subtarget->hasXMM()); 8384 } 8385 8386 // Insert VAARG_64 node into the DAG 8387 // VAARG_64 returns two values: Variable Argument Address, Chain 8388 SmallVector<SDValue, 11> InstOps; 8389 InstOps.push_back(Chain); 8390 InstOps.push_back(SrcPtr); 8391 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 8392 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 8393 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 8394 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 8395 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 8396 VTs, &InstOps[0], InstOps.size(), 8397 MVT::i64, 8398 MachinePointerInfo(SV), 8399 /*Align=*/0, 8400 /*Volatile=*/false, 8401 /*ReadMem=*/true, 8402 /*WriteMem=*/true); 8403 Chain = VAARG.getValue(1); 8404 8405 // Load the next argument and return it 8406 return DAG.getLoad(ArgVT, dl, 8407 Chain, 8408 VAARG, 8409 MachinePointerInfo(), 8410 false, false, 0); 8411} 8412 8413SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 8414 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
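  // That struct is 4 + 4 + 8 + 8 = 24 bytes, which is why a fixed 24-byte,
  // 8-byte-aligned memcpy suffices below.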
8415 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 8416 SDValue Chain = Op.getOperand(0); 8417 SDValue DstPtr = Op.getOperand(1); 8418 SDValue SrcPtr = Op.getOperand(2); 8419 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 8420 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8421 DebugLoc DL = Op.getDebugLoc(); 8422 8423 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 8424 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 8425 false, 8426 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 8427} 8428 8429SDValue 8430X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 8431 DebugLoc dl = Op.getDebugLoc(); 8432 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8433 switch (IntNo) { 8434 default: return SDValue(); // Don't custom lower most intrinsics. 8435 // Comparison intrinsics. 8436 case Intrinsic::x86_sse_comieq_ss: 8437 case Intrinsic::x86_sse_comilt_ss: 8438 case Intrinsic::x86_sse_comile_ss: 8439 case Intrinsic::x86_sse_comigt_ss: 8440 case Intrinsic::x86_sse_comige_ss: 8441 case Intrinsic::x86_sse_comineq_ss: 8442 case Intrinsic::x86_sse_ucomieq_ss: 8443 case Intrinsic::x86_sse_ucomilt_ss: 8444 case Intrinsic::x86_sse_ucomile_ss: 8445 case Intrinsic::x86_sse_ucomigt_ss: 8446 case Intrinsic::x86_sse_ucomige_ss: 8447 case Intrinsic::x86_sse_ucomineq_ss: 8448 case Intrinsic::x86_sse2_comieq_sd: 8449 case Intrinsic::x86_sse2_comilt_sd: 8450 case Intrinsic::x86_sse2_comile_sd: 8451 case Intrinsic::x86_sse2_comigt_sd: 8452 case Intrinsic::x86_sse2_comige_sd: 8453 case Intrinsic::x86_sse2_comineq_sd: 8454 case Intrinsic::x86_sse2_ucomieq_sd: 8455 case Intrinsic::x86_sse2_ucomilt_sd: 8456 case Intrinsic::x86_sse2_ucomile_sd: 8457 case Intrinsic::x86_sse2_ucomigt_sd: 8458 case Intrinsic::x86_sse2_ucomige_sd: 8459 case Intrinsic::x86_sse2_ucomineq_sd: { 8460 unsigned Opc = 0; 8461 ISD::CondCode CC = ISD::SETCC_INVALID; 8462 switch (IntNo) { 8463 default: break; 8464 case Intrinsic::x86_sse_comieq_ss: 8465 case Intrinsic::x86_sse2_comieq_sd: 8466 Opc = X86ISD::COMI; 8467 CC = ISD::SETEQ; 8468 break; 8469 case Intrinsic::x86_sse_comilt_ss: 8470 case Intrinsic::x86_sse2_comilt_sd: 8471 Opc = X86ISD::COMI; 8472 CC = ISD::SETLT; 8473 break; 8474 case Intrinsic::x86_sse_comile_ss: 8475 case Intrinsic::x86_sse2_comile_sd: 8476 Opc = X86ISD::COMI; 8477 CC = ISD::SETLE; 8478 break; 8479 case Intrinsic::x86_sse_comigt_ss: 8480 case Intrinsic::x86_sse2_comigt_sd: 8481 Opc = X86ISD::COMI; 8482 CC = ISD::SETGT; 8483 break; 8484 case Intrinsic::x86_sse_comige_ss: 8485 case Intrinsic::x86_sse2_comige_sd: 8486 Opc = X86ISD::COMI; 8487 CC = ISD::SETGE; 8488 break; 8489 case Intrinsic::x86_sse_comineq_ss: 8490 case Intrinsic::x86_sse2_comineq_sd: 8491 Opc = X86ISD::COMI; 8492 CC = ISD::SETNE; 8493 break; 8494 case Intrinsic::x86_sse_ucomieq_ss: 8495 case Intrinsic::x86_sse2_ucomieq_sd: 8496 Opc = X86ISD::UCOMI; 8497 CC = ISD::SETEQ; 8498 break; 8499 case Intrinsic::x86_sse_ucomilt_ss: 8500 case Intrinsic::x86_sse2_ucomilt_sd: 8501 Opc = X86ISD::UCOMI; 8502 CC = ISD::SETLT; 8503 break; 8504 case Intrinsic::x86_sse_ucomile_ss: 8505 case Intrinsic::x86_sse2_ucomile_sd: 8506 Opc = X86ISD::UCOMI; 8507 CC = ISD::SETLE; 8508 break; 8509 case Intrinsic::x86_sse_ucomigt_ss: 8510 case Intrinsic::x86_sse2_ucomigt_sd: 8511 Opc = X86ISD::UCOMI; 8512 CC = ISD::SETGT; 8513 break; 8514 case Intrinsic::x86_sse_ucomige_ss: 8515 case Intrinsic::x86_sse2_ucomige_sd: 8516 Opc = X86ISD::UCOMI; 8517 
CC = ISD::SETGE; 8518 break; 8519 case Intrinsic::x86_sse_ucomineq_ss: 8520 case Intrinsic::x86_sse2_ucomineq_sd: 8521 Opc = X86ISD::UCOMI; 8522 CC = ISD::SETNE; 8523 break; 8524 } 8525 8526 SDValue LHS = Op.getOperand(1); 8527 SDValue RHS = Op.getOperand(2); 8528 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 8529 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 8530 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 8531 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 8532 DAG.getConstant(X86CC, MVT::i8), Cond); 8533 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 8534 } 8535 // ptest and testp intrinsics. The intrinsic these come from are designed to 8536 // return an integer value, not just an instruction so lower it to the ptest 8537 // or testp pattern and a setcc for the result. 8538 case Intrinsic::x86_sse41_ptestz: 8539 case Intrinsic::x86_sse41_ptestc: 8540 case Intrinsic::x86_sse41_ptestnzc: 8541 case Intrinsic::x86_avx_ptestz_256: 8542 case Intrinsic::x86_avx_ptestc_256: 8543 case Intrinsic::x86_avx_ptestnzc_256: 8544 case Intrinsic::x86_avx_vtestz_ps: 8545 case Intrinsic::x86_avx_vtestc_ps: 8546 case Intrinsic::x86_avx_vtestnzc_ps: 8547 case Intrinsic::x86_avx_vtestz_pd: 8548 case Intrinsic::x86_avx_vtestc_pd: 8549 case Intrinsic::x86_avx_vtestnzc_pd: 8550 case Intrinsic::x86_avx_vtestz_ps_256: 8551 case Intrinsic::x86_avx_vtestc_ps_256: 8552 case Intrinsic::x86_avx_vtestnzc_ps_256: 8553 case Intrinsic::x86_avx_vtestz_pd_256: 8554 case Intrinsic::x86_avx_vtestc_pd_256: 8555 case Intrinsic::x86_avx_vtestnzc_pd_256: { 8556 bool IsTestPacked = false; 8557 unsigned X86CC = 0; 8558 switch (IntNo) { 8559 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 8560 case Intrinsic::x86_avx_vtestz_ps: 8561 case Intrinsic::x86_avx_vtestz_pd: 8562 case Intrinsic::x86_avx_vtestz_ps_256: 8563 case Intrinsic::x86_avx_vtestz_pd_256: 8564 IsTestPacked = true; // Fallthrough 8565 case Intrinsic::x86_sse41_ptestz: 8566 case Intrinsic::x86_avx_ptestz_256: 8567 // ZF = 1 8568 X86CC = X86::COND_E; 8569 break; 8570 case Intrinsic::x86_avx_vtestc_ps: 8571 case Intrinsic::x86_avx_vtestc_pd: 8572 case Intrinsic::x86_avx_vtestc_ps_256: 8573 case Intrinsic::x86_avx_vtestc_pd_256: 8574 IsTestPacked = true; // Fallthrough 8575 case Intrinsic::x86_sse41_ptestc: 8576 case Intrinsic::x86_avx_ptestc_256: 8577 // CF = 1 8578 X86CC = X86::COND_B; 8579 break; 8580 case Intrinsic::x86_avx_vtestnzc_ps: 8581 case Intrinsic::x86_avx_vtestnzc_pd: 8582 case Intrinsic::x86_avx_vtestnzc_ps_256: 8583 case Intrinsic::x86_avx_vtestnzc_pd_256: 8584 IsTestPacked = true; // Fallthrough 8585 case Intrinsic::x86_sse41_ptestnzc: 8586 case Intrinsic::x86_avx_ptestnzc_256: 8587 // ZF and CF = 0 8588 X86CC = X86::COND_A; 8589 break; 8590 } 8591 8592 SDValue LHS = Op.getOperand(1); 8593 SDValue RHS = Op.getOperand(2); 8594 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 8595 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 8596 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 8597 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 8598 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 8599 } 8600 8601 // Fix vector shift instructions where the last operand is a non-immediate 8602 // i32 value. 
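  // e.g. a non-constant amount for x86_sse2_pslli_d is rewritten to
  // x86_sse2_psll_d with the amount zero-extended into the low 64 bits of a
  // vector operand, since the immediate forms need a compile-time constant.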
8603 case Intrinsic::x86_sse2_pslli_w: 8604 case Intrinsic::x86_sse2_pslli_d: 8605 case Intrinsic::x86_sse2_pslli_q: 8606 case Intrinsic::x86_sse2_psrli_w: 8607 case Intrinsic::x86_sse2_psrli_d: 8608 case Intrinsic::x86_sse2_psrli_q: 8609 case Intrinsic::x86_sse2_psrai_w: 8610 case Intrinsic::x86_sse2_psrai_d: 8611 case Intrinsic::x86_mmx_pslli_w: 8612 case Intrinsic::x86_mmx_pslli_d: 8613 case Intrinsic::x86_mmx_pslli_q: 8614 case Intrinsic::x86_mmx_psrli_w: 8615 case Intrinsic::x86_mmx_psrli_d: 8616 case Intrinsic::x86_mmx_psrli_q: 8617 case Intrinsic::x86_mmx_psrai_w: 8618 case Intrinsic::x86_mmx_psrai_d: { 8619 SDValue ShAmt = Op.getOperand(2); 8620 if (isa<ConstantSDNode>(ShAmt)) 8621 return SDValue(); 8622 8623 unsigned NewIntNo = 0; 8624 EVT ShAmtVT = MVT::v4i32; 8625 switch (IntNo) { 8626 case Intrinsic::x86_sse2_pslli_w: 8627 NewIntNo = Intrinsic::x86_sse2_psll_w; 8628 break; 8629 case Intrinsic::x86_sse2_pslli_d: 8630 NewIntNo = Intrinsic::x86_sse2_psll_d; 8631 break; 8632 case Intrinsic::x86_sse2_pslli_q: 8633 NewIntNo = Intrinsic::x86_sse2_psll_q; 8634 break; 8635 case Intrinsic::x86_sse2_psrli_w: 8636 NewIntNo = Intrinsic::x86_sse2_psrl_w; 8637 break; 8638 case Intrinsic::x86_sse2_psrli_d: 8639 NewIntNo = Intrinsic::x86_sse2_psrl_d; 8640 break; 8641 case Intrinsic::x86_sse2_psrli_q: 8642 NewIntNo = Intrinsic::x86_sse2_psrl_q; 8643 break; 8644 case Intrinsic::x86_sse2_psrai_w: 8645 NewIntNo = Intrinsic::x86_sse2_psra_w; 8646 break; 8647 case Intrinsic::x86_sse2_psrai_d: 8648 NewIntNo = Intrinsic::x86_sse2_psra_d; 8649 break; 8650 default: { 8651 ShAmtVT = MVT::v2i32; 8652 switch (IntNo) { 8653 case Intrinsic::x86_mmx_pslli_w: 8654 NewIntNo = Intrinsic::x86_mmx_psll_w; 8655 break; 8656 case Intrinsic::x86_mmx_pslli_d: 8657 NewIntNo = Intrinsic::x86_mmx_psll_d; 8658 break; 8659 case Intrinsic::x86_mmx_pslli_q: 8660 NewIntNo = Intrinsic::x86_mmx_psll_q; 8661 break; 8662 case Intrinsic::x86_mmx_psrli_w: 8663 NewIntNo = Intrinsic::x86_mmx_psrl_w; 8664 break; 8665 case Intrinsic::x86_mmx_psrli_d: 8666 NewIntNo = Intrinsic::x86_mmx_psrl_d; 8667 break; 8668 case Intrinsic::x86_mmx_psrli_q: 8669 NewIntNo = Intrinsic::x86_mmx_psrl_q; 8670 break; 8671 case Intrinsic::x86_mmx_psrai_w: 8672 NewIntNo = Intrinsic::x86_mmx_psra_w; 8673 break; 8674 case Intrinsic::x86_mmx_psrai_d: 8675 NewIntNo = Intrinsic::x86_mmx_psra_d; 8676 break; 8677 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 8678 } 8679 break; 8680 } 8681 } 8682 8683 // The vector shift intrinsics with scalars uses 32b shift amounts but 8684 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 8685 // to be zero. 8686 SDValue ShOps[4]; 8687 ShOps[0] = ShAmt; 8688 ShOps[1] = DAG.getConstant(0, MVT::i32); 8689 if (ShAmtVT == MVT::v4i32) { 8690 ShOps[2] = DAG.getUNDEF(MVT::i32); 8691 ShOps[3] = DAG.getUNDEF(MVT::i32); 8692 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 8693 } else { 8694 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 8695// FIXME this must be lowered to get rid of the invalid type. 
8696 } 8697 8698 EVT VT = Op.getValueType(); 8699 ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); 8700 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8701 DAG.getConstant(NewIntNo, MVT::i32), 8702 Op.getOperand(1), ShAmt); 8703 } 8704 } 8705} 8706 8707SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 8708 SelectionDAG &DAG) const { 8709 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8710 MFI->setReturnAddressIsTaken(true); 8711 8712 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8713 DebugLoc dl = Op.getDebugLoc(); 8714 8715 if (Depth > 0) { 8716 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 8717 SDValue Offset = 8718 DAG.getConstant(TD->getPointerSize(), 8719 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 8720 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 8721 DAG.getNode(ISD::ADD, dl, getPointerTy(), 8722 FrameAddr, Offset), 8723 MachinePointerInfo(), false, false, 0); 8724 } 8725 8726 // Just load the return address. 8727 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 8728 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 8729 RetAddrFI, MachinePointerInfo(), false, false, 0); 8730} 8731 8732SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 8733 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8734 MFI->setFrameAddressIsTaken(true); 8735 8736 EVT VT = Op.getValueType(); 8737 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 8738 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 8739 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 8740 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 8741 while (Depth--) 8742 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 8743 MachinePointerInfo(), 8744 false, false, 0); 8745 return FrameAddr; 8746} 8747 8748SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 8749 SelectionDAG &DAG) const { 8750 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 8751} 8752 8753SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 8754 MachineFunction &MF = DAG.getMachineFunction(); 8755 SDValue Chain = Op.getOperand(0); 8756 SDValue Offset = Op.getOperand(1); 8757 SDValue Handler = Op.getOperand(2); 8758 DebugLoc dl = Op.getDebugLoc(); 8759 8760 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 8761 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 8762 getPointerTy()); 8763 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 8764 8765 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 8766 DAG.getIntPtrConstant(TD->getPointerSize())); 8767 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 8768 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 8769 false, false, 0); 8770 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 8771 MF.getRegInfo().addLiveOut(StoreAddrReg); 8772 8773 return DAG.getNode(X86ISD::EH_RETURN, dl, 8774 MVT::Other, 8775 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 8776} 8777 8778SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 8779 SelectionDAG &DAG) const { 8780 SDValue Root = Op.getOperand(0); 8781 SDValue Trmp = Op.getOperand(1); // trampoline 8782 SDValue FPtr = Op.getOperand(2); // nested function 8783 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 8784 DebugLoc dl = Op.getDebugLoc(); 8785 8786 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8787 8788 if (Subtarget->is64Bit()) { 8789 SDValue OutChains[6]; 8790 8791 // Large code-model. 8792 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 8793 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 8794 8795 const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); 8796 const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); 8797 8798 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 8799 8800 // Load the pointer to the nested function into R11. 8801 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 8802 SDValue Addr = Trmp; 8803 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8804 Addr, MachinePointerInfo(TrmpAddr), 8805 false, false, 0); 8806 8807 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8808 DAG.getConstant(2, MVT::i64)); 8809 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 8810 MachinePointerInfo(TrmpAddr, 2), 8811 false, false, 2); 8812 8813 // Load the 'nest' parameter value into R10. 8814 // R10 is specified in X86CallingConv.td 8815 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 8816 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8817 DAG.getConstant(10, MVT::i64)); 8818 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8819 Addr, MachinePointerInfo(TrmpAddr, 10), 8820 false, false, 0); 8821 8822 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8823 DAG.getConstant(12, MVT::i64)); 8824 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 8825 MachinePointerInfo(TrmpAddr, 12), 8826 false, false, 2); 8827 8828 // Jump to the nested function. 8829 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
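    // At this point the 64-bit trampoline bytes spell:
    //    0: movabsq $<nested function>, %r11
    //   10: movabsq $<nest value>,      %r10
    //   20: jmpq    *%r11                (opcode and ModRM stored below)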
8830 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8831 DAG.getConstant(20, MVT::i64)); 8832 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8833 Addr, MachinePointerInfo(TrmpAddr, 20), 8834 false, false, 0); 8835 8836 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 8837 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8838 DAG.getConstant(22, MVT::i64)); 8839 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 8840 MachinePointerInfo(TrmpAddr, 22), 8841 false, false, 0); 8842 8843 SDValue Ops[] = 8844 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 8845 return DAG.getMergeValues(Ops, 2, dl); 8846 } else { 8847 const Function *Func = 8848 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 8849 CallingConv::ID CC = Func->getCallingConv(); 8850 unsigned NestReg; 8851 8852 switch (CC) { 8853 default: 8854 llvm_unreachable("Unsupported calling convention"); 8855 case CallingConv::C: 8856 case CallingConv::X86_StdCall: { 8857 // Pass 'nest' parameter in ECX. 8858 // Must be kept in sync with X86CallingConv.td 8859 NestReg = X86::ECX; 8860 8861 // Check that ECX wasn't needed by an 'inreg' parameter. 8862 FunctionType *FTy = Func->getFunctionType(); 8863 const AttrListPtr &Attrs = Func->getAttributes(); 8864 8865 if (!Attrs.isEmpty() && !Func->isVarArg()) { 8866 unsigned InRegCount = 0; 8867 unsigned Idx = 1; 8868 8869 for (FunctionType::param_iterator I = FTy->param_begin(), 8870 E = FTy->param_end(); I != E; ++I, ++Idx) 8871 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 8872 // FIXME: should only count parameters that are lowered to integers. 8873 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 8874 8875 if (InRegCount > 2) { 8876 report_fatal_error("Nest register in use - reduce number of inreg" 8877 " parameters!"); 8878 } 8879 } 8880 break; 8881 } 8882 case CallingConv::X86_FastCall: 8883 case CallingConv::X86_ThisCall: 8884 case CallingConv::Fast: 8885 // Pass 'nest' parameter in EAX. 8886 // Must be kept in sync with X86CallingConv.td 8887 NestReg = X86::EAX; 8888 break; 8889 } 8890 8891 SDValue OutChains[4]; 8892 SDValue Addr, Disp; 8893 8894 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8895 DAG.getConstant(10, MVT::i32)); 8896 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 8897 8898 // This is storing the opcode for MOV32ri. 8899 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 8900 const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); 8901 OutChains[0] = DAG.getStore(Root, dl, 8902 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 8903 Trmp, MachinePointerInfo(TrmpAddr), 8904 false, false, 0); 8905 8906 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8907 DAG.getConstant(1, MVT::i32)); 8908 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 8909 MachinePointerInfo(TrmpAddr, 1), 8910 false, false, 1); 8911 8912 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
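    // The resulting 10-byte 32-bit trampoline is:
    //   0: movl $<nest value>, %ecx or %eax  (per the calling convention)
    //   5: jmp  <nested function>            (rel32 = FPtr - (Trmp + 10))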
8913 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8914 DAG.getConstant(5, MVT::i32)); 8915 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 8916 MachinePointerInfo(TrmpAddr, 5), 8917 false, false, 1); 8918 8919 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8920 DAG.getConstant(6, MVT::i32)); 8921 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 8922 MachinePointerInfo(TrmpAddr, 6), 8923 false, false, 1); 8924 8925 SDValue Ops[] = 8926 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 8927 return DAG.getMergeValues(Ops, 2, dl); 8928 } 8929} 8930 8931SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 8932 SelectionDAG &DAG) const { 8933 /* 8934 The rounding mode is in bits 11:10 of FPSR, and has the following 8935 settings: 8936 00 Round to nearest 8937 01 Round to -inf 8938 10 Round to +inf 8939 11 Round to 0 8940 8941 FLT_ROUNDS, on the other hand, expects the following: 8942 -1 Undefined 8943 0 Round to 0 8944 1 Round to nearest 8945 2 Round to +inf 8946 3 Round to -inf 8947 8948 To perform the conversion, we do: 8949 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 8950 */ 8951 8952 MachineFunction &MF = DAG.getMachineFunction(); 8953 const TargetMachine &TM = MF.getTarget(); 8954 const TargetFrameLowering &TFI = *TM.getFrameLowering(); 8955 unsigned StackAlignment = TFI.getStackAlignment(); 8956 EVT VT = Op.getValueType(); 8957 DebugLoc DL = Op.getDebugLoc(); 8958 8959 // Save FP Control Word to stack slot 8960 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 8961 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8962 8963 8964 MachineMemOperand *MMO = 8965 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8966 MachineMemOperand::MOStore, 2, 2); 8967 8968 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 8969 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 8970 DAG.getVTList(MVT::Other), 8971 Ops, 2, MVT::i16, MMO); 8972 8973 // Load FP Control Word from stack slot 8974 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 8975 MachinePointerInfo(), false, false, 0); 8976 8977 // Transform as necessary 8978 SDValue CWD1 = 8979 DAG.getNode(ISD::SRL, DL, MVT::i16, 8980 DAG.getNode(ISD::AND, DL, MVT::i16, 8981 CWD, DAG.getConstant(0x800, MVT::i16)), 8982 DAG.getConstant(11, MVT::i8)); 8983 SDValue CWD2 = 8984 DAG.getNode(ISD::SRL, DL, MVT::i16, 8985 DAG.getNode(ISD::AND, DL, MVT::i16, 8986 CWD, DAG.getConstant(0x400, MVT::i16)), 8987 DAG.getConstant(9, MVT::i8)); 8988 8989 SDValue RetVal = 8990 DAG.getNode(ISD::AND, DL, MVT::i16, 8991 DAG.getNode(ISD::ADD, DL, MVT::i16, 8992 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 8993 DAG.getConstant(1, MVT::i16)), 8994 DAG.getConstant(3, MVT::i16)); 8995 8996 8997 return DAG.getNode((VT.getSizeInBits() < 16 ? 8998 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 8999} 9000 9001SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 9002 EVT VT = Op.getValueType(); 9003 EVT OpVT = VT; 9004 unsigned NumBits = VT.getSizeInBits(); 9005 DebugLoc dl = Op.getDebugLoc(); 9006 9007 Op = Op.getOperand(0); 9008 if (VT == MVT::i8) { 9009 // Zero extend to i32 since there is not an i8 bsr. 9010 OpVT = MVT::i32; 9011 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9012 } 9013 9014 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 9015 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9016 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 9017 9018 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 9019 SDValue Ops[] = { 9020 Op, 9021 DAG.getConstant(NumBits+NumBits-1, OpVT), 9022 DAG.getConstant(X86::COND_E, MVT::i8), 9023 Op.getValue(1) 9024 }; 9025 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 9026 9027 // Finally xor with NumBits-1. 9028 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 9029 9030 if (VT == MVT::i8) 9031 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9032 return Op; 9033} 9034 9035SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 9036 EVT VT = Op.getValueType(); 9037 EVT OpVT = VT; 9038 unsigned NumBits = VT.getSizeInBits(); 9039 DebugLoc dl = Op.getDebugLoc(); 9040 9041 Op = Op.getOperand(0); 9042 if (VT == MVT::i8) { 9043 OpVT = MVT::i32; 9044 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 9045 } 9046 9047 // Issue a bsf (scan bits forward) which also sets EFLAGS. 9048 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 9049 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 9050 9051 // If src is zero (i.e. bsf sets ZF), returns NumBits. 9052 SDValue Ops[] = { 9053 Op, 9054 DAG.getConstant(NumBits, OpVT), 9055 DAG.getConstant(X86::COND_E, MVT::i8), 9056 Op.getValue(1) 9057 }; 9058 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 9059 9060 if (VT == MVT::i8) 9061 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 9062 return Op; 9063} 9064 9065SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 9066 EVT VT = Op.getValueType(); 9067 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 9068 DebugLoc dl = Op.getDebugLoc(); 9069 9070 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 9071 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 9072 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 9073 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 9074 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 9075 // 9076 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 9077 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 9078 // return AloBlo + AloBhi + AhiBlo; 9079 9080 SDValue A = Op.getOperand(0); 9081 SDValue B = Op.getOperand(1); 9082 9083 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9084 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9085 A, DAG.getConstant(32, MVT::i32)); 9086 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9087 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9088 B, DAG.getConstant(32, MVT::i32)); 9089 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9090 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9091 A, B); 9092 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9093 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9094 A, Bhi); 9095 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9096 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 9097 Ahi, B); 9098 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9099 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9100 AloBhi, DAG.getConstant(32, MVT::i32)); 9101 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9102 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9103 AhiBlo, DAG.getConstant(32, MVT::i32)); 9104 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 9105 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 9106 return Res; 9107} 9108 9109SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 9110 9111 EVT VT = Op.getValueType(); 9112 DebugLoc dl = 
Op.getDebugLoc(); 9113 SDValue R = Op.getOperand(0); 9114 SDValue Amt = Op.getOperand(1); 9115 9116 LLVMContext *Context = DAG.getContext(); 9117 9118 // Must have SSE2. 9119 if (!Subtarget->hasSSE2()) return SDValue(); 9120 9121 // Optimize shl/srl/sra with constant shift amount. 9122 if (isSplatVector(Amt.getNode())) { 9123 SDValue SclrAmt = Amt->getOperand(0); 9124 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { 9125 uint64_t ShiftAmt = C->getZExtValue(); 9126 9127 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) 9128 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9129 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9130 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9131 9132 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) 9133 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9134 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9135 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9136 9137 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) 9138 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9139 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9140 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9141 9142 if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) 9143 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9144 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9145 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9146 9147 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) 9148 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9149 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9150 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9151 9152 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) 9153 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9154 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9155 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9156 9157 if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) 9158 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9159 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9160 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9161 9162 if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) 9163 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9164 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9165 R, DAG.getConstant(ShiftAmt, MVT::i32)); 9166 } 9167 } 9168 9169 // Lower SHL with variable shift amount. 9170 // Cannot lower SHL without SSE2 or later. 
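  // For v4i32 a variable shift is done without a native instruction:
  // x << n is computed as x * 2^n, where 2^n is built per lane by shifting
  // n into the float exponent field ((n << 23) + bitcast<i32>(1.0f)) and
  // converting back to integer (fp_to_sint).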
9171 if (!Subtarget->hasSSE2()) return SDValue(); 9172 9173 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { 9174 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9175 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9176 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 9177 9178 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 9179 9180 std::vector<Constant*> CV(4, CI); 9181 Constant *C = ConstantVector::get(CV); 9182 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9183 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9184 MachinePointerInfo::getConstantPool(), 9185 false, false, 16); 9186 9187 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 9188 Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); 9189 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 9190 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 9191 } 9192 if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { 9193 // a = a << 5; 9194 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9195 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9196 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 9197 9198 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 9199 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 9200 9201 std::vector<Constant*> CVM1(16, CM1); 9202 std::vector<Constant*> CVM2(16, CM2); 9203 Constant *C = ConstantVector::get(CVM1); 9204 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9205 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9206 MachinePointerInfo::getConstantPool(), 9207 false, false, 16); 9208 9209 // r = pblendv(r, psllw(r & (char16)15, 4), a); 9210 M = DAG.getNode(ISD::AND, dl, VT, R, M); 9211 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9212 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 9213 DAG.getConstant(4, MVT::i32)); 9214 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 9215 // a += a 9216 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 9217 9218 C = ConstantVector::get(CVM2); 9219 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 9220 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 9221 MachinePointerInfo::getConstantPool(), 9222 false, false, 16); 9223 9224 // r = pblendv(r, psllw(r & (char16)63, 2), a); 9225 M = DAG.getNode(ISD::AND, dl, VT, R, M); 9226 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9227 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 9228 DAG.getConstant(2, MVT::i32)); 9229 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); 9230 // a += a 9231 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 9232 9233 // return pblendv(r, r+r, a); 9234 R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, 9235 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 9236 return R; 9237 } 9238 return SDValue(); 9239} 9240 9241SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 9242 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 9243 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 9244 // looks for this combo and may remove the "setcc" instruction if the "setcc" 9245 // has only one use. 9246 SDNode *N = Op.getNode(); 9247 SDValue LHS = N->getOperand(0); 9248 SDValue RHS = N->getOperand(1); 9249 unsigned BaseOp = 0; 9250 unsigned Cond = 0; 9251 DebugLoc DL = Op.getDebugLoc(); 9252 switch (Op.getOpcode()) { 9253 default: llvm_unreachable("Unknown ovf instruction!"); 9254 case ISD::SADDO: 9255 // A subtract of one will be selected as a INC. 
Note that INC doesn't 9256 // set CF, so we can't do this for UADDO. 9257 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 9258 if (C->isOne()) { 9259 BaseOp = X86ISD::INC; 9260 Cond = X86::COND_O; 9261 break; 9262 } 9263 BaseOp = X86ISD::ADD; 9264 Cond = X86::COND_O; 9265 break; 9266 case ISD::UADDO: 9267 BaseOp = X86ISD::ADD; 9268 Cond = X86::COND_B; 9269 break; 9270 case ISD::SSUBO: 9271 // A subtract of one will be selected as a DEC. Note that DEC doesn't 9272 // set CF, so we can't do this for USUBO. 9273 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 9274 if (C->isOne()) { 9275 BaseOp = X86ISD::DEC; 9276 Cond = X86::COND_O; 9277 break; 9278 } 9279 BaseOp = X86ISD::SUB; 9280 Cond = X86::COND_O; 9281 break; 9282 case ISD::USUBO: 9283 BaseOp = X86ISD::SUB; 9284 Cond = X86::COND_B; 9285 break; 9286 case ISD::SMULO: 9287 BaseOp = X86ISD::SMUL; 9288 Cond = X86::COND_O; 9289 break; 9290 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs 9291 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), 9292 MVT::i32); 9293 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); 9294 9295 SDValue SetCC = 9296 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9297 DAG.getConstant(X86::COND_O, MVT::i32), 9298 SDValue(Sum.getNode(), 2)); 9299 9300 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 9301 } 9302 } 9303 9304 // Also sets EFLAGS. 9305 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 9306 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); 9307 9308 SDValue SetCC = 9309 DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), 9310 DAG.getConstant(Cond, MVT::i32), 9311 SDValue(Sum.getNode(), 1)); 9312 9313 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); 9314} 9315 9316SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ 9317 DebugLoc dl = Op.getDebugLoc(); 9318 SDNode* Node = Op.getNode(); 9319 EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); 9320 EVT VT = Node->getValueType(0); 9321 9322 if (Subtarget->hasSSE2() && VT.isVector()) { 9323 unsigned BitsDiff = VT.getScalarType().getSizeInBits() - 9324 ExtraVT.getScalarType().getSizeInBits(); 9325 SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); 9326 9327 unsigned SHLIntrinsicsID = 0; 9328 unsigned SRAIntrinsicsID = 0; 9329 switch (VT.getSimpleVT().SimpleTy) { 9330 default: 9331 return SDValue(); 9332 case MVT::v2i64: { 9333 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q; 9334 SRAIntrinsicsID = 0; 9335 break; 9336 } 9337 case MVT::v4i32: { 9338 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; 9339 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; 9340 break; 9341 } 9342 case MVT::v8i16: { 9343 SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w; 9344 SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w; 9345 break; 9346 } 9347 } 9348 9349 SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9350 DAG.getConstant(SHLIntrinsicsID, MVT::i32), 9351 Node->getOperand(0), ShAmt); 9352 9353 // In case of 1 bit sext, no need to shr 9354 if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1; 9355 9356 if (SRAIntrinsicsID) { 9357 Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 9358 DAG.getConstant(SRAIntrinsicsID, MVT::i32), 9359 Tmp1, ShAmt); 9360 } 9361 return Tmp1; 9362 } 9363 9364 return SDValue(); 9365} 9366 9367 9368SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 9369 DebugLoc dl = Op.getDebugLoc(); 9370 9371 // Go ahead and emit the fence on x86-64 even if we asked for 
no-sse2. 9372 // There isn't any reason to disable it if the target processor supports it. 9373 if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) { 9374 SDValue Chain = Op.getOperand(0); 9375 SDValue Zero = DAG.getConstant(0, MVT::i32); 9376 SDValue Ops[] = { 9377 DAG.getRegister(X86::ESP, MVT::i32), // Base 9378 DAG.getTargetConstant(1, MVT::i8), // Scale 9379 DAG.getRegister(0, MVT::i32), // Index 9380 DAG.getTargetConstant(0, MVT::i32), // Disp 9381 DAG.getRegister(0, MVT::i32), // Segment. 9382 Zero, 9383 Chain 9384 }; 9385 SDNode *Res = 9386 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 9387 array_lengthof(Ops)); 9388 return SDValue(Res, 0); 9389 } 9390 9391 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 9392 if (!isDev) 9393 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 9394 9395 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 9396 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 9397 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 9398 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 9399 9400 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 9401 if (!Op1 && !Op2 && !Op3 && Op4) 9402 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 9403 9404 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 9405 if (Op1 && !Op2 && !Op3 && !Op4) 9406 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 9407 9408 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 9409 // (MFENCE)>; 9410 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 9411} 9412 9413SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, 9414 SelectionDAG &DAG) const { 9415 DebugLoc dl = Op.getDebugLoc(); 9416 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( 9417 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); 9418 SynchronizationScope FenceScope = static_cast<SynchronizationScope>( 9419 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 9420 9421 // The only fence that needs an instruction is a sequentially-consistent 9422 // cross-thread fence. 9423 if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { 9424 // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for 9425 // no-sse2). There isn't any reason to disable it if the target processor 9426 // supports it. 9427 if (Subtarget->hasSSE2() || Subtarget->is64Bit()) 9428 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 9429 9430 SDValue Chain = Op.getOperand(0); 9431 SDValue Zero = DAG.getConstant(0, MVT::i32); 9432 SDValue Ops[] = { 9433 DAG.getRegister(X86::ESP, MVT::i32), // Base 9434 DAG.getTargetConstant(1, MVT::i8), // Scale 9435 DAG.getRegister(0, MVT::i32), // Index 9436 DAG.getTargetConstant(0, MVT::i32), // Disp 9437 DAG.getRegister(0, MVT::i32), // Segment. 9438 Zero, 9439 Chain 9440 }; 9441 SDNode *Res = 9442 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 9443 array_lengthof(Ops)); 9444 return SDValue(Res, 0); 9445 } 9446 9447 // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
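  // (A fence that is not both sequentially consistent and cross-thread only
  // has to keep the compiler/DAG from reordering memory operations across
  // it; x86's hardware memory ordering already covers the rest, so no
  // instruction is emitted.)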
9448 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 9449} 9450 9451 9452SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 9453 EVT T = Op.getValueType(); 9454 DebugLoc DL = Op.getDebugLoc(); 9455 unsigned Reg = 0; 9456 unsigned size = 0; 9457 switch(T.getSimpleVT().SimpleTy) { 9458 default: 9459 assert(false && "Invalid value type!"); 9460 case MVT::i8: Reg = X86::AL; size = 1; break; 9461 case MVT::i16: Reg = X86::AX; size = 2; break; 9462 case MVT::i32: Reg = X86::EAX; size = 4; break; 9463 case MVT::i64: 9464 assert(Subtarget->is64Bit() && "Node not type legal!"); 9465 Reg = X86::RAX; size = 8; 9466 break; 9467 } 9468 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 9469 Op.getOperand(2), SDValue()); 9470 SDValue Ops[] = { cpIn.getValue(0), 9471 Op.getOperand(1), 9472 Op.getOperand(3), 9473 DAG.getTargetConstant(size, MVT::i8), 9474 cpIn.getValue(1) }; 9475 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9476 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 9477 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 9478 Ops, 5, T, MMO); 9479 SDValue cpOut = 9480 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 9481 return cpOut; 9482} 9483 9484SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 9485 SelectionDAG &DAG) const { 9486 assert(Subtarget->is64Bit() && "Result not type legalized?"); 9487 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9488 SDValue TheChain = Op.getOperand(0); 9489 DebugLoc dl = Op.getDebugLoc(); 9490 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 9491 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 9492 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 9493 rax.getValue(2)); 9494 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 9495 DAG.getConstant(32, MVT::i8)); 9496 SDValue Ops[] = { 9497 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 9498 rdx.getValue(1) 9499 }; 9500 return DAG.getMergeValues(Ops, 2, dl); 9501} 9502 9503SDValue X86TargetLowering::LowerBITCAST(SDValue Op, 9504 SelectionDAG &DAG) const { 9505 EVT SrcVT = Op.getOperand(0).getValueType(); 9506 EVT DstVT = Op.getValueType(); 9507 assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && 9508 Subtarget->hasMMX() && "Unexpected custom BITCAST"); 9509 assert((DstVT == MVT::i64 || 9510 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 9511 "Unexpected custom BITCAST"); 9512 // i64 <=> MMX conversions are Legal. 9513 if (SrcVT==MVT::i64 && DstVT.isVector()) 9514 return Op; 9515 if (DstVT==MVT::i64 && SrcVT.isVector()) 9516 return Op; 9517 // MMX <=> MMX conversions are Legal. 9518 if (SrcVT.isVector() && DstVT.isVector()) 9519 return Op; 9520 // All other conversions need to be expanded. 
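  // (Returning an empty SDValue from a custom lowering hook tells the
  // legalizer to fall back to its default expansion, which for BITCAST is a
  // store/reload through a stack temporary.)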
9521 return SDValue(); 9522} 9523 9524SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 9525 SDNode *Node = Op.getNode(); 9526 DebugLoc dl = Node->getDebugLoc(); 9527 EVT T = Node->getValueType(0); 9528 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 9529 DAG.getConstant(0, T), Node->getOperand(2)); 9530 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 9531 cast<AtomicSDNode>(Node)->getMemoryVT(), 9532 Node->getOperand(0), 9533 Node->getOperand(1), negOp, 9534 cast<AtomicSDNode>(Node)->getSrcValue(), 9535 cast<AtomicSDNode>(Node)->getAlignment()); 9536} 9537 9538static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 9539 EVT VT = Op.getNode()->getValueType(0); 9540 9541 // Let legalize expand this if it isn't a legal type yet. 9542 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 9543 return SDValue(); 9544 9545 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 9546 9547 unsigned Opc; 9548 bool ExtraOp = false; 9549 switch (Op.getOpcode()) { 9550 default: assert(0 && "Invalid code"); 9551 case ISD::ADDC: Opc = X86ISD::ADD; break; 9552 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; 9553 case ISD::SUBC: Opc = X86ISD::SUB; break; 9554 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; 9555 } 9556 9557 if (!ExtraOp) 9558 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 9559 Op.getOperand(1)); 9560 return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), 9561 Op.getOperand(1), Op.getOperand(2)); 9562} 9563 9564/// LowerOperation - Provide custom lowering hooks for some operations. 9565/// 9566SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9567 switch (Op.getOpcode()) { 9568 default: llvm_unreachable("Should not custom lower this!"); 9569 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); 9570 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 9571 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); 9572 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 9573 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 9574 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 9575 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 9576 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 9577 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 9578 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9579 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 9580 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); 9581 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 9582 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9583 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9584 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9585 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 9586 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9587 case ISD::SHL_PARTS: 9588 case ISD::SRA_PARTS: 9589 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); 9590 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 9591 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 9592 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 9593 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 9594 case ISD::FABS: return LowerFABS(Op, DAG); 9595 case ISD::FNEG: return LowerFNEG(Op, DAG); 9596 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9597 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); 
9598 case ISD::SETCC: return LowerSETCC(Op, DAG); 9599 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 9600 case ISD::SELECT: return LowerSELECT(Op, DAG); 9601 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9602 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 9603 case ISD::VASTART: return LowerVASTART(Op, DAG); 9604 case ISD::VAARG: return LowerVAARG(Op, DAG); 9605 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 9606 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 9607 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 9608 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9609 case ISD::FRAME_TO_ARGS_OFFSET: 9610 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 9611 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 9612 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 9613 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 9614 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9615 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 9616 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 9617 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 9618 case ISD::SRA: 9619 case ISD::SRL: 9620 case ISD::SHL: return LowerShift(Op, DAG); 9621 case ISD::SADDO: 9622 case ISD::UADDO: 9623 case ISD::SSUBO: 9624 case ISD::USUBO: 9625 case ISD::SMULO: 9626 case ISD::UMULO: return LowerXALUO(Op, DAG); 9627 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 9628 case ISD::BITCAST: return LowerBITCAST(Op, DAG); 9629 case ISD::ADDC: 9630 case ISD::ADDE: 9631 case ISD::SUBC: 9632 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 9633 } 9634} 9635 9636void X86TargetLowering:: 9637ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 9638 SelectionDAG &DAG, unsigned NewOp) const { 9639 EVT T = Node->getValueType(0); 9640 DebugLoc dl = Node->getDebugLoc(); 9641 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 9642 9643 SDValue Chain = Node->getOperand(0); 9644 SDValue In1 = Node->getOperand(1); 9645 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9646 Node->getOperand(2), DAG.getIntPtrConstant(0)); 9647 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9648 Node->getOperand(2), DAG.getIntPtrConstant(1)); 9649 SDValue Ops[] = { Chain, In1, In2L, In2H }; 9650 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 9651 SDValue Result = 9652 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 9653 cast<MemSDNode>(Node)->getMemOperand()); 9654 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 9655 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 9656 Results.push_back(Result.getValue(2)); 9657} 9658 9659/// ReplaceNodeResults - Replace a node with an illegal result type 9660/// with a new node built out of custom code. 9661void X86TargetLowering::ReplaceNodeResults(SDNode *N, 9662 SmallVectorImpl<SDValue>&Results, 9663 SelectionDAG &DAG) const { 9664 DebugLoc dl = N->getDebugLoc(); 9665 switch (N->getOpcode()) { 9666 default: 9667 assert(false && "Do not know how to custom type legalize this operation!"); 9668 return; 9669 case ISD::SIGN_EXTEND_INREG: 9670 case ISD::ADDC: 9671 case ISD::ADDE: 9672 case ISD::SUBC: 9673 case ISD::SUBE: 9674 // We don't want to expand or promote these. 
9675 return; 9676 case ISD::FP_TO_SINT: { 9677 std::pair<SDValue,SDValue> Vals = 9678 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 9679 SDValue FIST = Vals.first, StackSlot = Vals.second; 9680 if (FIST.getNode() != 0) { 9681 EVT VT = N->getValueType(0); 9682 // Return a load from the stack slot. 9683 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 9684 MachinePointerInfo(), false, false, 0)); 9685 } 9686 return; 9687 } 9688 case ISD::READCYCLECOUNTER: { 9689 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9690 SDValue TheChain = N->getOperand(0); 9691 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 9692 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 9693 rd.getValue(1)); 9694 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 9695 eax.getValue(2)); 9696 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 9697 SDValue Ops[] = { eax, edx }; 9698 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 9699 Results.push_back(edx.getValue(1)); 9700 return; 9701 } 9702 case ISD::ATOMIC_CMP_SWAP: { 9703 EVT T = N->getValueType(0); 9704 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 9705 SDValue cpInL, cpInH; 9706 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 9707 DAG.getConstant(0, MVT::i32)); 9708 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 9709 DAG.getConstant(1, MVT::i32)); 9710 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 9711 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 9712 cpInL.getValue(1)); 9713 SDValue swapInL, swapInH; 9714 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 9715 DAG.getConstant(0, MVT::i32)); 9716 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 9717 DAG.getConstant(1, MVT::i32)); 9718 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 9719 cpInH.getValue(1)); 9720 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 9721 swapInL.getValue(1)); 9722 SDValue Ops[] = { swapInH.getValue(0), 9723 N->getOperand(1), 9724 swapInH.getValue(1) }; 9725 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); 9726 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 9727 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, 9728 Ops, 3, T, MMO); 9729 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 9730 MVT::i32, Result.getValue(1)); 9731 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 9732 MVT::i32, cpOutL.getValue(2)); 9733 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 9734 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 9735 Results.push_back(cpOutH.getValue(1)); 9736 return; 9737 } 9738 case ISD::ATOMIC_LOAD_ADD: 9739 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 9740 return; 9741 case ISD::ATOMIC_LOAD_AND: 9742 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 9743 return; 9744 case ISD::ATOMIC_LOAD_NAND: 9745 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 9746 return; 9747 case ISD::ATOMIC_LOAD_OR: 9748 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 9749 return; 9750 case ISD::ATOMIC_LOAD_SUB: 9751 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 9752 return; 9753 case ISD::ATOMIC_LOAD_XOR: 9754 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 9755 return; 
9756 case ISD::ATOMIC_SWAP: 9757 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 9758 return; 9759 } 9760} 9761 9762const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 9763 switch (Opcode) { 9764 default: return NULL; 9765 case X86ISD::BSF: return "X86ISD::BSF"; 9766 case X86ISD::BSR: return "X86ISD::BSR"; 9767 case X86ISD::SHLD: return "X86ISD::SHLD"; 9768 case X86ISD::SHRD: return "X86ISD::SHRD"; 9769 case X86ISD::FAND: return "X86ISD::FAND"; 9770 case X86ISD::FOR: return "X86ISD::FOR"; 9771 case X86ISD::FXOR: return "X86ISD::FXOR"; 9772 case X86ISD::FSRL: return "X86ISD::FSRL"; 9773 case X86ISD::FILD: return "X86ISD::FILD"; 9774 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 9775 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 9776 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 9777 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 9778 case X86ISD::FLD: return "X86ISD::FLD"; 9779 case X86ISD::FST: return "X86ISD::FST"; 9780 case X86ISD::CALL: return "X86ISD::CALL"; 9781 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 9782 case X86ISD::BT: return "X86ISD::BT"; 9783 case X86ISD::CMP: return "X86ISD::CMP"; 9784 case X86ISD::COMI: return "X86ISD::COMI"; 9785 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 9786 case X86ISD::SETCC: return "X86ISD::SETCC"; 9787 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 9788 case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; 9789 case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; 9790 case X86ISD::CMOV: return "X86ISD::CMOV"; 9791 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 9792 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 9793 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 9794 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 9795 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 9796 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 9797 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 9798 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 9799 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 9800 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 9801 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 9802 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 9803 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 9804 case X86ISD::ANDNP: return "X86ISD::ANDNP"; 9805 case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; 9806 case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; 9807 case X86ISD::PSIGND: return "X86ISD::PSIGND"; 9808 case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB"; 9809 case X86ISD::FMAX: return "X86ISD::FMAX"; 9810 case X86ISD::FMIN: return "X86ISD::FMIN"; 9811 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 9812 case X86ISD::FRCP: return "X86ISD::FRCP"; 9813 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 9814 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 9815 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 9816 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 9817 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 9818 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 9819 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 9820 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 9821 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 9822 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 9823 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 9824 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 9825 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 9826 
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 9827 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 9828 case X86ISD::VSHL: return "X86ISD::VSHL"; 9829 case X86ISD::VSRL: return "X86ISD::VSRL"; 9830 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 9831 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 9832 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 9833 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 9834 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 9835 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 9836 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 9837 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 9838 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 9839 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 9840 case X86ISD::ADD: return "X86ISD::ADD"; 9841 case X86ISD::SUB: return "X86ISD::SUB"; 9842 case X86ISD::ADC: return "X86ISD::ADC"; 9843 case X86ISD::SBB: return "X86ISD::SBB"; 9844 case X86ISD::SMUL: return "X86ISD::SMUL"; 9845 case X86ISD::UMUL: return "X86ISD::UMUL"; 9846 case X86ISD::INC: return "X86ISD::INC"; 9847 case X86ISD::DEC: return "X86ISD::DEC"; 9848 case X86ISD::OR: return "X86ISD::OR"; 9849 case X86ISD::XOR: return "X86ISD::XOR"; 9850 case X86ISD::AND: return "X86ISD::AND"; 9851 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 9852 case X86ISD::PTEST: return "X86ISD::PTEST"; 9853 case X86ISD::TESTP: return "X86ISD::TESTP"; 9854 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 9855 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 9856 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 9857 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 9858 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 9859 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 9860 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 9861 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 9862 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 9863 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 9864 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 9865 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 9866 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 9867 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 9868 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 9869 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 9870 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 9871 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 9872 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 9873 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 9874 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 9875 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 9876 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 9877 case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; 9878 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 9879 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 9880 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 9881 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 9882 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 9883 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 9884 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 9885 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 9886 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 9887 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 9888 case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; 9889 case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; 9890 case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; 9891 case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; 9892 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 
9893 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 9894 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 9895 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; 9896 } 9897} 9898 9899// isLegalAddressingMode - Return true if the addressing mode represented 9900// by AM is legal for this target, for a load/store of the specified type. 9901bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 9902 Type *Ty) const { 9903 // X86 supports extremely general addressing modes. 9904 CodeModel::Model M = getTargetMachine().getCodeModel(); 9905 Reloc::Model R = getTargetMachine().getRelocationModel(); 9906 9907 // X86 allows a sign-extended 32-bit immediate field as a displacement. 9908 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 9909 return false; 9910 9911 if (AM.BaseGV) { 9912 unsigned GVFlags = 9913 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 9914 9915 // If a reference to this global requires an extra load, we can't fold it. 9916 if (isGlobalStubReference(GVFlags)) 9917 return false; 9918 9919 // If BaseGV requires a register for the PIC base, we cannot also have a 9920 // BaseReg specified. 9921 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 9922 return false; 9923 9924 // If lower 4G is not available, then we must use rip-relative addressing. 9925 if ((M != CodeModel::Small || R != Reloc::Static) && 9926 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 9927 return false; 9928 } 9929 9930 switch (AM.Scale) { 9931 case 0: 9932 case 1: 9933 case 2: 9934 case 4: 9935 case 8: 9936 // These scales always work. 9937 break; 9938 case 3: 9939 case 5: 9940 case 9: 9941 // These scales are formed with basereg+scalereg. Only accept if there is 9942 // no basereg yet. 9943 if (AM.HasBaseReg) 9944 return false; 9945 break; 9946 default: // Other stuff never works. 9947 return false; 9948 } 9949 9950 return true; 9951} 9952 9953 9954bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { 9955 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 9956 return false; 9957 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 9958 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 9959 if (NumBits1 <= NumBits2) 9960 return false; 9961 return true; 9962} 9963 9964bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 9965 if (!VT1.isInteger() || !VT2.isInteger()) 9966 return false; 9967 unsigned NumBits1 = VT1.getSizeInBits(); 9968 unsigned NumBits2 = VT2.getSizeInBits(); 9969 if (NumBits1 <= NumBits2) 9970 return false; 9971 return true; 9972} 9973 9974bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { 9975 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 9976 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 9977} 9978 9979bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 9980 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 9981 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 9982} 9983 9984bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 9985 // i16 instructions are longer (0x66 prefix) and potentially slower. 9986 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 9987} 9988 9989/// isShuffleMaskLegal - Targets can use this to indicate that they only 9990/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 9991/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 9992/// are assumed to be legal. 
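/// For example, a v4i32 mask such as <2,3,0,1> is reported legal because it
/// matches PSHUFD, while an arbitrary v16i8 permutation is still rejected
/// here even though pshufb could perform it (see the FIXME below).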
9993bool 9994X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 9995 EVT VT) const { 9996 // Very little shuffling can be done for 64-bit vectors right now. 9997 if (VT.getSizeInBits() == 64) 9998 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 9999 10000 // FIXME: pshufb, blends, shifts. 10001 return (VT.getVectorNumElements() == 2 || 10002 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 10003 isMOVLMask(M, VT) || 10004 isSHUFPMask(M, VT) || 10005 isPSHUFDMask(M, VT) || 10006 isPSHUFHWMask(M, VT) || 10007 isPSHUFLWMask(M, VT) || 10008 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 10009 isUNPCKLMask(M, VT) || 10010 isUNPCKHMask(M, VT) || 10011 isUNPCKL_v_undef_Mask(M, VT) || 10012 isUNPCKH_v_undef_Mask(M, VT)); 10013} 10014 10015bool 10016X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 10017 EVT VT) const { 10018 unsigned NumElts = VT.getVectorNumElements(); 10019 // FIXME: This collection of masks seems suspect. 10020 if (NumElts == 2) 10021 return true; 10022 if (NumElts == 4 && VT.getSizeInBits() == 128) { 10023 return (isMOVLMask(Mask, VT) || 10024 isCommutedMOVLMask(Mask, VT, true) || 10025 isSHUFPMask(Mask, VT) || 10026 isCommutedSHUFPMask(Mask, VT)); 10027 } 10028 return false; 10029} 10030 10031//===----------------------------------------------------------------------===// 10032// X86 Scheduler Hooks 10033//===----------------------------------------------------------------------===// 10034 10035// private utility function 10036MachineBasicBlock * 10037X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 10038 MachineBasicBlock *MBB, 10039 unsigned regOpc, 10040 unsigned immOpc, 10041 unsigned LoadOpc, 10042 unsigned CXchgOpc, 10043 unsigned notOpc, 10044 unsigned EAXreg, 10045 TargetRegisterClass *RC, 10046 bool invSrc) const { 10047 // For the atomic bitwise operator, we generate 10048 // thisMBB: 10049 // newMBB: 10050 // ld t1 = [bitinstr.addr] 10051 // op t2 = t1, [bitinstr.val] 10052 // mov EAX = t1 10053 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 10054 // bz newMBB 10055 // fallthrough -->nextMBB 10056 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10057 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10058 MachineFunction::iterator MBBIter = MBB; 10059 ++MBBIter; 10060 10061 /// First build the CFG 10062 MachineFunction *F = MBB->getParent(); 10063 MachineBasicBlock *thisMBB = MBB; 10064 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10065 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10066 F->insert(MBBIter, newMBB); 10067 F->insert(MBBIter, nextMBB); 10068 10069 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
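  // (Everything after the pseudo instruction is moved into nextMBB, and
  // nextMBB inherits thisMBB's successor list so PHIs in those successors
  // keep referring to the correct predecessor block.)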
10070 nextMBB->splice(nextMBB->begin(), thisMBB, 10071 llvm::next(MachineBasicBlock::iterator(bInstr)), 10072 thisMBB->end()); 10073 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10074 10075 // Update thisMBB to fall through to newMBB 10076 thisMBB->addSuccessor(newMBB); 10077 10078 // newMBB jumps to itself and fall through to nextMBB 10079 newMBB->addSuccessor(nextMBB); 10080 newMBB->addSuccessor(newMBB); 10081 10082 // Insert instructions into newMBB based on incoming instruction 10083 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 10084 "unexpected number of operands"); 10085 DebugLoc dl = bInstr->getDebugLoc(); 10086 MachineOperand& destOper = bInstr->getOperand(0); 10087 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10088 int numArgs = bInstr->getNumOperands() - 1; 10089 for (int i=0; i < numArgs; ++i) 10090 argOpers[i] = &bInstr->getOperand(i+1); 10091 10092 // x86 address has 4 operands: base, index, scale, and displacement 10093 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10094 int valArgIndx = lastAddrIndx + 1; 10095 10096 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 10097 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 10098 for (int i=0; i <= lastAddrIndx; ++i) 10099 (*MIB).addOperand(*argOpers[i]); 10100 10101 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 10102 if (invSrc) { 10103 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 10104 } 10105 else 10106 tt = t1; 10107 10108 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 10109 assert((argOpers[valArgIndx]->isReg() || 10110 argOpers[valArgIndx]->isImm()) && 10111 "invalid operand"); 10112 if (argOpers[valArgIndx]->isReg()) 10113 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 10114 else 10115 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 10116 MIB.addReg(tt); 10117 (*MIB).addOperand(*argOpers[valArgIndx]); 10118 10119 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 10120 MIB.addReg(t1); 10121 10122 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 10123 for (int i=0; i <= lastAddrIndx; ++i) 10124 (*MIB).addOperand(*argOpers[i]); 10125 MIB.addReg(t2); 10126 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10127 (*MIB).setMemRefs(bInstr->memoperands_begin(), 10128 bInstr->memoperands_end()); 10129 10130 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 10131 MIB.addReg(EAXreg); 10132 10133 // insert branch 10134 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10135 10136 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 10137 return nextMBB; 10138} 10139 10140// private utility function: 64 bit atomics on 32 bit host. 
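// A 32-bit target has no 64-bit integer load/op/store that can be made
// atomic, so the operation is emitted as a LOCK CMPXCHG8B retry loop: the
// expected value lives in EDX:EAX, the proposed new value in ECX:EBX, and
// the loop repeats until the compare-exchange succeeds.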
10141MachineBasicBlock * 10142X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 10143 MachineBasicBlock *MBB, 10144 unsigned regOpcL, 10145 unsigned regOpcH, 10146 unsigned immOpcL, 10147 unsigned immOpcH, 10148 bool invSrc) const { 10149 // For the atomic bitwise operator, we generate 10150 // thisMBB (instructions are in pairs, except cmpxchg8b) 10151 // ld t1,t2 = [bitinstr.addr] 10152 // newMBB: 10153 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 10154 // op t5, t6 <- out1, out2, [bitinstr.val] 10155 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 10156 // mov ECX, EBX <- t5, t6 10157 // mov EAX, EDX <- t1, t2 10158 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 10159 // mov t3, t4 <- EAX, EDX 10160 // bz newMBB 10161 // result in out1, out2 10162 // fallthrough -->nextMBB 10163 10164 const TargetRegisterClass *RC = X86::GR32RegisterClass; 10165 const unsigned LoadOpc = X86::MOV32rm; 10166 const unsigned NotOpc = X86::NOT32r; 10167 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10168 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10169 MachineFunction::iterator MBBIter = MBB; 10170 ++MBBIter; 10171 10172 /// First build the CFG 10173 MachineFunction *F = MBB->getParent(); 10174 MachineBasicBlock *thisMBB = MBB; 10175 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10176 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10177 F->insert(MBBIter, newMBB); 10178 F->insert(MBBIter, nextMBB); 10179 10180 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 10181 nextMBB->splice(nextMBB->begin(), thisMBB, 10182 llvm::next(MachineBasicBlock::iterator(bInstr)), 10183 thisMBB->end()); 10184 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10185 10186 // Update thisMBB to fall through to newMBB 10187 thisMBB->addSuccessor(newMBB); 10188 10189 // newMBB jumps to itself and fall through to nextMBB 10190 newMBB->addSuccessor(nextMBB); 10191 newMBB->addSuccessor(newMBB); 10192 10193 DebugLoc dl = bInstr->getDebugLoc(); 10194 // Insert instructions into newMBB based on incoming instruction 10195 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 10196 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 10197 "unexpected number of operands"); 10198 MachineOperand& dest1Oper = bInstr->getOperand(0); 10199 MachineOperand& dest2Oper = bInstr->getOperand(1); 10200 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10201 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 10202 argOpers[i] = &bInstr->getOperand(i+2); 10203 10204 // We use some of the operands multiple times, so conservatively just 10205 // clear any kill flags that might be present. 10206 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 10207 argOpers[i]->setIsKill(false); 10208 } 10209 10210 // x86 address has 5 operands: base, index, scale, displacement, and segment. 10211 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10212 10213 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 10214 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 10215 for (int i=0; i <= lastAddrIndx; ++i) 10216 (*MIB).addOperand(*argOpers[i]); 10217 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 10218 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 10219 // add 4 to displacement. 
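  // (The second MOV32rm reuses the same base/scale/index/segment operands
  // and only bumps the displacement by 4, so it loads the high 32 bits of
  // the little-endian 64-bit value.)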
10220 for (int i=0; i <= lastAddrIndx-2; ++i) 10221 (*MIB).addOperand(*argOpers[i]); 10222 MachineOperand newOp3 = *(argOpers[3]); 10223 if (newOp3.isImm()) 10224 newOp3.setImm(newOp3.getImm()+4); 10225 else 10226 newOp3.setOffset(newOp3.getOffset()+4); 10227 (*MIB).addOperand(newOp3); 10228 (*MIB).addOperand(*argOpers[lastAddrIndx]); 10229 10230 // t3/4 are defined later, at the bottom of the loop 10231 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 10232 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 10233 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 10234 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 10235 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 10236 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 10237 10238 // The subsequent operations should be using the destination registers of 10239 //the PHI instructions. 10240 if (invSrc) { 10241 t1 = F->getRegInfo().createVirtualRegister(RC); 10242 t2 = F->getRegInfo().createVirtualRegister(RC); 10243 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 10244 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 10245 } else { 10246 t1 = dest1Oper.getReg(); 10247 t2 = dest2Oper.getReg(); 10248 } 10249 10250 int valArgIndx = lastAddrIndx + 1; 10251 assert((argOpers[valArgIndx]->isReg() || 10252 argOpers[valArgIndx]->isImm()) && 10253 "invalid operand"); 10254 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 10255 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 10256 if (argOpers[valArgIndx]->isReg()) 10257 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 10258 else 10259 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 10260 if (regOpcL != X86::MOV32rr) 10261 MIB.addReg(t1); 10262 (*MIB).addOperand(*argOpers[valArgIndx]); 10263 assert(argOpers[valArgIndx + 1]->isReg() == 10264 argOpers[valArgIndx]->isReg()); 10265 assert(argOpers[valArgIndx + 1]->isImm() == 10266 argOpers[valArgIndx]->isImm()); 10267 if (argOpers[valArgIndx + 1]->isReg()) 10268 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 10269 else 10270 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 10271 if (regOpcH != X86::MOV32rr) 10272 MIB.addReg(t2); 10273 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 10274 10275 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 10276 MIB.addReg(t1); 10277 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 10278 MIB.addReg(t2); 10279 10280 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 10281 MIB.addReg(t5); 10282 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 10283 MIB.addReg(t6); 10284 10285 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 10286 for (int i=0; i <= lastAddrIndx; ++i) 10287 (*MIB).addOperand(*argOpers[i]); 10288 10289 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10290 (*MIB).setMemRefs(bInstr->memoperands_begin(), 10291 bInstr->memoperands_end()); 10292 10293 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 10294 MIB.addReg(X86::EAX); 10295 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 10296 MIB.addReg(X86::EDX); 10297 10298 // insert branch 10299 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10300 10301 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
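  // Roughly, for an i64 atomic AND the emitted machine code looks like
  // (register assignments illustrative):
  //   mov eax, [addr]        ; mov edx, [addr+4]
  // retry:
  //   mov ebx, eax ; and ebx, val_lo
  //   mov ecx, edx ; and ecx, val_hi
  //   lock cmpxchg8b [addr]  ; on failure EDX:EAX is reloaded from memory
  //   jne retry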
10302 return nextMBB; 10303} 10304 10305// private utility function 10306MachineBasicBlock * 10307X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 10308 MachineBasicBlock *MBB, 10309 unsigned cmovOpc) const { 10310 // For the atomic min/max operator, we generate 10311 // thisMBB: 10312 // newMBB: 10313 // ld t1 = [min/max.addr] 10314 // mov t2 = [min/max.val] 10315 // cmp t1, t2 10316 // cmov[cond] t2 = t1 10317 // mov EAX = t1 10318 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 10319 // bz newMBB 10320 // fallthrough -->nextMBB 10321 // 10322 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10323 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10324 MachineFunction::iterator MBBIter = MBB; 10325 ++MBBIter; 10326 10327 /// First build the CFG 10328 MachineFunction *F = MBB->getParent(); 10329 MachineBasicBlock *thisMBB = MBB; 10330 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 10331 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 10332 F->insert(MBBIter, newMBB); 10333 F->insert(MBBIter, nextMBB); 10334 10335 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 10336 nextMBB->splice(nextMBB->begin(), thisMBB, 10337 llvm::next(MachineBasicBlock::iterator(mInstr)), 10338 thisMBB->end()); 10339 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10340 10341 // Update thisMBB to fall through to newMBB 10342 thisMBB->addSuccessor(newMBB); 10343 10344 // newMBB jumps to newMBB and fall through to nextMBB 10345 newMBB->addSuccessor(nextMBB); 10346 newMBB->addSuccessor(newMBB); 10347 10348 DebugLoc dl = mInstr->getDebugLoc(); 10349 // Insert instructions into newMBB based on incoming instruction 10350 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 10351 "unexpected number of operands"); 10352 MachineOperand& destOper = mInstr->getOperand(0); 10353 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 10354 int numArgs = mInstr->getNumOperands() - 1; 10355 for (int i=0; i < numArgs; ++i) 10356 argOpers[i] = &mInstr->getOperand(i+1); 10357 10358 // x86 address has 4 operands: base, index, scale, and displacement 10359 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 10360 int valArgIndx = lastAddrIndx + 1; 10361 10362 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 10363 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 10364 for (int i=0; i <= lastAddrIndx; ++i) 10365 (*MIB).addOperand(*argOpers[i]); 10366 10367 // We only support register and immediate values 10368 assert((argOpers[valArgIndx]->isReg() || 10369 argOpers[valArgIndx]->isImm()) && 10370 "invalid operand"); 10371 10372 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 10373 if (argOpers[valArgIndx]->isReg()) 10374 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 10375 else 10376 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 10377 (*MIB).addOperand(*argOpers[valArgIndx]); 10378 10379 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 10380 MIB.addReg(t1); 10381 10382 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 10383 MIB.addReg(t1); 10384 MIB.addReg(t2); 10385 10386 // Generate movc 10387 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 10388 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 10389 MIB.addReg(t2); 10390 MIB.addReg(t1); 10391 10392 // Cmp and exchange if none has modified the memory location 10393 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 
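  // (CMPXCHG compares [addr] with EAX and, when the compare fails, reloads
  // EAX with the current memory value, so the JNE below simply retries with
  // the freshly observed value.)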
10394 for (int i=0; i <= lastAddrIndx; ++i) 10395 (*MIB).addOperand(*argOpers[i]); 10396 MIB.addReg(t3); 10397 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 10398 (*MIB).setMemRefs(mInstr->memoperands_begin(), 10399 mInstr->memoperands_end()); 10400 10401 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 10402 MIB.addReg(X86::EAX); 10403 10404 // insert branch 10405 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 10406 10407 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 10408 return nextMBB; 10409} 10410 10411// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 10412// or XMM0_V32I8 in AVX all of this code can be replaced with that 10413// in the .td file. 10414MachineBasicBlock * 10415X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 10416 unsigned numArgs, bool memArg) const { 10417 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 10418 "Target must have SSE4.2 or AVX features enabled"); 10419 10420 DebugLoc dl = MI->getDebugLoc(); 10421 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10422 unsigned Opc; 10423 if (!Subtarget->hasAVX()) { 10424 if (memArg) 10425 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 10426 else 10427 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 10428 } else { 10429 if (memArg) 10430 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 10431 else 10432 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 10433 } 10434 10435 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); 10436 for (unsigned i = 0; i < numArgs; ++i) { 10437 MachineOperand &Op = MI->getOperand(i+1); 10438 if (!(Op.isReg() && Op.isImplicit())) 10439 MIB.addOperand(Op); 10440 } 10441 BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 10442 .addReg(X86::XMM0); 10443 10444 MI->eraseFromParent(); 10445 return BB; 10446} 10447 10448MachineBasicBlock * 10449X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { 10450 DebugLoc dl = MI->getDebugLoc(); 10451 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10452 10453 // Address into RAX/EAX, other two args into ECX, EDX. 10454 unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; 10455 unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 10456 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); 10457 for (int i = 0; i < X86::AddrNumOperands; ++i) 10458 MIB.addOperand(MI->getOperand(i)); 10459 10460 unsigned ValOps = X86::AddrNumOperands; 10461 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 10462 .addReg(MI->getOperand(ValOps).getReg()); 10463 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) 10464 .addReg(MI->getOperand(ValOps+1).getReg()); 10465 10466 // The instruction doesn't actually take any operands though. 10467 BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); 10468 10469 MI->eraseFromParent(); // The pseudo is gone now. 10470 return BB; 10471} 10472 10473MachineBasicBlock * 10474X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { 10475 DebugLoc dl = MI->getDebugLoc(); 10476 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10477 10478 // First arg in ECX, the second in EAX. 
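  // (MWAIT takes its extensions and hints implicitly in ECX and EAX; the
  // pseudo's two register operands are just copied into place.)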
10479 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) 10480 .addReg(MI->getOperand(0).getReg()); 10481 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) 10482 .addReg(MI->getOperand(1).getReg()); 10483 10484 // The instruction doesn't actually take any operands though. 10485 BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); 10486 10487 MI->eraseFromParent(); // The pseudo is gone now. 10488 return BB; 10489} 10490 10491MachineBasicBlock * 10492X86TargetLowering::EmitVAARG64WithCustomInserter( 10493 MachineInstr *MI, 10494 MachineBasicBlock *MBB) const { 10495 // Emit va_arg instruction on X86-64. 10496 10497 // Operands to this pseudo-instruction: 10498 // 0 ) Output : destination address (reg) 10499 // 1-5) Input : va_list address (addr, i64mem) 10500 // 6 ) ArgSize : Size (in bytes) of vararg type 10501 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 10502 // 8 ) Align : Alignment of type 10503 // 9 ) EFLAGS (implicit-def) 10504 10505 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 10506 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 10507 10508 unsigned DestReg = MI->getOperand(0).getReg(); 10509 MachineOperand &Base = MI->getOperand(1); 10510 MachineOperand &Scale = MI->getOperand(2); 10511 MachineOperand &Index = MI->getOperand(3); 10512 MachineOperand &Disp = MI->getOperand(4); 10513 MachineOperand &Segment = MI->getOperand(5); 10514 unsigned ArgSize = MI->getOperand(6).getImm(); 10515 unsigned ArgMode = MI->getOperand(7).getImm(); 10516 unsigned Align = MI->getOperand(8).getImm(); 10517 10518 // Memory Reference 10519 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 10520 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 10521 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 10522 10523 // Machine Information 10524 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10525 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 10526 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 10527 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 10528 DebugLoc DL = MI->getDebugLoc(); 10529 10530 // struct va_list { 10531 // i32 gp_offset 10532 // i32 fp_offset 10533 // i64 overflow_area (address) 10534 // i64 reg_save_area (address) 10535 // } 10536 // sizeof(va_list) = 24 10537 // alignment(va_list) = 8 10538 10539 unsigned TotalNumIntRegs = 6; 10540 unsigned TotalNumXMMRegs = 8; 10541 bool UseGPOffset = (ArgMode == 1); 10542 bool UseFPOffset = (ArgMode == 2); 10543 unsigned MaxOffset = TotalNumIntRegs * 8 + 10544 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 10545 10546 /* Align ArgSize to a multiple of 8 */ 10547 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 10548 bool NeedsAlign = (Align > 8); 10549 10550 MachineBasicBlock *thisMBB = MBB; 10551 MachineBasicBlock *overflowMBB; 10552 MachineBasicBlock *offsetMBB; 10553 MachineBasicBlock *endMBB; 10554 10555 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 10556 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 10557 unsigned OffsetReg = 0; 10558 10559 if (!UseGPOffset && !UseFPOffset) { 10560 // If we only pull from the overflow region, we don't create a branch. 10561 // We don't need to alter control flow. 
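    // (This is the ArgMode == 0 case: the argument is always passed in
    // memory, e.g. because it does not fit in registers, so gp_offset and
    // fp_offset never come into play.)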
10562 OffsetDestReg = 0; // unused 10563 OverflowDestReg = DestReg; 10564 10565 offsetMBB = NULL; 10566 overflowMBB = thisMBB; 10567 endMBB = thisMBB; 10568 } else { 10569 // First emit code to check if gp_offset (or fp_offset) is below the bound. 10570 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 10571 // If not, pull from overflow_area. (branch to overflowMBB) 10572 // 10573 // thisMBB 10574 // | . 10575 // | . 10576 // offsetMBB overflowMBB 10577 // | . 10578 // | . 10579 // endMBB 10580 10581 // Registers for the PHI in endMBB 10582 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 10583 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 10584 10585 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10586 MachineFunction *MF = MBB->getParent(); 10587 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10588 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10589 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10590 10591 MachineFunction::iterator MBBIter = MBB; 10592 ++MBBIter; 10593 10594 // Insert the new basic blocks 10595 MF->insert(MBBIter, offsetMBB); 10596 MF->insert(MBBIter, overflowMBB); 10597 MF->insert(MBBIter, endMBB); 10598 10599 // Transfer the remainder of MBB and its successor edges to endMBB. 10600 endMBB->splice(endMBB->begin(), thisMBB, 10601 llvm::next(MachineBasicBlock::iterator(MI)), 10602 thisMBB->end()); 10603 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 10604 10605 // Make offsetMBB and overflowMBB successors of thisMBB 10606 thisMBB->addSuccessor(offsetMBB); 10607 thisMBB->addSuccessor(overflowMBB); 10608 10609 // endMBB is a successor of both offsetMBB and overflowMBB 10610 offsetMBB->addSuccessor(endMBB); 10611 overflowMBB->addSuccessor(endMBB); 10612 10613 // Load the offset value into a register 10614 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 10615 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 10616 .addOperand(Base) 10617 .addOperand(Scale) 10618 .addOperand(Index) 10619 .addDisp(Disp, UseFPOffset ? 4 : 0) 10620 .addOperand(Segment) 10621 .setMemRefs(MMOBegin, MMOEnd); 10622 10623 // Check if there is enough room left to pull this argument. 10624 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 10625 .addReg(OffsetReg) 10626 .addImm(MaxOffset + 8 - ArgSizeA8); 10627 10628 // Branch to "overflowMBB" if offset >= max 10629 // Fall through to "offsetMBB" otherwise 10630 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 10631 .addMBB(overflowMBB); 10632 } 10633 10634 // In offsetMBB, emit code to use the reg_save_area. 10635 if (offsetMBB) { 10636 assert(OffsetReg != 0); 10637 10638 // Read the reg_save_area address. 10639 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 10640 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 10641 .addOperand(Base) 10642 .addOperand(Scale) 10643 .addOperand(Index) 10644 .addDisp(Disp, 16) 10645 .addOperand(Segment) 10646 .setMemRefs(MMOBegin, MMOEnd); 10647 10648 // Zero-extend the offset 10649 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 10650 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 10651 .addImm(0) 10652 .addReg(OffsetReg) 10653 .addImm(X86::sub_32bit); 10654 10655 // Add the offset to the reg_save_area to get the final address. 
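    // (final address = reg_save_area + gp_offset for integer arguments, or
    // reg_save_area + fp_offset for FP/vector arguments.)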
10656 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 10657 .addReg(OffsetReg64) 10658 .addReg(RegSaveReg); 10659 10660 // Compute the offset for the next argument 10661 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 10662 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 10663 .addReg(OffsetReg) 10664 .addImm(UseFPOffset ? 16 : 8); 10665 10666 // Store it back into the va_list. 10667 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 10668 .addOperand(Base) 10669 .addOperand(Scale) 10670 .addOperand(Index) 10671 .addDisp(Disp, UseFPOffset ? 4 : 0) 10672 .addOperand(Segment) 10673 .addReg(NextOffsetReg) 10674 .setMemRefs(MMOBegin, MMOEnd); 10675 10676 // Jump to endMBB 10677 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 10678 .addMBB(endMBB); 10679 } 10680 10681 // 10682 // Emit code to use overflow area 10683 // 10684 10685 // Load the overflow_area address into a register. 10686 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 10687 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 10688 .addOperand(Base) 10689 .addOperand(Scale) 10690 .addOperand(Index) 10691 .addDisp(Disp, 8) 10692 .addOperand(Segment) 10693 .setMemRefs(MMOBegin, MMOEnd); 10694 10695 // If we need to align it, do so. Otherwise, just copy the address 10696 // to OverflowDestReg. 10697 if (NeedsAlign) { 10698 // Align the overflow address 10699 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 10700 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 10701 10702 // aligned_addr = (addr + (align-1)) & ~(align-1) 10703 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 10704 .addReg(OverflowAddrReg) 10705 .addImm(Align-1); 10706 10707 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 10708 .addReg(TmpReg) 10709 .addImm(~(uint64_t)(Align-1)); 10710 } else { 10711 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 10712 .addReg(OverflowAddrReg); 10713 } 10714 10715 // Compute the next overflow address after this argument. 10716 // (the overflow address should be kept 8-byte aligned) 10717 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 10718 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 10719 .addReg(OverflowDestReg) 10720 .addImm(ArgSizeA8); 10721 10722 // Store the new overflow address. 10723 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 10724 .addOperand(Base) 10725 .addOperand(Scale) 10726 .addOperand(Index) 10727 .addDisp(Disp, 8) 10728 .addOperand(Segment) 10729 .addReg(NextAddrReg) 10730 .setMemRefs(MMOBegin, MMOEnd); 10731 10732 // If we branched, emit the PHI to the front of endMBB. 10733 if (offsetMBB) { 10734 BuildMI(*endMBB, endMBB->begin(), DL, 10735 TII->get(X86::PHI), DestReg) 10736 .addReg(OffsetDestReg).addMBB(offsetMBB) 10737 .addReg(OverflowDestReg).addMBB(overflowMBB); 10738 } 10739 10740 // Erase the pseudo instruction 10741 MI->eraseFromParent(); 10742 10743 return endMBB; 10744} 10745 10746MachineBasicBlock * 10747X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 10748 MachineInstr *MI, 10749 MachineBasicBlock *MBB) const { 10750 // Emit code to save XMM registers to the stack. The ABI says that the 10751 // number of registers to save is given in %al, so it's theoretically 10752 // possible to do an indirect jump trick to avoid saving all of them, 10753 // however this code takes a simpler approach and just executes all 10754 // of the stores if %al is non-zero. 
It's less code, and it's probably 10755 // easier on the hardware branch predictor, and stores aren't all that 10756 // expensive anyway. 10757 10758 // Create the new basic blocks. One block contains all the XMM stores, 10759 // and one block is the final destination regardless of whether any 10760 // stores were performed. 10761 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 10762 MachineFunction *F = MBB->getParent(); 10763 MachineFunction::iterator MBBIter = MBB; 10764 ++MBBIter; 10765 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 10766 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 10767 F->insert(MBBIter, XMMSaveMBB); 10768 F->insert(MBBIter, EndMBB); 10769 10770 // Transfer the remainder of MBB and its successor edges to EndMBB. 10771 EndMBB->splice(EndMBB->begin(), MBB, 10772 llvm::next(MachineBasicBlock::iterator(MI)), 10773 MBB->end()); 10774 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 10775 10776 // The original block will now fall through to the XMM save block. 10777 MBB->addSuccessor(XMMSaveMBB); 10778 // The XMMSaveMBB will fall through to the end block. 10779 XMMSaveMBB->addSuccessor(EndMBB); 10780 10781 // Now add the instructions. 10782 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10783 DebugLoc DL = MI->getDebugLoc(); 10784 10785 unsigned CountReg = MI->getOperand(0).getReg(); 10786 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 10787 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 10788 10789 if (!Subtarget->isTargetWin64()) { 10790 // If %al is 0, branch around the XMM save block. 10791 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 10792 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 10793 MBB->addSuccessor(EndMBB); 10794 } 10795 10796 // In the XMM save block, save all the XMM argument registers. 10797 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 10798 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 10799 MachineMemOperand *MMO = 10800 F->getMachineMemOperand( 10801 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 10802 MachineMemOperand::MOStore, 10803 /*Size=*/16, /*Align=*/16); 10804 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 10805 .addFrameIndex(RegSaveFrameIndex) 10806 .addImm(/*Scale=*/1) 10807 .addReg(/*IndexReg=*/0) 10808 .addImm(/*Disp=*/Offset) 10809 .addReg(/*Segment=*/0) 10810 .addReg(MI->getOperand(i).getReg()) 10811 .addMemOperand(MMO); 10812 } 10813 10814 MI->eraseFromParent(); // The pseudo instruction is gone now. 10815 10816 return EndMBB; 10817} 10818 10819MachineBasicBlock * 10820X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 10821 MachineBasicBlock *BB) const { 10822 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10823 DebugLoc DL = MI->getDebugLoc(); 10824 10825 // To "insert" a SELECT_CC instruction, we actually have to insert the 10826 // diamond control-flow pattern. The incoming instruction knows the 10827 // destination vreg to set, the condition code register to branch on, the 10828 // true/false values to select between, and a branch opcode to use. 10829 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10830 MachineFunction::iterator It = BB; 10831 ++It; 10832 10833 // thisMBB: 10834 // ... 10835 // TrueVal = ... 
10836 // cmpTY ccX, r1, r2 10837 // bCC copy1MBB 10838 // fallthrough --> copy0MBB 10839 MachineBasicBlock *thisMBB = BB; 10840 MachineFunction *F = BB->getParent(); 10841 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 10842 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 10843 F->insert(It, copy0MBB); 10844 F->insert(It, sinkMBB); 10845 10846 // If the EFLAGS register isn't dead in the terminator, then claim that it's 10847 // live into the sink and copy blocks. 10848 const MachineFunction *MF = BB->getParent(); 10849 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 10850 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 10851 10852 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 10853 const MachineOperand &MO = MI->getOperand(I); 10854 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 10855 unsigned Reg = MO.getReg(); 10856 if (Reg != X86::EFLAGS) continue; 10857 copy0MBB->addLiveIn(Reg); 10858 sinkMBB->addLiveIn(Reg); 10859 } 10860 10861 // Transfer the remainder of BB and its successor edges to sinkMBB. 10862 sinkMBB->splice(sinkMBB->begin(), BB, 10863 llvm::next(MachineBasicBlock::iterator(MI)), 10864 BB->end()); 10865 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 10866 10867 // Add the true and fallthrough blocks as its successors. 10868 BB->addSuccessor(copy0MBB); 10869 BB->addSuccessor(sinkMBB); 10870 10871 // Create the conditional branch instruction. 10872 unsigned Opc = 10873 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 10874 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 10875 10876 // copy0MBB: 10877 // %FalseValue = ... 10878 // # fallthrough to sinkMBB 10879 copy0MBB->addSuccessor(sinkMBB); 10880 10881 // sinkMBB: 10882 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 10883 // ... 10884 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 10885 TII->get(X86::PHI), MI->getOperand(0).getReg()) 10886 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 10887 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 10888 10889 MI->eraseFromParent(); // The pseudo instruction is gone now. 10890 return sinkMBB; 10891} 10892 10893MachineBasicBlock * 10894X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 10895 MachineBasicBlock *BB) const { 10896 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 10897 DebugLoc DL = MI->getDebugLoc(); 10898 10899 assert(!Subtarget->isTargetEnvMacho()); 10900 10901 // The lowering is pretty easy: we're just emitting the call to _alloca. The 10902 // non-trivial part is impdef of ESP. 10903 10904 if (Subtarget->isTargetWin64()) { 10905 if (Subtarget->isTargetCygMing()) { 10906 // ___chkstk(Mingw64): 10907 // Clobbers R10, R11, RAX and EFLAGS. 10908 // Updates RSP. 10909 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 10910 .addExternalSymbol("___chkstk") 10911 .addReg(X86::RAX, RegState::Implicit) 10912 .addReg(X86::RSP, RegState::Implicit) 10913 .addReg(X86::RAX, RegState::Define | RegState::Implicit) 10914 .addReg(X86::RSP, RegState::Define | RegState::Implicit) 10915 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 10916 } else { 10917 // __chkstk(MSVCRT): does not update stack pointer. 10918 // Clobbers R10, R11 and EFLAGS. 10919 // FIXME: RAX(allocated size) might be reused and not killed. 
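      // So the sequence here is: call __chkstk (the stack probe) with the
      // allocation size in RAX, then explicitly subtract RAX from RSP below,
      // whereas the MinGW ___chkstk above adjusts RSP itself.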
10920 BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) 10921 .addExternalSymbol("__chkstk") 10922 .addReg(X86::RAX, RegState::Implicit) 10923 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 10924 // RAX has the offset to subtracted from RSP. 10925 BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) 10926 .addReg(X86::RSP) 10927 .addReg(X86::RAX); 10928 } 10929 } else { 10930 const char *StackProbeSymbol = 10931 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 10932 10933 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 10934 .addExternalSymbol(StackProbeSymbol) 10935 .addReg(X86::EAX, RegState::Implicit) 10936 .addReg(X86::ESP, RegState::Implicit) 10937 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 10938 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 10939 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 10940 } 10941 10942 MI->eraseFromParent(); // The pseudo instruction is gone now. 10943 return BB; 10944} 10945 10946MachineBasicBlock * 10947X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 10948 MachineBasicBlock *BB) const { 10949 // This is pretty easy. We're taking the value that we received from 10950 // our load from the relocation, sticking it in either RDI (x86-64) 10951 // or EAX and doing an indirect call. The return value will then 10952 // be in the normal return register. 10953 const X86InstrInfo *TII 10954 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 10955 DebugLoc DL = MI->getDebugLoc(); 10956 MachineFunction *F = BB->getParent(); 10957 10958 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 10959 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 10960 10961 if (Subtarget->is64Bit()) { 10962 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10963 TII->get(X86::MOV64rm), X86::RDI) 10964 .addReg(X86::RIP) 10965 .addImm(0).addReg(0) 10966 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10967 MI->getOperand(3).getTargetFlags()) 10968 .addReg(0); 10969 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 10970 addDirectMem(MIB, X86::RDI); 10971 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 10972 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10973 TII->get(X86::MOV32rm), X86::EAX) 10974 .addReg(0) 10975 .addImm(0).addReg(0) 10976 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10977 MI->getOperand(3).getTargetFlags()) 10978 .addReg(0); 10979 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 10980 addDirectMem(MIB, X86::EAX); 10981 } else { 10982 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 10983 TII->get(X86::MOV32rm), X86::EAX) 10984 .addReg(TII->getGlobalBaseReg(F)) 10985 .addImm(0).addReg(0) 10986 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 10987 MI->getOperand(3).getTargetFlags()) 10988 .addReg(0); 10989 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 10990 addDirectMem(MIB, X86::EAX); 10991 } 10992 10993 MI->eraseFromParent(); // The pseudo instruction is gone now. 
10994 return BB; 10995} 10996 10997MachineBasicBlock * 10998X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 10999 MachineBasicBlock *BB) const { 11000 switch (MI->getOpcode()) { 11001 default: assert(false && "Unexpected instr type to insert"); 11002 case X86::TAILJMPd64: 11003 case X86::TAILJMPr64: 11004 case X86::TAILJMPm64: 11005 assert(!"TAILJMP64 would not be touched here."); 11006 case X86::TCRETURNdi64: 11007 case X86::TCRETURNri64: 11008 case X86::TCRETURNmi64: 11009 // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. 11010 // On AMD64, additional defs should be added before register allocation. 11011 if (!Subtarget->isTargetWin64()) { 11012 MI->addRegisterDefined(X86::RSI); 11013 MI->addRegisterDefined(X86::RDI); 11014 MI->addRegisterDefined(X86::XMM6); 11015 MI->addRegisterDefined(X86::XMM7); 11016 MI->addRegisterDefined(X86::XMM8); 11017 MI->addRegisterDefined(X86::XMM9); 11018 MI->addRegisterDefined(X86::XMM10); 11019 MI->addRegisterDefined(X86::XMM11); 11020 MI->addRegisterDefined(X86::XMM12); 11021 MI->addRegisterDefined(X86::XMM13); 11022 MI->addRegisterDefined(X86::XMM14); 11023 MI->addRegisterDefined(X86::XMM15); 11024 } 11025 return BB; 11026 case X86::WIN_ALLOCA: 11027 return EmitLoweredWinAlloca(MI, BB); 11028 case X86::TLSCall_32: 11029 case X86::TLSCall_64: 11030 return EmitLoweredTLSCall(MI, BB); 11031 case X86::CMOV_GR8: 11032 case X86::CMOV_FR32: 11033 case X86::CMOV_FR64: 11034 case X86::CMOV_V4F32: 11035 case X86::CMOV_V2F64: 11036 case X86::CMOV_V2I64: 11037 case X86::CMOV_GR16: 11038 case X86::CMOV_GR32: 11039 case X86::CMOV_RFP32: 11040 case X86::CMOV_RFP64: 11041 case X86::CMOV_RFP80: 11042 return EmitLoweredSelect(MI, BB); 11043 11044 case X86::FP32_TO_INT16_IN_MEM: 11045 case X86::FP32_TO_INT32_IN_MEM: 11046 case X86::FP32_TO_INT64_IN_MEM: 11047 case X86::FP64_TO_INT16_IN_MEM: 11048 case X86::FP64_TO_INT32_IN_MEM: 11049 case X86::FP64_TO_INT64_IN_MEM: 11050 case X86::FP80_TO_INT16_IN_MEM: 11051 case X86::FP80_TO_INT32_IN_MEM: 11052 case X86::FP80_TO_INT64_IN_MEM: { 11053 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 11054 DebugLoc DL = MI->getDebugLoc(); 11055 11056 // Change the floating point control register to use "round towards zero" 11057 // mode when truncating to an integer value. 11058 MachineFunction *F = BB->getParent(); 11059 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 11060 addFrameReference(BuildMI(*BB, MI, DL, 11061 TII->get(X86::FNSTCW16m)), CWFrameIdx); 11062 11063 // Load the old value of the high byte of the control word... 11064 unsigned OldCW = 11065 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 11066 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 11067 CWFrameIdx); 11068 11069 // Set the high part to be round to zero... 11070 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 11071 .addImm(0xC7F); 11072 11073 // Reload the modified control word now... 11074 addFrameReference(BuildMI(*BB, MI, DL, 11075 TII->get(X86::FLDCW16m)), CWFrameIdx); 11076 11077 // Restore the memory image of control word to original value 11078 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 11079 .addReg(OldCW); 11080 11081 // Get the X86 opcode to use. 
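    // The IST_Fp<n>m<m> opcode chosen below stores the value as an integer
    // using the round-towards-zero mode installed above; e.g.
    // FP32_TO_INT16_IN_MEM maps to IST_Fp16m32 (i16 destination, f32 source).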
11082 unsigned Opc; 11083 switch (MI->getOpcode()) { 11084 default: llvm_unreachable("illegal opcode!"); 11085 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 11086 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 11087 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 11088 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 11089 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 11090 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 11091 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 11092 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 11093 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 11094 } 11095 11096 X86AddressMode AM; 11097 MachineOperand &Op = MI->getOperand(0); 11098 if (Op.isReg()) { 11099 AM.BaseType = X86AddressMode::RegBase; 11100 AM.Base.Reg = Op.getReg(); 11101 } else { 11102 AM.BaseType = X86AddressMode::FrameIndexBase; 11103 AM.Base.FrameIndex = Op.getIndex(); 11104 } 11105 Op = MI->getOperand(1); 11106 if (Op.isImm()) 11107 AM.Scale = Op.getImm(); 11108 Op = MI->getOperand(2); 11109 if (Op.isImm()) 11110 AM.IndexReg = Op.getImm(); 11111 Op = MI->getOperand(3); 11112 if (Op.isGlobal()) { 11113 AM.GV = Op.getGlobal(); 11114 } else { 11115 AM.Disp = Op.getImm(); 11116 } 11117 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 11118 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 11119 11120 // Reload the original control word now. 11121 addFrameReference(BuildMI(*BB, MI, DL, 11122 TII->get(X86::FLDCW16m)), CWFrameIdx); 11123 11124 MI->eraseFromParent(); // The pseudo instruction is gone now. 11125 return BB; 11126 } 11127 // String/text processing lowering. 11128 case X86::PCMPISTRM128REG: 11129 case X86::VPCMPISTRM128REG: 11130 return EmitPCMP(MI, BB, 3, false /* in-mem */); 11131 case X86::PCMPISTRM128MEM: 11132 case X86::VPCMPISTRM128MEM: 11133 return EmitPCMP(MI, BB, 3, true /* in-mem */); 11134 case X86::PCMPESTRM128REG: 11135 case X86::VPCMPESTRM128REG: 11136 return EmitPCMP(MI, BB, 5, false /* in mem */); 11137 case X86::PCMPESTRM128MEM: 11138 case X86::VPCMPESTRM128MEM: 11139 return EmitPCMP(MI, BB, 5, true /* in mem */); 11140 11141 // Thread synchronization. 11142 case X86::MONITOR: 11143 return EmitMonitor(MI, BB); 11144 case X86::MWAIT: 11145 return EmitMwait(MI, BB); 11146 11147 // Atomic Lowering. 
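  // In outline, each ATOMxxx pseudo below is expanded by its custom inserter
  // into a load / operate / LCMPXCHG retry loop, roughly:
  //   loop:  eax = load [mem]
  //          tmp = eax <op> src
  //          lock cmpxchg [mem], tmp   ; only succeeds if [mem] still == eax
  //          jne  loop
  // with a CMOVcc providing <op> for the min/max variants.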
11148 case X86::ATOMAND32: 11149 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11150 X86::AND32ri, X86::MOV32rm, 11151 X86::LCMPXCHG32, 11152 X86::NOT32r, X86::EAX, 11153 X86::GR32RegisterClass); 11154 case X86::ATOMOR32: 11155 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 11156 X86::OR32ri, X86::MOV32rm, 11157 X86::LCMPXCHG32, 11158 X86::NOT32r, X86::EAX, 11159 X86::GR32RegisterClass); 11160 case X86::ATOMXOR32: 11161 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 11162 X86::XOR32ri, X86::MOV32rm, 11163 X86::LCMPXCHG32, 11164 X86::NOT32r, X86::EAX, 11165 X86::GR32RegisterClass); 11166 case X86::ATOMNAND32: 11167 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 11168 X86::AND32ri, X86::MOV32rm, 11169 X86::LCMPXCHG32, 11170 X86::NOT32r, X86::EAX, 11171 X86::GR32RegisterClass, true); 11172 case X86::ATOMMIN32: 11173 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 11174 case X86::ATOMMAX32: 11175 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 11176 case X86::ATOMUMIN32: 11177 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 11178 case X86::ATOMUMAX32: 11179 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 11180 11181 case X86::ATOMAND16: 11182 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11183 X86::AND16ri, X86::MOV16rm, 11184 X86::LCMPXCHG16, 11185 X86::NOT16r, X86::AX, 11186 X86::GR16RegisterClass); 11187 case X86::ATOMOR16: 11188 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 11189 X86::OR16ri, X86::MOV16rm, 11190 X86::LCMPXCHG16, 11191 X86::NOT16r, X86::AX, 11192 X86::GR16RegisterClass); 11193 case X86::ATOMXOR16: 11194 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 11195 X86::XOR16ri, X86::MOV16rm, 11196 X86::LCMPXCHG16, 11197 X86::NOT16r, X86::AX, 11198 X86::GR16RegisterClass); 11199 case X86::ATOMNAND16: 11200 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 11201 X86::AND16ri, X86::MOV16rm, 11202 X86::LCMPXCHG16, 11203 X86::NOT16r, X86::AX, 11204 X86::GR16RegisterClass, true); 11205 case X86::ATOMMIN16: 11206 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 11207 case X86::ATOMMAX16: 11208 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 11209 case X86::ATOMUMIN16: 11210 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 11211 case X86::ATOMUMAX16: 11212 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 11213 11214 case X86::ATOMAND8: 11215 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11216 X86::AND8ri, X86::MOV8rm, 11217 X86::LCMPXCHG8, 11218 X86::NOT8r, X86::AL, 11219 X86::GR8RegisterClass); 11220 case X86::ATOMOR8: 11221 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 11222 X86::OR8ri, X86::MOV8rm, 11223 X86::LCMPXCHG8, 11224 X86::NOT8r, X86::AL, 11225 X86::GR8RegisterClass); 11226 case X86::ATOMXOR8: 11227 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 11228 X86::XOR8ri, X86::MOV8rm, 11229 X86::LCMPXCHG8, 11230 X86::NOT8r, X86::AL, 11231 X86::GR8RegisterClass); 11232 case X86::ATOMNAND8: 11233 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 11234 X86::AND8ri, X86::MOV8rm, 11235 X86::LCMPXCHG8, 11236 X86::NOT8r, X86::AL, 11237 X86::GR8RegisterClass, true); 11238 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 11239 // This group is for 64-bit host. 
11240 case X86::ATOMAND64: 11241 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 11242 X86::AND64ri32, X86::MOV64rm, 11243 X86::LCMPXCHG64, 11244 X86::NOT64r, X86::RAX, 11245 X86::GR64RegisterClass); 11246 case X86::ATOMOR64: 11247 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 11248 X86::OR64ri32, X86::MOV64rm, 11249 X86::LCMPXCHG64, 11250 X86::NOT64r, X86::RAX, 11251 X86::GR64RegisterClass); 11252 case X86::ATOMXOR64: 11253 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 11254 X86::XOR64ri32, X86::MOV64rm, 11255 X86::LCMPXCHG64, 11256 X86::NOT64r, X86::RAX, 11257 X86::GR64RegisterClass); 11258 case X86::ATOMNAND64: 11259 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 11260 X86::AND64ri32, X86::MOV64rm, 11261 X86::LCMPXCHG64, 11262 X86::NOT64r, X86::RAX, 11263 X86::GR64RegisterClass, true); 11264 case X86::ATOMMIN64: 11265 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 11266 case X86::ATOMMAX64: 11267 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 11268 case X86::ATOMUMIN64: 11269 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 11270 case X86::ATOMUMAX64: 11271 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 11272 11273 // This group does 64-bit operations on a 32-bit host. 11274 case X86::ATOMAND6432: 11275 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11276 X86::AND32rr, X86::AND32rr, 11277 X86::AND32ri, X86::AND32ri, 11278 false); 11279 case X86::ATOMOR6432: 11280 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11281 X86::OR32rr, X86::OR32rr, 11282 X86::OR32ri, X86::OR32ri, 11283 false); 11284 case X86::ATOMXOR6432: 11285 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11286 X86::XOR32rr, X86::XOR32rr, 11287 X86::XOR32ri, X86::XOR32ri, 11288 false); 11289 case X86::ATOMNAND6432: 11290 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11291 X86::AND32rr, X86::AND32rr, 11292 X86::AND32ri, X86::AND32ri, 11293 true); 11294 case X86::ATOMADD6432: 11295 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11296 X86::ADD32rr, X86::ADC32rr, 11297 X86::ADD32ri, X86::ADC32ri, 11298 false); 11299 case X86::ATOMSUB6432: 11300 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11301 X86::SUB32rr, X86::SBB32rr, 11302 X86::SUB32ri, X86::SBB32ri, 11303 false); 11304 case X86::ATOMSWAP6432: 11305 return EmitAtomicBit6432WithCustomInserter(MI, BB, 11306 X86::MOV32rr, X86::MOV32rr, 11307 X86::MOV32ri, X86::MOV32ri, 11308 false); 11309 case X86::VASTART_SAVE_XMM_REGS: 11310 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 11311 11312 case X86::VAARG_64: 11313 return EmitVAARG64WithCustomInserter(MI, BB); 11314 } 11315} 11316 11317//===----------------------------------------------------------------------===// 11318// X86 Optimization Hooks 11319//===----------------------------------------------------------------------===// 11320 11321void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 11322 const APInt &Mask, 11323 APInt &KnownZero, 11324 APInt &KnownOne, 11325 const SelectionDAG &DAG, 11326 unsigned Depth) const { 11327 unsigned Opc = Op.getOpcode(); 11328 assert((Opc >= ISD::BUILTIN_OP_END || 11329 Opc == ISD::INTRINSIC_WO_CHAIN || 11330 Opc == ISD::INTRINSIC_W_CHAIN || 11331 Opc == ISD::INTRINSIC_VOID) && 11332 "Should use MaskedValueIsZero if you don't know whether Op" 11333 " is a target node!"); 11334 11335 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
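  // For example, X86ISD::SETCC produces 0 or 1, so everything above the low
  // bit is added to KnownZero by the switch below.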
11336 switch (Opc) { 11337 default: break; 11338 case X86ISD::ADD: 11339 case X86ISD::SUB: 11340 case X86ISD::ADC: 11341 case X86ISD::SBB: 11342 case X86ISD::SMUL: 11343 case X86ISD::UMUL: 11344 case X86ISD::INC: 11345 case X86ISD::DEC: 11346 case X86ISD::OR: 11347 case X86ISD::XOR: 11348 case X86ISD::AND: 11349 // These nodes' second result is a boolean. 11350 if (Op.getResNo() == 0) 11351 break; 11352 // Fallthrough 11353 case X86ISD::SETCC: 11354 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 11355 Mask.getBitWidth() - 1); 11356 break; 11357 } 11358} 11359 11360unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 11361 unsigned Depth) const { 11362 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 11363 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 11364 return Op.getValueType().getScalarType().getSizeInBits(); 11365 11366 // Fallback case. 11367 return 1; 11368} 11369 11370/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 11371/// node is a GlobalAddress + offset. 11372bool X86TargetLowering::isGAPlusOffset(SDNode *N, 11373 const GlobalValue* &GA, 11374 int64_t &Offset) const { 11375 if (N->getOpcode() == X86ISD::Wrapper) { 11376 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 11377 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 11378 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 11379 return true; 11380 } 11381 } 11382 return TargetLowering::isGAPlusOffset(N, GA, Offset); 11383} 11384 11385/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 11386static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, 11387 TargetLowering::DAGCombinerInfo &DCI) { 11388 DebugLoc dl = N->getDebugLoc(); 11389 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 11390 SDValue V1 = SVOp->getOperand(0); 11391 SDValue V2 = SVOp->getOperand(1); 11392 EVT VT = SVOp->getValueType(0); 11393 11394 if (V1.getOpcode() == ISD::CONCAT_VECTORS && 11395 V2.getOpcode() == ISD::CONCAT_VECTORS) { 11396 // 11397 // 0,0,0,... 11398 // | 11399 // V UNDEF BUILD_VECTOR UNDEF 11400 // \ / \ / 11401 // CONCAT_VECTOR CONCAT_VECTOR 11402 // \ / 11403 // \ / 11404 // RESULT: V + zero extended 11405 // 11406 if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || 11407 V2.getOperand(1).getOpcode() != ISD::UNDEF || 11408 V1.getOperand(1).getOpcode() != ISD::UNDEF) 11409 return SDValue(); 11410 11411 if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) 11412 return SDValue(); 11413 11414 // To match the shuffle mask, the first half of the mask should 11415 // be exactly the first vector, and all the rest a splat with the 11416 // first element of the second one. 11417 int NumElems = VT.getVectorNumElements(); 11418 for (int i = 0; i < NumElems/2; ++i) 11419 if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || 11420 !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) 11421 return SDValue(); 11422 11423 // Emit a zeroed vector and insert the desired subvector on its 11424 // first half. 11425 SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl); 11426 SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 11427 DAG.getConstant(0, MVT::i32), DAG, dl); 11428 return DCI.CombineTo(N, InsV); 11429 } 11430 11431 return SDValue(); 11432} 11433 11434/// PerformShuffleCombine - Performs several different shuffle combines. 
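/// At the moment that means the 256-bit zero-extension pattern handled above
/// and turning a shuffle of consecutive loads into a single wide vector load.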
11435static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 11436 TargetLowering::DAGCombinerInfo &DCI) { 11437 DebugLoc dl = N->getDebugLoc(); 11438 EVT VT = N->getValueType(0); 11439 11440 // Don't create instructions with illegal types after legalize types has run. 11441 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11442 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) 11443 return SDValue(); 11444 11445 // Only handle pure VECTOR_SHUFFLE nodes. 11446 if (VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE) 11447 return PerformShuffleCombine256(N, DAG, DCI); 11448 11449 // Only handle 128 wide vector from here on. 11450 if (VT.getSizeInBits() != 128) 11451 return SDValue(); 11452 11453 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, 11454 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are 11455 // consecutive, non-overlapping, and in the right order. 11456 SmallVector<SDValue, 16> Elts; 11457 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 11458 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 11459 11460 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 11461} 11462 11463/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 11464/// generation and convert it from being a bunch of shuffles and extracts 11465/// to a simple store and scalar loads to extract the elements. 11466static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 11467 const TargetLowering &TLI) { 11468 SDValue InputVector = N->getOperand(0); 11469 11470 // Only operate on vectors of 4 elements, where the alternative shuffling 11471 // gets to be more expensive. 11472 if (InputVector.getValueType() != MVT::v4i32) 11473 return SDValue(); 11474 11475 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 11476 // single use which is a sign-extend or zero-extend, and all elements are 11477 // used. 11478 SmallVector<SDNode *, 4> Uses; 11479 unsigned ExtractedElements = 0; 11480 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 11481 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 11482 if (UI.getUse().getResNo() != InputVector.getResNo()) 11483 return SDValue(); 11484 11485 SDNode *Extract = *UI; 11486 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 11487 return SDValue(); 11488 11489 if (Extract->getValueType(0) != MVT::i32) 11490 return SDValue(); 11491 if (!Extract->hasOneUse()) 11492 return SDValue(); 11493 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 11494 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 11495 return SDValue(); 11496 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 11497 return SDValue(); 11498 11499 // Record which element was extracted. 11500 ExtractedElements |= 11501 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 11502 11503 Uses.push_back(Extract); 11504 } 11505 11506 // If not all the elements were used, this may not be worthwhile. 11507 if (ExtractedElements != 15) 11508 return SDValue(); 11509 11510 // Ok, we've now decided to do the transformation. 11511 DebugLoc dl = InputVector.getDebugLoc(); 11512 11513 // Store the value to a temporary stack slot. 
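  // The transformation: spill the v4i32 vector once, then rewrite each
  // (extract_vector_elt InputVector, n) collected above as a scalar i32 load
  // from the slot at byte offset n*4.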
11514   SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
11515   SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
11516                             MachinePointerInfo(), false, false, 0);
11517 
11518   // Replace each use (extract) with a load of the appropriate element.
11519   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
11520        UE = Uses.end(); UI != UE; ++UI) {
11521     SDNode *Extract = *UI;
11522 
11523     // Compute the element's address.
11524     SDValue Idx = Extract->getOperand(1);
11525     unsigned EltSize =
11526         InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
11527     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
11528     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
11529 
11530     SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
11531                                      StackPtr, OffsetVal);
11532 
11533     // Load the scalar.
11534     SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
11535                                      ScalarAddr, MachinePointerInfo(),
11536                                      false, false, 0);
11537 
11538     // Replace the extract with the load.
11539     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
11540   }
11541 
11542   // The replacement was made in place; don't return anything.
11543   return SDValue();
11544 }
11545 
11546 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
11547 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
11548                                     const X86Subtarget *Subtarget) {
11549   DebugLoc DL = N->getDebugLoc();
11550   SDValue Cond = N->getOperand(0);
11551   // Get the LHS/RHS of the select.
11552   SDValue LHS = N->getOperand(1);
11553   SDValue RHS = N->getOperand(2);
11554 
11555   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
11556   // instructions match the semantics of the common C idiom x<y?x:y but not
11557   // x<=y?x:y, because of how they handle negative zero (which can be
11558   // ignored in unsafe-math mode).
11559   if (Subtarget->hasSSE2() &&
11560       (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
11561       Cond.getOpcode() == ISD::SETCC) {
11562     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
11563 
11564     unsigned Opcode = 0;
11565     // Check for x CC y ? x : y.
11566     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
11567         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
11568       switch (CC) {
11569       default: break;
11570       case ISD::SETULT:
11571         // Converting this to a min would handle NaNs incorrectly, and swapping
11572         // the operands would cause it to handle comparisons between positive
11573         // and negative zero incorrectly.
11574         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
11575           if (!UnsafeFPMath &&
11576               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
11577             break;
11578           std::swap(LHS, RHS);
11579         }
11580         Opcode = X86ISD::FMIN;
11581         break;
11582       case ISD::SETOLE:
11583         // Converting this to a min would handle comparisons between positive
11584         // and negative zero incorrectly.
11585         if (!UnsafeFPMath &&
11586             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
11587           break;
11588         Opcode = X86ISD::FMIN;
11589         break;
11590       case ISD::SETULE:
11591         // Converting this to a min would handle both negative zeros and NaNs
11592         // incorrectly, but we can swap the operands to fix both.
11593         std::swap(LHS, RHS);
11594       case ISD::SETOLT:
11595       case ISD::SETLT:
11596       case ISD::SETLE:
11597         Opcode = X86ISD::FMIN;
11598         break;
11599 
11600       case ISD::SETOGE:
11601         // Converting this to a max would handle comparisons between positive
11602         // and negative zero incorrectly.
11603         if (!UnsafeFPMath &&
11604             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
11605           break;
11606         Opcode = X86ISD::FMAX;
11607         break;
11608       case ISD::SETUGT:
11609         // Converting this to a max would handle NaNs incorrectly, and swapping
11610         // the operands would cause it to handle comparisons between positive
11611         // and negative zero incorrectly.
11612         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
11613           if (!UnsafeFPMath &&
11614               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
11615             break;
11616           std::swap(LHS, RHS);
11617         }
11618         Opcode = X86ISD::FMAX;
11619         break;
11620       case ISD::SETUGE:
11621         // Converting this to a max would handle both negative zeros and NaNs
11622         // incorrectly, but we can swap the operands to fix both.
11623         std::swap(LHS, RHS);
11624       case ISD::SETOGT:
11625       case ISD::SETGT:
11626       case ISD::SETGE:
11627         Opcode = X86ISD::FMAX;
11628         break;
11629       }
11630     // Check for x CC y ? y : x -- a min/max with reversed arms.
11631     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
11632                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
11633       switch (CC) {
11634       default: break;
11635       case ISD::SETOGE:
11636         // Converting this to a min would handle comparisons between positive
11637         // and negative zero incorrectly, and swapping the operands would
11638         // cause it to handle NaNs incorrectly.
11639         if (!UnsafeFPMath &&
11640             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
11641           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11642             break;
11643           std::swap(LHS, RHS);
11644         }
11645         Opcode = X86ISD::FMIN;
11646         break;
11647       case ISD::SETUGT:
11648         // Converting this to a min would handle NaNs incorrectly.
11649         if (!UnsafeFPMath &&
11650             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
11651           break;
11652         Opcode = X86ISD::FMIN;
11653         break;
11654       case ISD::SETUGE:
11655         // Converting this to a min would handle both negative zeros and NaNs
11656         // incorrectly, but we can swap the operands to fix both.
11657         std::swap(LHS, RHS);
11658       case ISD::SETOGT:
11659       case ISD::SETGT:
11660       case ISD::SETGE:
11661         Opcode = X86ISD::FMIN;
11662         break;
11663 
11664       case ISD::SETULT:
11665         // Converting this to a max would handle NaNs incorrectly.
11666         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11667           break;
11668         Opcode = X86ISD::FMAX;
11669         break;
11670       case ISD::SETOLE:
11671         // Converting this to a max would handle comparisons between positive
11672         // and negative zero incorrectly, and swapping the operands would
11673         // cause it to handle NaNs incorrectly.
11674         if (!UnsafeFPMath &&
11675             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
11676           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
11677             break;
11678           std::swap(LHS, RHS);
11679         }
11680         Opcode = X86ISD::FMAX;
11681         break;
11682       case ISD::SETULE:
11683         // Converting this to a max would handle both negative zeros and NaNs
11684         // incorrectly, but we can swap the operands to fix both.
11685 std::swap(LHS, RHS); 11686 case ISD::SETOLT: 11687 case ISD::SETLT: 11688 case ISD::SETLE: 11689 Opcode = X86ISD::FMAX; 11690 break; 11691 } 11692 } 11693 11694 if (Opcode) 11695 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 11696 } 11697 11698 // If this is a select between two integer constants, try to do some 11699 // optimizations. 11700 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 11701 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 11702 // Don't do this for crazy integer types. 11703 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 11704 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 11705 // so that TrueC (the true value) is larger than FalseC. 11706 bool NeedsCondInvert = false; 11707 11708 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 11709 // Efficiently invertible. 11710 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 11711 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 11712 isa<ConstantSDNode>(Cond.getOperand(1))))) { 11713 NeedsCondInvert = true; 11714 std::swap(TrueC, FalseC); 11715 } 11716 11717 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 11718 if (FalseC->getAPIntValue() == 0 && 11719 TrueC->getAPIntValue().isPowerOf2()) { 11720 if (NeedsCondInvert) // Invert the condition if needed. 11721 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 11722 DAG.getConstant(1, Cond.getValueType())); 11723 11724 // Zero extend the condition if needed. 11725 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 11726 11727 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 11728 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 11729 DAG.getConstant(ShAmt, MVT::i8)); 11730 } 11731 11732 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 11733 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 11734 if (NeedsCondInvert) // Invert the condition if needed. 11735 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 11736 DAG.getConstant(1, Cond.getValueType())); 11737 11738 // Zero extend the condition if needed. 11739 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 11740 FalseC->getValueType(0), Cond); 11741 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 11742 SDValue(FalseC, 0)); 11743 } 11744 11745 // Optimize cases that will turn into an LEA instruction. This requires 11746 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 11747 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 11748 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 11749 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 11750 11751 bool isFastMultiplier = false; 11752 if (Diff < 10) { 11753 switch ((unsigned char)Diff) { 11754 default: break; 11755 case 1: // result = add base, cond 11756 case 2: // result = lea base( , cond*2) 11757 case 3: // result = lea base(cond, cond*2) 11758 case 4: // result = lea base( , cond*4) 11759 case 5: // result = lea base(cond, cond*4) 11760 case 8: // result = lea base( , cond*8) 11761 case 9: // result = lea base(cond, cond*8) 11762 isFastMultiplier = true; 11763 break; 11764 } 11765 } 11766 11767 if (isFastMultiplier) { 11768 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 11769 if (NeedsCondInvert) // Invert the condition if needed. 11770 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 11771 DAG.getConstant(1, Cond.getValueType())); 11772 11773 // Zero extend the condition if needed. 
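          // Worked example: for (Cond ? 7 : 2), Diff == 5 and the value is
          // computed below as 2 + 5*zext(Cond), i.e. the "lea base(cond, cond*4)"
          // row of the table above with base = 2.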
11774 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 11775 Cond); 11776 // Scale the condition by the difference. 11777 if (Diff != 1) 11778 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 11779 DAG.getConstant(Diff, Cond.getValueType())); 11780 11781 // Add the base if non-zero. 11782 if (FalseC->getAPIntValue() != 0) 11783 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 11784 SDValue(FalseC, 0)); 11785 return Cond; 11786 } 11787 } 11788 } 11789 } 11790 11791 return SDValue(); 11792} 11793 11794/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 11795static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 11796 TargetLowering::DAGCombinerInfo &DCI) { 11797 DebugLoc DL = N->getDebugLoc(); 11798 11799 // If the flag operand isn't dead, don't touch this CMOV. 11800 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 11801 return SDValue(); 11802 11803 SDValue FalseOp = N->getOperand(0); 11804 SDValue TrueOp = N->getOperand(1); 11805 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 11806 SDValue Cond = N->getOperand(3); 11807 if (CC == X86::COND_E || CC == X86::COND_NE) { 11808 switch (Cond.getOpcode()) { 11809 default: break; 11810 case X86ISD::BSR: 11811 case X86ISD::BSF: 11812 // If operand of BSR / BSF are proven never zero, then ZF cannot be set. 11813 if (DAG.isKnownNeverZero(Cond.getOperand(0))) 11814 return (CC == X86::COND_E) ? FalseOp : TrueOp; 11815 } 11816 } 11817 11818 // If this is a select between two integer constants, try to do some 11819 // optimizations. Note that the operands are ordered the opposite of SELECT 11820 // operands. 11821 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { 11822 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { 11823 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 11824 // larger than FalseC (the false value). 11825 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 11826 CC = X86::GetOppositeBranchCondition(CC); 11827 std::swap(TrueC, FalseC); 11828 } 11829 11830 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 11831 // This is efficient for any integer data type (including i8/i16) and 11832 // shift amount. 11833 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 11834 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 11835 DAG.getConstant(CC, MVT::i8), Cond); 11836 11837 // Zero extend the condition if needed. 11838 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 11839 11840 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 11841 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 11842 DAG.getConstant(ShAmt, MVT::i8)); 11843 if (N->getNumValues() == 2) // Dead flag value? 11844 return DCI.CombineTo(N, Cond, SDValue()); 11845 return Cond; 11846 } 11847 11848 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 11849 // for any integer data type, including i8/i16. 11850 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 11851 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 11852 DAG.getConstant(CC, MVT::i8), Cond); 11853 11854 // Zero extend the condition if needed. 11855 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 11856 FalseC->getValueType(0), Cond); 11857 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 11858 SDValue(FalseC, 0)); 11859 11860 if (N->getNumValues() == 2) // Dead flag value? 
11861 return DCI.CombineTo(N, Cond, SDValue()); 11862 return Cond; 11863 } 11864 11865 // Optimize cases that will turn into an LEA instruction. This requires 11866 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 11867 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 11868 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 11869 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 11870 11871 bool isFastMultiplier = false; 11872 if (Diff < 10) { 11873 switch ((unsigned char)Diff) { 11874 default: break; 11875 case 1: // result = add base, cond 11876 case 2: // result = lea base( , cond*2) 11877 case 3: // result = lea base(cond, cond*2) 11878 case 4: // result = lea base( , cond*4) 11879 case 5: // result = lea base(cond, cond*4) 11880 case 8: // result = lea base( , cond*8) 11881 case 9: // result = lea base(cond, cond*8) 11882 isFastMultiplier = true; 11883 break; 11884 } 11885 } 11886 11887 if (isFastMultiplier) { 11888 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 11889 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 11890 DAG.getConstant(CC, MVT::i8), Cond); 11891 // Zero extend the condition if needed. 11892 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 11893 Cond); 11894 // Scale the condition by the difference. 11895 if (Diff != 1) 11896 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 11897 DAG.getConstant(Diff, Cond.getValueType())); 11898 11899 // Add the base if non-zero. 11900 if (FalseC->getAPIntValue() != 0) 11901 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 11902 SDValue(FalseC, 0)); 11903 if (N->getNumValues() == 2) // Dead flag value? 11904 return DCI.CombineTo(N, Cond, SDValue()); 11905 return Cond; 11906 } 11907 } 11908 } 11909 } 11910 return SDValue(); 11911} 11912 11913 11914/// PerformMulCombine - Optimize a single multiply with constant into two 11915/// in order to implement it with two cheaper instructions, e.g. 11916/// LEA + SHL, LEA + LEA. 11917static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 11918 TargetLowering::DAGCombinerInfo &DCI) { 11919 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 11920 return SDValue(); 11921 11922 EVT VT = N->getValueType(0); 11923 if (VT != MVT::i64) 11924 return SDValue(); 11925 11926 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 11927 if (!C) 11928 return SDValue(); 11929 uint64_t MulAmt = C->getZExtValue(); 11930 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 11931 return SDValue(); 11932 11933 uint64_t MulAmt1 = 0; 11934 uint64_t MulAmt2 = 0; 11935 if ((MulAmt % 9) == 0) { 11936 MulAmt1 = 9; 11937 MulAmt2 = MulAmt / 9; 11938 } else if ((MulAmt % 5) == 0) { 11939 MulAmt1 = 5; 11940 MulAmt2 = MulAmt / 5; 11941 } else if ((MulAmt % 3) == 0) { 11942 MulAmt1 = 3; 11943 MulAmt2 = MulAmt / 3; 11944 } 11945 if (MulAmt2 && 11946 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 11947 DebugLoc DL = N->getDebugLoc(); 11948 11949 if (isPowerOf2_64(MulAmt2) && 11950 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 11951 // If second multiplifer is pow2, issue it first. We want the multiply by 11952 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 11953 // is an add. 
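    // Worked example: MulAmt == 40 decomposes as 5 * 8; after the swap below
    // (when it applies) this emits (X << 3) followed by MUL_IMM by 5, so the
    // multiply by 5 can still be folded into an LEA.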
11954 std::swap(MulAmt1, MulAmt2); 11955 11956 SDValue NewMul; 11957 if (isPowerOf2_64(MulAmt1)) 11958 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 11959 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 11960 else 11961 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 11962 DAG.getConstant(MulAmt1, VT)); 11963 11964 if (isPowerOf2_64(MulAmt2)) 11965 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 11966 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 11967 else 11968 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 11969 DAG.getConstant(MulAmt2, VT)); 11970 11971 // Do not add new nodes to DAG combiner worklist. 11972 DCI.CombineTo(N, NewMul, false); 11973 } 11974 return SDValue(); 11975} 11976 11977static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 11978 SDValue N0 = N->getOperand(0); 11979 SDValue N1 = N->getOperand(1); 11980 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 11981 EVT VT = N0.getValueType(); 11982 11983 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 11984 // since the result of setcc_c is all zero's or all ones. 11985 if (N1C && N0.getOpcode() == ISD::AND && 11986 N0.getOperand(1).getOpcode() == ISD::Constant) { 11987 SDValue N00 = N0.getOperand(0); 11988 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 11989 ((N00.getOpcode() == ISD::ANY_EXTEND || 11990 N00.getOpcode() == ISD::ZERO_EXTEND) && 11991 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 11992 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 11993 APInt ShAmt = N1C->getAPIntValue(); 11994 Mask = Mask.shl(ShAmt); 11995 if (Mask != 0) 11996 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 11997 N00, DAG.getConstant(Mask, VT)); 11998 } 11999 } 12000 12001 return SDValue(); 12002} 12003 12004/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 12005/// when possible. 12006static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 12007 const X86Subtarget *Subtarget) { 12008 EVT VT = N->getValueType(0); 12009 if (!VT.isVector() && VT.isInteger() && 12010 N->getOpcode() == ISD::SHL) 12011 return PerformSHLCombine(N, DAG); 12012 12013 // On X86 with SSE2 support, we can transform this to a vector shift if 12014 // all elements are shifted by the same amount. We can't do this in legalize 12015 // because the a constant vector is typically transformed to a constant pool 12016 // so we have no knowledge of the shift amount. 
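  // For example, (shl v4i32 X, <3,3,3,3>) is turned into the
  // Intrinsic::x86_sse2_pslli_d node below, i.e. a single PSLLD.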
12017 if (!Subtarget->hasSSE2()) 12018 return SDValue(); 12019 12020 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 12021 return SDValue(); 12022 12023 SDValue ShAmtOp = N->getOperand(1); 12024 EVT EltVT = VT.getVectorElementType(); 12025 DebugLoc DL = N->getDebugLoc(); 12026 SDValue BaseShAmt = SDValue(); 12027 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 12028 unsigned NumElts = VT.getVectorNumElements(); 12029 unsigned i = 0; 12030 for (; i != NumElts; ++i) { 12031 SDValue Arg = ShAmtOp.getOperand(i); 12032 if (Arg.getOpcode() == ISD::UNDEF) continue; 12033 BaseShAmt = Arg; 12034 break; 12035 } 12036 for (; i != NumElts; ++i) { 12037 SDValue Arg = ShAmtOp.getOperand(i); 12038 if (Arg.getOpcode() == ISD::UNDEF) continue; 12039 if (Arg != BaseShAmt) { 12040 return SDValue(); 12041 } 12042 } 12043 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 12044 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 12045 SDValue InVec = ShAmtOp.getOperand(0); 12046 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 12047 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 12048 unsigned i = 0; 12049 for (; i != NumElts; ++i) { 12050 SDValue Arg = InVec.getOperand(i); 12051 if (Arg.getOpcode() == ISD::UNDEF) continue; 12052 BaseShAmt = Arg; 12053 break; 12054 } 12055 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 12056 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 12057 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 12058 if (C->getZExtValue() == SplatIdx) 12059 BaseShAmt = InVec.getOperand(1); 12060 } 12061 } 12062 if (BaseShAmt.getNode() == 0) 12063 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 12064 DAG.getIntPtrConstant(0)); 12065 } else 12066 return SDValue(); 12067 12068 // The shift amount is an i32. 12069 if (EltVT.bitsGT(MVT::i32)) 12070 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 12071 else if (EltVT.bitsLT(MVT::i32)) 12072 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 12073 12074 // The shift amount is identical so we can do a vector shift. 
12075 SDValue ValOp = N->getOperand(0); 12076 switch (N->getOpcode()) { 12077 default: 12078 llvm_unreachable("Unknown shift opcode!"); 12079 break; 12080 case ISD::SHL: 12081 if (VT == MVT::v2i64) 12082 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12083 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 12084 ValOp, BaseShAmt); 12085 if (VT == MVT::v4i32) 12086 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12087 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 12088 ValOp, BaseShAmt); 12089 if (VT == MVT::v8i16) 12090 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12091 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 12092 ValOp, BaseShAmt); 12093 break; 12094 case ISD::SRA: 12095 if (VT == MVT::v4i32) 12096 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12097 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 12098 ValOp, BaseShAmt); 12099 if (VT == MVT::v8i16) 12100 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12101 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 12102 ValOp, BaseShAmt); 12103 break; 12104 case ISD::SRL: 12105 if (VT == MVT::v2i64) 12106 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12107 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 12108 ValOp, BaseShAmt); 12109 if (VT == MVT::v4i32) 12110 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12111 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 12112 ValOp, BaseShAmt); 12113 if (VT == MVT::v8i16) 12114 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 12115 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 12116 ValOp, BaseShAmt); 12117 break; 12118 } 12119 return SDValue(); 12120} 12121 12122 12123// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) 12124// where both setccs reference the same FP CMP, and rewrite for CMPEQSS 12125// and friends. Likewise for OR -> CMPNEQSS. 12126static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, 12127 TargetLowering::DAGCombinerInfo &DCI, 12128 const X86Subtarget *Subtarget) { 12129 unsigned opcode; 12130 12131 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but 12132 // we're requiring SSE2 for both. 12133 if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { 12134 SDValue N0 = N->getOperand(0); 12135 SDValue N1 = N->getOperand(1); 12136 SDValue CMP0 = N0->getOperand(1); 12137 SDValue CMP1 = N1->getOperand(1); 12138 DebugLoc DL = N->getDebugLoc(); 12139 12140 // The SETCCs should both refer to the same CMP. 
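    // i.e. the pattern is and/or (setcc E/NE, (cmp a, b)), (setcc NP/P, (cmp a, b)),
    // and it is rewritten below as one CMPEQSS/CMPNEQSS of a and b whose result
    // is masked down to a single bit.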
12141 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) 12142 return SDValue(); 12143 12144 SDValue CMP00 = CMP0->getOperand(0); 12145 SDValue CMP01 = CMP0->getOperand(1); 12146 EVT VT = CMP00.getValueType(); 12147 12148 if (VT == MVT::f32 || VT == MVT::f64) { 12149 bool ExpectingFlags = false; 12150 // Check for any users that want flags: 12151 for (SDNode::use_iterator UI = N->use_begin(), 12152 UE = N->use_end(); 12153 !ExpectingFlags && UI != UE; ++UI) 12154 switch (UI->getOpcode()) { 12155 default: 12156 case ISD::BR_CC: 12157 case ISD::BRCOND: 12158 case ISD::SELECT: 12159 ExpectingFlags = true; 12160 break; 12161 case ISD::CopyToReg: 12162 case ISD::SIGN_EXTEND: 12163 case ISD::ZERO_EXTEND: 12164 case ISD::ANY_EXTEND: 12165 break; 12166 } 12167 12168 if (!ExpectingFlags) { 12169 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); 12170 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); 12171 12172 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { 12173 X86::CondCode tmp = cc0; 12174 cc0 = cc1; 12175 cc1 = tmp; 12176 } 12177 12178 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || 12179 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { 12180 bool is64BitFP = (CMP00.getValueType() == MVT::f64); 12181 X86ISD::NodeType NTOperator = is64BitFP ? 12182 X86ISD::FSETCCsd : X86ISD::FSETCCss; 12183 // FIXME: need symbolic constants for these magic numbers. 12184 // See X86ATTInstPrinter.cpp:printSSECC(). 12185 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; 12186 SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, 12187 DAG.getConstant(x86cc, MVT::i8)); 12188 SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, 12189 OnesOrZeroesF); 12190 SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, 12191 DAG.getConstant(1, MVT::i32)); 12192 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); 12193 return OneBitOfTruth; 12194 } 12195 } 12196 } 12197 } 12198 return SDValue(); 12199} 12200 12201/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector 12202/// so it can be folded inside ANDNP. 12203static bool CanFoldXORWithAllOnes(const SDNode *N) { 12204 EVT VT = N->getValueType(0); 12205 12206 // Match direct AllOnes for 128 and 256-bit vectors 12207 if (ISD::isBuildVectorAllOnes(N)) 12208 return true; 12209 12210 // Look through a bit convert. 12211 if (N->getOpcode() == ISD::BITCAST) 12212 N = N->getOperand(0).getNode(); 12213 12214 // Sometimes the operand may come from a insert_subvector building a 256-bit 12215 // allones vector 12216 SDValue V1 = N->getOperand(0); 12217 SDValue V2 = N->getOperand(1); 12218 12219 if (VT.getSizeInBits() == 256 && 12220 N->getOpcode() == ISD::INSERT_SUBVECTOR && 12221 V1.getOpcode() == ISD::INSERT_SUBVECTOR && 12222 V1.getOperand(0).getOpcode() == ISD::UNDEF && 12223 ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && 12224 ISD::isBuildVectorAllOnes(V2.getNode())) 12225 return true; 12226 12227 return false; 12228} 12229 12230static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, 12231 TargetLowering::DAGCombinerInfo &DCI, 12232 const X86Subtarget *Subtarget) { 12233 if (DCI.isBeforeLegalizeOps()) 12234 return SDValue(); 12235 12236 SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); 12237 if (R.getNode()) 12238 return R; 12239 12240 // Want to form ANDNP nodes: 12241 // 1) In the hopes of then easily combining them with OR and AND nodes 12242 // to form PBLEND/PSIGN. 
12243   //      2) To match ANDN packed intrinsics
12244   EVT VT = N->getValueType(0);
12245   if (VT != MVT::v2i64 && VT != MVT::v4i64)
12246     return SDValue();
12247 
12248   SDValue N0 = N->getOperand(0);
12249   SDValue N1 = N->getOperand(1);
12250   DebugLoc DL = N->getDebugLoc();
12251 
12252   // Check LHS for vnot
12253   if (N0.getOpcode() == ISD::XOR &&
12254       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
12255       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
12256     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
12257 
12258   // Check RHS for vnot
12259   if (N1.getOpcode() == ISD::XOR &&
12260       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
12261       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
12262     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
12263 
12264   return SDValue();
12265}
12266 
12267 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
12268                                 TargetLowering::DAGCombinerInfo &DCI,
12269                                 const X86Subtarget *Subtarget) {
12270   if (DCI.isBeforeLegalizeOps())
12271     return SDValue();
12272 
12273   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
12274   if (R.getNode())
12275     return R;
12276 
12277   EVT VT = N->getValueType(0);
12278   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
12279     return SDValue();
12280 
12281   SDValue N0 = N->getOperand(0);
12282   SDValue N1 = N->getOperand(1);
12283 
12284   // look for psign/blend
12285   if (Subtarget->hasSSSE3()) {
12286     if (VT == MVT::v2i64) {
12287       // Canonicalize pandn to RHS
12288       if (N0.getOpcode() == X86ISD::ANDNP)
12289         std::swap(N0, N1);
12290       // or (and (m, x), (pandn m, y))
12291       if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
12292         SDValue Mask = N1.getOperand(0);
12293         SDValue X = N1.getOperand(1);
12294         SDValue Y;
12295         if (N0.getOperand(0) == Mask)
12296           Y = N0.getOperand(1);
12297         if (N0.getOperand(1) == Mask)
12298           Y = N0.getOperand(0);
12299 
12300         // Check to see if the mask appeared in both the AND and ANDNP.
12301         if (!Y.getNode())
12302           return SDValue();
12303 
12304         // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
12305         if (Mask.getOpcode() != ISD::BITCAST ||
12306             X.getOpcode() != ISD::BITCAST ||
12307             Y.getOpcode() != ISD::BITCAST)
12308           return SDValue();
12309 
12310         // Look through mask bitcast.
12311         Mask = Mask.getOperand(0);
12312         EVT MaskVT = Mask.getValueType();
12313 
12314         // Validate that the Mask operand is a vector sra node. The sra node
12315         // will be an intrinsic.
12316         if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
12317           return SDValue();
12318 
12319         // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
12320         // there is no psrai.b
12321         switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
12322         case Intrinsic::x86_sse2_psrai_w:
12323         case Intrinsic::x86_sse2_psrai_d:
12324           break;
12325         default: return SDValue();
12326         }
12327 
12328         // Check that the SRA is all signbits.
12329         SDValue SraC = Mask.getOperand(2);
12330         unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
12331         unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
12332         if ((SraAmt + 1) != EltBits)
12333           return SDValue();
12334 
12335         DebugLoc DL = N->getDebugLoc();
12336 
12337         // Now we know we at least have a pblendvb with the mask val. See if
12338         // we can form a psignb/w/d.
12339 // psign = x.type == y.type == mask.type && y = sub(0, x); 12340 X = X.getOperand(0); 12341 Y = Y.getOperand(0); 12342 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && 12343 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && 12344 X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ 12345 unsigned Opc = 0; 12346 switch (EltBits) { 12347 case 8: Opc = X86ISD::PSIGNB; break; 12348 case 16: Opc = X86ISD::PSIGNW; break; 12349 case 32: Opc = X86ISD::PSIGND; break; 12350 default: break; 12351 } 12352 if (Opc) { 12353 SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); 12354 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); 12355 } 12356 } 12357 // PBLENDVB only available on SSE 4.1 12358 if (!Subtarget->hasSSE41()) 12359 return SDValue(); 12360 12361 X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); 12362 Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); 12363 Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); 12364 Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask); 12365 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); 12366 } 12367 } 12368 } 12369 12370 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 12371 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 12372 std::swap(N0, N1); 12373 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 12374 return SDValue(); 12375 if (!N0.hasOneUse() || !N1.hasOneUse()) 12376 return SDValue(); 12377 12378 SDValue ShAmt0 = N0.getOperand(1); 12379 if (ShAmt0.getValueType() != MVT::i8) 12380 return SDValue(); 12381 SDValue ShAmt1 = N1.getOperand(1); 12382 if (ShAmt1.getValueType() != MVT::i8) 12383 return SDValue(); 12384 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 12385 ShAmt0 = ShAmt0.getOperand(0); 12386 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 12387 ShAmt1 = ShAmt1.getOperand(0); 12388 12389 DebugLoc DL = N->getDebugLoc(); 12390 unsigned Opc = X86ISD::SHLD; 12391 SDValue Op0 = N0.getOperand(0); 12392 SDValue Op1 = N1.getOperand(0); 12393 if (ShAmt0.getOpcode() == ISD::SUB) { 12394 Opc = X86ISD::SHRD; 12395 std::swap(Op0, Op1); 12396 std::swap(ShAmt0, ShAmt1); 12397 } 12398 12399 unsigned Bits = VT.getSizeInBits(); 12400 if (ShAmt1.getOpcode() == ISD::SUB) { 12401 SDValue Sum = ShAmt1.getOperand(0); 12402 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 12403 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 12404 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 12405 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 12406 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 12407 return DAG.getNode(Opc, DL, VT, 12408 Op0, Op1, 12409 DAG.getNode(ISD::TRUNCATE, DL, 12410 MVT::i8, ShAmt0)); 12411 } 12412 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 12413 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 12414 if (ShAmt0C && 12415 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 12416 return DAG.getNode(Opc, DL, VT, 12417 N0.getOperand(0), N1.getOperand(0), 12418 DAG.getNode(ISD::TRUNCATE, DL, 12419 MVT::i8, ShAmt0)); 12420 } 12421 12422 return SDValue(); 12423} 12424 12425/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 12426static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 12427 const X86Subtarget *Subtarget) { 12428 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 12429 // the FP state in cases where an emms may be missing. 
12430 // A preferable solution to the general problem is to figure out the right 12431 // places to insert EMMS. This qualifies as a quick hack. 12432 12433 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 12434 StoreSDNode *St = cast<StoreSDNode>(N); 12435 EVT VT = St->getValue().getValueType(); 12436 if (VT.getSizeInBits() != 64) 12437 return SDValue(); 12438 12439 const Function *F = DAG.getMachineFunction().getFunction(); 12440 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 12441 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 12442 && Subtarget->hasSSE2(); 12443 if ((VT.isVector() || 12444 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 12445 isa<LoadSDNode>(St->getValue()) && 12446 !cast<LoadSDNode>(St->getValue())->isVolatile() && 12447 St->getChain().hasOneUse() && !St->isVolatile()) { 12448 SDNode* LdVal = St->getValue().getNode(); 12449 LoadSDNode *Ld = 0; 12450 int TokenFactorIndex = -1; 12451 SmallVector<SDValue, 8> Ops; 12452 SDNode* ChainVal = St->getChain().getNode(); 12453 // Must be a store of a load. We currently handle two cases: the load 12454 // is a direct child, and it's under an intervening TokenFactor. It is 12455 // possible to dig deeper under nested TokenFactors. 12456 if (ChainVal == LdVal) 12457 Ld = cast<LoadSDNode>(St->getChain()); 12458 else if (St->getValue().hasOneUse() && 12459 ChainVal->getOpcode() == ISD::TokenFactor) { 12460 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 12461 if (ChainVal->getOperand(i).getNode() == LdVal) { 12462 TokenFactorIndex = i; 12463 Ld = cast<LoadSDNode>(St->getValue()); 12464 } else 12465 Ops.push_back(ChainVal->getOperand(i)); 12466 } 12467 } 12468 12469 if (!Ld || !ISD::isNormalLoad(Ld)) 12470 return SDValue(); 12471 12472 // If this is not the MMX case, i.e. we are just turning i64 load/store 12473 // into f64 load/store, avoid the transformation if there are multiple 12474 // uses of the loaded value. 12475 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 12476 return SDValue(); 12477 12478 DebugLoc LdDL = Ld->getDebugLoc(); 12479 DebugLoc StDL = N->getDebugLoc(); 12480 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 12481 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 12482 // pair instead. 12483 if (Subtarget->is64Bit() || F64IsLegal) { 12484 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 12485 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 12486 Ld->getPointerInfo(), Ld->isVolatile(), 12487 Ld->isNonTemporal(), Ld->getAlignment()); 12488 SDValue NewChain = NewLd.getValue(1); 12489 if (TokenFactorIndex != -1) { 12490 Ops.push_back(NewChain); 12491 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 12492 Ops.size()); 12493 } 12494 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 12495 St->getPointerInfo(), 12496 St->isVolatile(), St->isNonTemporal(), 12497 St->getAlignment()); 12498 } 12499 12500 // Otherwise, lower to two pairs of 32-bit loads / stores. 
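    // In outline (a sketch of the code that follows): load the low and high
    // i32 halves from [LoAddr] and [LoAddr+4], merge the load chains (plus
    // any TokenFactor operands collected above) into a new chain, then store
    // the two halves to [St->getBasePtr()] and [St->getBasePtr()+4] and join
    // the two store chains with a TokenFactor.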
12501 SDValue LoAddr = Ld->getBasePtr(); 12502 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 12503 DAG.getConstant(4, MVT::i32)); 12504 12505 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 12506 Ld->getPointerInfo(), 12507 Ld->isVolatile(), Ld->isNonTemporal(), 12508 Ld->getAlignment()); 12509 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 12510 Ld->getPointerInfo().getWithOffset(4), 12511 Ld->isVolatile(), Ld->isNonTemporal(), 12512 MinAlign(Ld->getAlignment(), 4)); 12513 12514 SDValue NewChain = LoLd.getValue(1); 12515 if (TokenFactorIndex != -1) { 12516 Ops.push_back(LoLd); 12517 Ops.push_back(HiLd); 12518 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 12519 Ops.size()); 12520 } 12521 12522 LoAddr = St->getBasePtr(); 12523 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 12524 DAG.getConstant(4, MVT::i32)); 12525 12526 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 12527 St->getPointerInfo(), 12528 St->isVolatile(), St->isNonTemporal(), 12529 St->getAlignment()); 12530 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 12531 St->getPointerInfo().getWithOffset(4), 12532 St->isVolatile(), 12533 St->isNonTemporal(), 12534 MinAlign(St->getAlignment(), 4)); 12535 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 12536 } 12537 return SDValue(); 12538} 12539 12540/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 12541/// X86ISD::FXOR nodes. 12542static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 12543 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 12544 // F[X]OR(0.0, x) -> x 12545 // F[X]OR(x, 0.0) -> x 12546 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 12547 if (C->getValueAPF().isPosZero()) 12548 return N->getOperand(1); 12549 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 12550 if (C->getValueAPF().isPosZero()) 12551 return N->getOperand(0); 12552 return SDValue(); 12553} 12554 12555/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 12556static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 12557 // FAND(0.0, x) -> 0.0 12558 // FAND(x, 0.0) -> 0.0 12559 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 12560 if (C->getValueAPF().isPosZero()) 12561 return N->getOperand(0); 12562 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 12563 if (C->getValueAPF().isPosZero()) 12564 return N->getOperand(1); 12565 return SDValue(); 12566} 12567 12568static SDValue PerformBTCombine(SDNode *N, 12569 SelectionDAG &DAG, 12570 TargetLowering::DAGCombinerInfo &DCI) { 12571 // BT ignores high bits in the bit index operand. 
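  // For example, with a 32-bit index operand only the low Log2_32(32) == 5
  // bits are demanded, so a wider mask or constant feeding the index can be
  // shrunk by the SimplifyDemandedBits call below.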
12572 SDValue Op1 = N->getOperand(1); 12573 if (Op1.hasOneUse()) { 12574 unsigned BitWidth = Op1.getValueSizeInBits(); 12575 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 12576 APInt KnownZero, KnownOne; 12577 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 12578 !DCI.isBeforeLegalizeOps()); 12579 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12580 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 12581 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 12582 DCI.CommitTargetLoweringOpt(TLO); 12583 } 12584 return SDValue(); 12585} 12586 12587static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 12588 SDValue Op = N->getOperand(0); 12589 if (Op.getOpcode() == ISD::BITCAST) 12590 Op = Op.getOperand(0); 12591 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 12592 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 12593 VT.getVectorElementType().getSizeInBits() == 12594 OpVT.getVectorElementType().getSizeInBits()) { 12595 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); 12596 } 12597 return SDValue(); 12598} 12599 12600static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 12601 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 12602 // (and (i32 x86isd::setcc_carry), 1) 12603 // This eliminates the zext. This transformation is necessary because 12604 // ISD::SETCC is always legalized to i8. 12605 DebugLoc dl = N->getDebugLoc(); 12606 SDValue N0 = N->getOperand(0); 12607 EVT VT = N->getValueType(0); 12608 if (N0.getOpcode() == ISD::AND && 12609 N0.hasOneUse() && 12610 N0.getOperand(0).hasOneUse()) { 12611 SDValue N00 = N0.getOperand(0); 12612 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 12613 return SDValue(); 12614 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 12615 if (!C || C->getZExtValue() != 1) 12616 return SDValue(); 12617 return DAG.getNode(ISD::AND, dl, VT, 12618 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 12619 N00.getOperand(0), N00.getOperand(1)), 12620 DAG.getConstant(1, VT)); 12621 } 12622 12623 return SDValue(); 12624} 12625 12626// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT 12627static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { 12628 unsigned X86CC = N->getConstantOperandVal(0); 12629 SDValue EFLAG = N->getOperand(1); 12630 DebugLoc DL = N->getDebugLoc(); 12631 12632 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without 12633 // a zext and produces an all-ones bit which is more useful than 0/1 in some 12634 // cases. 12635 if (X86CC == X86::COND_B) 12636 return DAG.getNode(ISD::AND, DL, MVT::i8, 12637 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, 12638 DAG.getConstant(X86CC, MVT::i8), EFLAG), 12639 DAG.getConstant(1, MVT::i8)); 12640 12641 return SDValue(); 12642} 12643 12644static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, 12645 const X86TargetLowering *XTLI) { 12646 SDValue Op0 = N->getOperand(0); 12647 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have 12648 // a 32-bit target where SSE doesn't support i64->FP operations. 
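  // e.g. (sint_to_fp (i64 (load p))) on a 32-bit target is rebuilt as an x87
  // FILD via BuildFILD, and the load's chain users are redirected to the FILD
  // chain (roughly; the guards below require a simple non-volatile,
  // non-extending load with a single use, a non-vector result, and a source
  // type that is not legal for the target).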
12649 if (Op0.getOpcode() == ISD::LOAD) { 12650 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); 12651 EVT VT = Ld->getValueType(0); 12652 if (!Ld->isVolatile() && !N->getValueType(0).isVector() && 12653 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && 12654 !XTLI->getSubtarget()->is64Bit() && 12655 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 12656 SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), 12657 Ld->getChain(), Op0, DAG); 12658 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); 12659 return FILDChain; 12660 } 12661 } 12662 return SDValue(); 12663} 12664 12665// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS 12666static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, 12667 X86TargetLowering::DAGCombinerInfo &DCI) { 12668 // If the LHS and RHS of the ADC node are zero, then it can't overflow and 12669 // the result is either zero or one (depending on the input carry bit). 12670 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 12671 if (X86::isZeroNode(N->getOperand(0)) && 12672 X86::isZeroNode(N->getOperand(1)) && 12673 // We don't have a good way to replace an EFLAGS use, so only do this when 12674 // dead right now. 12675 SDValue(N, 1).use_empty()) { 12676 DebugLoc DL = N->getDebugLoc(); 12677 EVT VT = N->getValueType(0); 12678 SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); 12679 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, 12680 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, 12681 DAG.getConstant(X86::COND_B,MVT::i8), 12682 N->getOperand(2)), 12683 DAG.getConstant(1, VT)); 12684 return DCI.CombineTo(N, Res1, CarryOut); 12685 } 12686 12687 return SDValue(); 12688} 12689 12690// fold (add Y, (sete X, 0)) -> adc 0, Y 12691// (add Y, (setne X, 0)) -> sbb -1, Y 12692// (sub (sete X, 0), Y) -> sbb 0, Y 12693// (sub (setne X, 0), Y) -> adc -1, Y 12694static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { 12695 DebugLoc DL = N->getDebugLoc(); 12696 12697 // Look through ZExts. 12698 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); 12699 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) 12700 return SDValue(); 12701 12702 SDValue SetCC = Ext.getOperand(0); 12703 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) 12704 return SDValue(); 12705 12706 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); 12707 if (CC != X86::COND_E && CC != X86::COND_NE) 12708 return SDValue(); 12709 12710 SDValue Cmp = SetCC.getOperand(1); 12711 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || 12712 !X86::isZeroNode(Cmp.getOperand(1)) || 12713 !Cmp.getOperand(0).getValueType().isInteger()) 12714 return SDValue(); 12715 12716 SDValue CmpOp0 = Cmp.getOperand(0); 12717 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, 12718 DAG.getConstant(1, CmpOp0.getValueType())); 12719 12720 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); 12721 if (CC == X86::COND_NE) 12722 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, 12723 DL, OtherVal.getValueType(), OtherVal, 12724 DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); 12725 return DAG.getNode(N->getOpcode() == ISD::SUB ? 
X86ISD::SBB : X86ISD::ADC, 12726 DL, OtherVal.getValueType(), OtherVal, 12727 DAG.getConstant(0, OtherVal.getValueType()), NewCmp); 12728} 12729 12730static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { 12731 SDValue Op0 = N->getOperand(0); 12732 SDValue Op1 = N->getOperand(1); 12733 12734 // X86 can't encode an immediate LHS of a sub. See if we can push the 12735 // negation into a preceding instruction. 12736 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { 12737 uint64_t Op0C = C->getSExtValue(); 12738 12739 // If the RHS of the sub is a XOR with one use and a constant, invert the 12740 // immediate. Then add one to the LHS of the sub so we can turn 12741 // X-Y -> X+~Y+1, saving one register. 12742 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && 12743 isa<ConstantSDNode>(Op1.getOperand(1))) { 12744 uint64_t XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue(); 12745 EVT VT = Op0.getValueType(); 12746 SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, 12747 Op1.getOperand(0), 12748 DAG.getConstant(~XorC, VT)); 12749 return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, 12750 DAG.getConstant(Op0C+1, VT)); 12751 } 12752 } 12753 12754 return OptimizeConditionalInDecrement(N, DAG); 12755} 12756 12757SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 12758 DAGCombinerInfo &DCI) const { 12759 SelectionDAG &DAG = DCI.DAG; 12760 switch (N->getOpcode()) { 12761 default: break; 12762 case ISD::EXTRACT_VECTOR_ELT: 12763 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 12764 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 12765 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 12766 case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); 12767 case ISD::SUB: return PerformSubCombine(N, DAG); 12768 case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); 12769 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 12770 case ISD::SHL: 12771 case ISD::SRA: 12772 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 12773 case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); 12774 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 12775 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 12776 case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); 12777 case X86ISD::FXOR: 12778 case X86ISD::FOR: return PerformFORCombine(N, DAG); 12779 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 12780 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 12781 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 12782 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 12783 case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); 12784 case X86ISD::SHUFPS: // Handle all target specific shuffles 12785 case X86ISD::SHUFPD: 12786 case X86ISD::PALIGN: 12787 case X86ISD::PUNPCKHBW: 12788 case X86ISD::PUNPCKHWD: 12789 case X86ISD::PUNPCKHDQ: 12790 case X86ISD::PUNPCKHQDQ: 12791 case X86ISD::UNPCKHPS: 12792 case X86ISD::UNPCKHPD: 12793 case X86ISD::VUNPCKHPSY: 12794 case X86ISD::VUNPCKHPDY: 12795 case X86ISD::PUNPCKLBW: 12796 case X86ISD::PUNPCKLWD: 12797 case X86ISD::PUNPCKLDQ: 12798 case X86ISD::PUNPCKLQDQ: 12799 case X86ISD::UNPCKLPS: 12800 case X86ISD::UNPCKLPD: 12801 case X86ISD::VUNPCKLPSY: 12802 case X86ISD::VUNPCKLPDY: 12803 case X86ISD::MOVHLPS: 12804 case X86ISD::MOVLHPS: 12805 case X86ISD::PSHUFD: 12806 case X86ISD::PSHUFHW: 12807 case X86ISD::PSHUFLW: 12808 case X86ISD::MOVSS: 12809 case X86ISD::MOVSD: 12810 case X86ISD::VPERMILPS: 
12811   case X86ISD::VPERMILPSY:
12812   case X86ISD::VPERMILPD:
12813   case X86ISD::VPERMILPDY:
12814   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
12815   }
12816 
12817   return SDValue();
12818}
12819 
12820 /// isTypeDesirableForOp - Return true if the target has native support for
12821 /// the specified value type and it is 'desirable' to use the type for the
12822 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
12823 /// instruction encodings are longer and some i16 instructions are slow.
12824 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
12825   if (!isTypeLegal(VT))
12826     return false;
12827   if (VT != MVT::i16)
12828     return true;
12829 
12830   switch (Opc) {
12831   default:
12832     return true;
12833   case ISD::LOAD:
12834   case ISD::SIGN_EXTEND:
12835   case ISD::ZERO_EXTEND:
12836   case ISD::ANY_EXTEND:
12837   case ISD::SHL:
12838   case ISD::SRL:
12839   case ISD::SUB:
12840   case ISD::ADD:
12841   case ISD::MUL:
12842   case ISD::AND:
12843   case ISD::OR:
12844   case ISD::XOR:
12845     return false;
12846   }
12847}
12848 
12849 /// IsDesirableToPromoteOp - This method queries the target whether it is
12850 /// beneficial for the dag combiner to promote the specified node. If true, it
12851 /// should return the desired promotion type by reference.
12852 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
12853   EVT VT = Op.getValueType();
12854   if (VT != MVT::i16)
12855     return false;
12856 
12857   bool Promote = false;
12858   bool Commute = false;
12859   switch (Op.getOpcode()) {
12860   default: break;
12861   case ISD::LOAD: {
12862     LoadSDNode *LD = cast<LoadSDNode>(Op);
12863     // If the non-extending load has a single use and it's not live out, then it
12864     // might be folded.
12865     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
12866                                                      Op.hasOneUse()*/) {
12867       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
12868              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
12869         // The only case where we'd want to promote LOAD (rather than it being
12870         // promoted as an operand) is when its only use is liveout.
12871         if (UI->getOpcode() != ISD::CopyToReg)
12872           return false;
12873       }
12874     }
12875     Promote = true;
12876     break;
12877   }
12878   case ISD::SIGN_EXTEND:
12879   case ISD::ZERO_EXTEND:
12880   case ISD::ANY_EXTEND:
12881     Promote = true;
12882     break;
12883   case ISD::SHL:
12884   case ISD::SRL: {
12885     SDValue N0 = Op.getOperand(0);
12886     // Look out for (store (shl (load), x)).
12887     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
12888       return false;
12889     Promote = true;
12890     break;
12891   }
12892   case ISD::ADD:
12893   case ISD::MUL:
12894   case ISD::AND:
12895   case ISD::OR:
12896   case ISD::XOR:
12897     Commute = true;
12898     // fallthrough
12899   case ISD::SUB: {
12900     SDValue N0 = Op.getOperand(0);
12901     SDValue N1 = Op.getOperand(1);
12902     if (!Commute && MayFoldLoad(N1))
12903       return false;
12904     // Avoid disabling potential load folding opportunities.
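    // e.g. for (or (load p), y) with a non-constant y, promoting to i32 could
    // keep the load from being folded as a memory operand of the 16-bit
    // instruction, so the checks below decline to promote in that case (a
    // rough illustration of the intent).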
12905 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 12906 return false; 12907 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 12908 return false; 12909 Promote = true; 12910 } 12911 } 12912 12913 PVT = MVT::i32; 12914 return Promote; 12915} 12916 12917//===----------------------------------------------------------------------===// 12918// X86 Inline Assembly Support 12919//===----------------------------------------------------------------------===// 12920 12921bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 12922 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 12923 12924 std::string AsmStr = IA->getAsmString(); 12925 12926 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 12927 SmallVector<StringRef, 4> AsmPieces; 12928 SplitString(AsmStr, AsmPieces, ";\n"); 12929 12930 switch (AsmPieces.size()) { 12931 default: return false; 12932 case 1: 12933 AsmStr = AsmPieces[0]; 12934 AsmPieces.clear(); 12935 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 12936 12937 // FIXME: this should verify that we are targeting a 486 or better. If not, 12938 // we will turn this bswap into something that will be lowered to logical ops 12939 // instead of emitting the bswap asm. For now, we don't support 486 or lower 12940 // so don't worry about this. 12941 // bswap $0 12942 if (AsmPieces.size() == 2 && 12943 (AsmPieces[0] == "bswap" || 12944 AsmPieces[0] == "bswapq" || 12945 AsmPieces[0] == "bswapl") && 12946 (AsmPieces[1] == "$0" || 12947 AsmPieces[1] == "${0:q}")) { 12948 // No need to check constraints, nothing other than the equivalent of 12949 // "=r,0" would be valid here. 12950 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 12951 if (!Ty || Ty->getBitWidth() % 16 != 0) 12952 return false; 12953 return IntrinsicLowering::LowerToByteSwap(CI); 12954 } 12955 // rorw $$8, ${0:w} --> llvm.bswap.i16 12956 if (CI->getType()->isIntegerTy(16) && 12957 AsmPieces.size() == 3 && 12958 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 12959 AsmPieces[1] == "$$8," && 12960 AsmPieces[2] == "${0:w}" && 12961 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 12962 AsmPieces.clear(); 12963 const std::string &ConstraintsStr = IA->getConstraintString(); 12964 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 12965 std::sort(AsmPieces.begin(), AsmPieces.end()); 12966 if (AsmPieces.size() == 4 && 12967 AsmPieces[0] == "~{cc}" && 12968 AsmPieces[1] == "~{dirflag}" && 12969 AsmPieces[2] == "~{flags}" && 12970 AsmPieces[3] == "~{fpsr}") { 12971 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 12972 if (!Ty || Ty->getBitWidth() % 16 != 0) 12973 return false; 12974 return IntrinsicLowering::LowerToByteSwap(CI); 12975 } 12976 } 12977 break; 12978 case 3: 12979 if (CI->getType()->isIntegerTy(32) && 12980 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 12981 SmallVector<StringRef, 4> Words; 12982 SplitString(AsmPieces[0], Words, " \t,"); 12983 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 12984 Words[2] == "${0:w}") { 12985 Words.clear(); 12986 SplitString(AsmPieces[1], Words, " \t,"); 12987 if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && 12988 Words[2] == "$0") { 12989 Words.clear(); 12990 SplitString(AsmPieces[2], Words, " \t,"); 12991 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 12992 Words[2] == "${0:w}") { 12993 AsmPieces.clear(); 12994 const std::string &ConstraintsStr = 
IA->getConstraintString(); 12995 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); 12996 std::sort(AsmPieces.begin(), AsmPieces.end()); 12997 if (AsmPieces.size() == 4 && 12998 AsmPieces[0] == "~{cc}" && 12999 AsmPieces[1] == "~{dirflag}" && 13000 AsmPieces[2] == "~{flags}" && 13001 AsmPieces[3] == "~{fpsr}") { 13002 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13003 if (!Ty || Ty->getBitWidth() % 16 != 0) 13004 return false; 13005 return IntrinsicLowering::LowerToByteSwap(CI); 13006 } 13007 } 13008 } 13009 } 13010 } 13011 13012 if (CI->getType()->isIntegerTy(64)) { 13013 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 13014 if (Constraints.size() >= 2 && 13015 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 13016 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 13017 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 13018 SmallVector<StringRef, 4> Words; 13019 SplitString(AsmPieces[0], Words, " \t"); 13020 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 13021 Words.clear(); 13022 SplitString(AsmPieces[1], Words, " \t"); 13023 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 13024 Words.clear(); 13025 SplitString(AsmPieces[2], Words, " \t,"); 13026 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 13027 Words[2] == "%edx") { 13028 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 13029 if (!Ty || Ty->getBitWidth() % 16 != 0) 13030 return false; 13031 return IntrinsicLowering::LowerToByteSwap(CI); 13032 } 13033 } 13034 } 13035 } 13036 } 13037 break; 13038 } 13039 return false; 13040} 13041 13042 13043 13044/// getConstraintType - Given a constraint letter, return the type of 13045/// constraint it is for this target. 13046X86TargetLowering::ConstraintType 13047X86TargetLowering::getConstraintType(const std::string &Constraint) const { 13048 if (Constraint.size() == 1) { 13049 switch (Constraint[0]) { 13050 case 'R': 13051 case 'q': 13052 case 'Q': 13053 case 'f': 13054 case 't': 13055 case 'u': 13056 case 'y': 13057 case 'x': 13058 case 'Y': 13059 case 'l': 13060 return C_RegisterClass; 13061 case 'a': 13062 case 'b': 13063 case 'c': 13064 case 'd': 13065 case 'S': 13066 case 'D': 13067 case 'A': 13068 return C_Register; 13069 case 'I': 13070 case 'J': 13071 case 'K': 13072 case 'L': 13073 case 'M': 13074 case 'N': 13075 case 'G': 13076 case 'C': 13077 case 'e': 13078 case 'Z': 13079 return C_Other; 13080 default: 13081 break; 13082 } 13083 } 13084 return TargetLowering::getConstraintType(Constraint); 13085} 13086 13087/// Examine constraint type and operand type and determine a weight value. 13088/// This object must already have been set up with the operand type 13089/// and the current alternative constraint selected. 13090TargetLowering::ConstraintWeight 13091 X86TargetLowering::getSingleConstraintMatchWeight( 13092 AsmOperandInfo &info, const char *constraint) const { 13093 ConstraintWeight weight = CW_Invalid; 13094 Value *CallOperandVal = info.CallOperandVal; 13095 // If we don't have a value, we can't do a match, 13096 // but allow it at the lowest weight. 13097 if (CallOperandVal == NULL) 13098 return CW_Default; 13099 Type *type = CallOperandVal->getType(); 13100 // Look at the constraint type. 
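  // In outline: the register letters ('R', 'q', 'Q', 'a'..'d', 'S', 'D', 'A',
  // and the fp-stack / MMX / XMM letters) score CW_SpecificReg or CW_Register
  // when the operand type fits, the immediate letters ('I'..'N', 'G', 'C',
  // 'e', 'Z') score CW_Constant when the value is in range, and anything else
  // falls back to the generic TargetLowering weighting.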
13101   switch (*constraint) {
13102   default:
13103     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
13104   case 'R':
13105   case 'q':
13106   case 'Q':
13107   case 'a':
13108   case 'b':
13109   case 'c':
13110   case 'd':
13111   case 'S':
13112   case 'D':
13113   case 'A':
13114     if (CallOperandVal->getType()->isIntegerTy())
13115       weight = CW_SpecificReg;
13116     break;
13117   case 'f':
13118   case 't':
13119   case 'u':
13120       if (type->isFloatingPointTy())
13121         weight = CW_SpecificReg;
13122       break;
13123   case 'y':
13124       if (type->isX86_MMXTy() && Subtarget->hasMMX())
13125         weight = CW_SpecificReg;
13126       break;
13127   case 'x':
13128   case 'Y':
13129     if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
13130       weight = CW_Register;
13131     break;
13132   case 'I':
13133     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
13134       if (C->getZExtValue() <= 31)
13135         weight = CW_Constant;
13136     }
13137     break;
13138   case 'J':
13139     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13140       if (C->getZExtValue() <= 63)
13141         weight = CW_Constant;
13142     }
13143     break;
13144   case 'K':
13145     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13146       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
13147         weight = CW_Constant;
13148     }
13149     break;
13150   case 'L':
13151     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13152       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
13153         weight = CW_Constant;
13154     }
13155     break;
13156   case 'M':
13157     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13158       if (C->getZExtValue() <= 3)
13159         weight = CW_Constant;
13160     }
13161     break;
13162   case 'N':
13163     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13164       if (C->getZExtValue() <= 0xff)
13165         weight = CW_Constant;
13166     }
13167     break;
13168   case 'G':
13169   case 'C':
13170     if (dyn_cast<ConstantFP>(CallOperandVal)) {
13171       weight = CW_Constant;
13172     }
13173     break;
13174   case 'e':
13175     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13176       if ((C->getSExtValue() >= -0x80000000LL) &&
13177           (C->getSExtValue() <= 0x7fffffffLL))
13178         weight = CW_Constant;
13179     }
13180     break;
13181   case 'Z':
13182     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
13183       if (C->getZExtValue() <= 0xffffffff)
13184         weight = CW_Constant;
13185     }
13186     break;
13187   }
13188   return weight;
13189}
13190 
13191 /// LowerXConstraint - try to replace an X constraint, which matches anything,
13192 /// with another that has more specific requirements based on the type of the
13193 /// corresponding operand.
13194 const char *X86TargetLowering::
13195 LowerXConstraint(EVT ConstraintVT) const {
13196   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
13197   // 'f' like normal targets.
13198   if (ConstraintVT.isFloatingPoint()) {
13199     if (Subtarget->hasXMMInt())
13200       return "Y";
13201     if (Subtarget->hasXMM())
13202       return "x";
13203   }
13204 
13205   return TargetLowering::LowerXConstraint(ConstraintVT);
13206 }
13207 
13208 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13209 /// vector. If it is invalid, don't add anything to Ops.
13210 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
13211                                                      std::string &Constraint,
13212                                                      std::vector<SDValue>&Ops,
13213                                                      SelectionDAG &DAG) const {
13214   SDValue Result(0, 0);
13215 
13216   // Only support length 1 constraints for now.
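  // As an illustration: for an immediate constraint such as 'N' (an unsigned
  // 8-bit value, e.g. an I/O port number), a ConstantSDNode with value <= 255
  // is converted below into a TargetConstant so it is emitted as an
  // immediate; out-of-range or non-constant operands return without adding
  // anything to Ops.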
13217 if (Constraint.length() > 1) return; 13218 13219 char ConstraintLetter = Constraint[0]; 13220 switch (ConstraintLetter) { 13221 default: break; 13222 case 'I': 13223 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13224 if (C->getZExtValue() <= 31) { 13225 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13226 break; 13227 } 13228 } 13229 return; 13230 case 'J': 13231 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13232 if (C->getZExtValue() <= 63) { 13233 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13234 break; 13235 } 13236 } 13237 return; 13238 case 'K': 13239 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13240 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 13241 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13242 break; 13243 } 13244 } 13245 return; 13246 case 'N': 13247 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13248 if (C->getZExtValue() <= 255) { 13249 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13250 break; 13251 } 13252 } 13253 return; 13254 case 'e': { 13255 // 32-bit signed value 13256 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13257 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 13258 C->getSExtValue())) { 13259 // Widen to 64 bits here to get it sign extended. 13260 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 13261 break; 13262 } 13263 // FIXME gcc accepts some relocatable values here too, but only in certain 13264 // memory models; it's complicated. 13265 } 13266 return; 13267 } 13268 case 'Z': { 13269 // 32-bit unsigned value 13270 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 13271 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 13272 C->getZExtValue())) { 13273 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 13274 break; 13275 } 13276 } 13277 // FIXME gcc accepts some relocatable values here too, but only in certain 13278 // memory models; it's complicated. 13279 return; 13280 } 13281 case 'i': { 13282 // Literal immediates are always ok. 13283 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 13284 // Widen to 64 bits here to get it sign extended. 13285 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 13286 break; 13287 } 13288 13289 // In any sort of PIC mode addresses need to be computed at runtime by 13290 // adding in a register or some sort of table lookup. These can't 13291 // be used as immediates. 13292 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 13293 return; 13294 13295 // If we are in non-pic codegen mode, we allow the address of a global (with 13296 // an optional displacement) to be used with 'i'. 13297 GlobalAddressSDNode *GA = 0; 13298 int64_t Offset = 0; 13299 13300 // Match either (GA), (GA+C), (GA+C1+C2), etc. 13301 while (1) { 13302 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 13303 Offset += GA->getOffset(); 13304 break; 13305 } else if (Op.getOpcode() == ISD::ADD) { 13306 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 13307 Offset += C->getZExtValue(); 13308 Op = Op.getOperand(0); 13309 continue; 13310 } 13311 } else if (Op.getOpcode() == ISD::SUB) { 13312 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 13313 Offset += -C->getZExtValue(); 13314 Op = Op.getOperand(0); 13315 continue; 13316 } 13317 } 13318 13319 // Otherwise, this isn't something we can handle, reject it. 
13320 return; 13321 } 13322 13323 const GlobalValue *GV = GA->getGlobal(); 13324 // If we require an extra load to get this address, as in PIC mode, we 13325 // can't accept it. 13326 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 13327 getTargetMachine()))) 13328 return; 13329 13330 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 13331 GA->getValueType(0), Offset); 13332 break; 13333 } 13334 } 13335 13336 if (Result.getNode()) { 13337 Ops.push_back(Result); 13338 return; 13339 } 13340 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 13341} 13342 13343std::pair<unsigned, const TargetRegisterClass*> 13344X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 13345 EVT VT) const { 13346 // First, see if this is a constraint that directly corresponds to an LLVM 13347 // register class. 13348 if (Constraint.size() == 1) { 13349 // GCC Constraint Letters 13350 switch (Constraint[0]) { 13351 default: break; 13352 // TODO: Slight differences here in allocation order and leaving 13353 // RIP in the class. Do they matter any more here than they do 13354 // in the normal allocation? 13355 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 13356 if (Subtarget->is64Bit()) { 13357 if (VT == MVT::i32 || VT == MVT::f32) 13358 return std::make_pair(0U, X86::GR32RegisterClass); 13359 else if (VT == MVT::i16) 13360 return std::make_pair(0U, X86::GR16RegisterClass); 13361 else if (VT == MVT::i8 || VT == MVT::i1) 13362 return std::make_pair(0U, X86::GR8RegisterClass); 13363 else if (VT == MVT::i64 || VT == MVT::f64) 13364 return std::make_pair(0U, X86::GR64RegisterClass); 13365 break; 13366 } 13367 // 32-bit fallthrough 13368 case 'Q': // Q_REGS 13369 if (VT == MVT::i32 || VT == MVT::f32) 13370 return std::make_pair(0U, X86::GR32_ABCDRegisterClass); 13371 else if (VT == MVT::i16) 13372 return std::make_pair(0U, X86::GR16_ABCDRegisterClass); 13373 else if (VT == MVT::i8 || VT == MVT::i1) 13374 return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass); 13375 else if (VT == MVT::i64) 13376 return std::make_pair(0U, X86::GR64_ABCDRegisterClass); 13377 break; 13378 case 'r': // GENERAL_REGS 13379 case 'l': // INDEX_REGS 13380 if (VT == MVT::i8 || VT == MVT::i1) 13381 return std::make_pair(0U, X86::GR8RegisterClass); 13382 if (VT == MVT::i16) 13383 return std::make_pair(0U, X86::GR16RegisterClass); 13384 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) 13385 return std::make_pair(0U, X86::GR32RegisterClass); 13386 return std::make_pair(0U, X86::GR64RegisterClass); 13387 case 'R': // LEGACY_REGS 13388 if (VT == MVT::i8 || VT == MVT::i1) 13389 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 13390 if (VT == MVT::i16) 13391 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 13392 if (VT == MVT::i32 || !Subtarget->is64Bit()) 13393 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 13394 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 13395 case 'f': // FP Stack registers. 13396 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 13397 // value to the correct fpstack register class. 13398 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 13399 return std::make_pair(0U, X86::RFP32RegisterClass); 13400 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 13401 return std::make_pair(0U, X86::RFP64RegisterClass); 13402 return std::make_pair(0U, X86::RFP80RegisterClass); 13403 case 'y': // MMX_REGS if MMX allowed. 
13404 if (!Subtarget->hasMMX()) break; 13405 return std::make_pair(0U, X86::VR64RegisterClass); 13406 case 'Y': // SSE_REGS if SSE2 allowed 13407 if (!Subtarget->hasXMMInt()) break; 13408 // FALL THROUGH. 13409 case 'x': // SSE_REGS if SSE1 allowed 13410 if (!Subtarget->hasXMM()) break; 13411 13412 switch (VT.getSimpleVT().SimpleTy) { 13413 default: break; 13414 // Scalar SSE types. 13415 case MVT::f32: 13416 case MVT::i32: 13417 return std::make_pair(0U, X86::FR32RegisterClass); 13418 case MVT::f64: 13419 case MVT::i64: 13420 return std::make_pair(0U, X86::FR64RegisterClass); 13421 // Vector types. 13422 case MVT::v16i8: 13423 case MVT::v8i16: 13424 case MVT::v4i32: 13425 case MVT::v2i64: 13426 case MVT::v4f32: 13427 case MVT::v2f64: 13428 return std::make_pair(0U, X86::VR128RegisterClass); 13429 } 13430 break; 13431 } 13432 } 13433 13434 // Use the default implementation in TargetLowering to convert the register 13435 // constraint into a member of a register class. 13436 std::pair<unsigned, const TargetRegisterClass*> Res; 13437 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 13438 13439 // Not found as a standard register? 13440 if (Res.second == 0) { 13441 // Map st(0) -> st(7) -> ST0 13442 if (Constraint.size() == 7 && Constraint[0] == '{' && 13443 tolower(Constraint[1]) == 's' && 13444 tolower(Constraint[2]) == 't' && 13445 Constraint[3] == '(' && 13446 (Constraint[4] >= '0' && Constraint[4] <= '7') && 13447 Constraint[5] == ')' && 13448 Constraint[6] == '}') { 13449 13450 Res.first = X86::ST0+Constraint[4]-'0'; 13451 Res.second = X86::RFP80RegisterClass; 13452 return Res; 13453 } 13454 13455 // GCC allows "st(0)" to be called just plain "st". 13456 if (StringRef("{st}").equals_lower(Constraint)) { 13457 Res.first = X86::ST0; 13458 Res.second = X86::RFP80RegisterClass; 13459 return Res; 13460 } 13461 13462 // flags -> EFLAGS 13463 if (StringRef("{flags}").equals_lower(Constraint)) { 13464 Res.first = X86::EFLAGS; 13465 Res.second = X86::CCRRegisterClass; 13466 return Res; 13467 } 13468 13469 // 'A' means EAX + EDX. 13470 if (Constraint == "A") { 13471 Res.first = X86::EAX; 13472 Res.second = X86::GR32_ADRegisterClass; 13473 return Res; 13474 } 13475 return Res; 13476 } 13477 13478 // Otherwise, check to see if this is a register class of the wrong value 13479 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 13480 // turn into {ax},{dx}. 13481 if (Res.second->hasType(VT)) 13482 return Res; // Correct type already, nothing to do. 13483 13484 // All of the single-register GCC register classes map their values onto 13485 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 13486 // really want an 8-bit or 32-bit register, map to the appropriate register 13487 // class and return the appropriate register. 
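  // For example, "{ax}" paired with an i32 operand is remapped below to EAX
  // in GR32 (or to RAX in GR64 for i64, AL in GR8 for i8), and an XMM
  // register matched with the wrong class is moved into FR32/FR64 for scalar
  // f32/f64 values.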
13488 if (Res.second == X86::GR16RegisterClass) { 13489 if (VT == MVT::i8) { 13490 unsigned DestReg = 0; 13491 switch (Res.first) { 13492 default: break; 13493 case X86::AX: DestReg = X86::AL; break; 13494 case X86::DX: DestReg = X86::DL; break; 13495 case X86::CX: DestReg = X86::CL; break; 13496 case X86::BX: DestReg = X86::BL; break; 13497 } 13498 if (DestReg) { 13499 Res.first = DestReg; 13500 Res.second = X86::GR8RegisterClass; 13501 } 13502 } else if (VT == MVT::i32) { 13503 unsigned DestReg = 0; 13504 switch (Res.first) { 13505 default: break; 13506 case X86::AX: DestReg = X86::EAX; break; 13507 case X86::DX: DestReg = X86::EDX; break; 13508 case X86::CX: DestReg = X86::ECX; break; 13509 case X86::BX: DestReg = X86::EBX; break; 13510 case X86::SI: DestReg = X86::ESI; break; 13511 case X86::DI: DestReg = X86::EDI; break; 13512 case X86::BP: DestReg = X86::EBP; break; 13513 case X86::SP: DestReg = X86::ESP; break; 13514 } 13515 if (DestReg) { 13516 Res.first = DestReg; 13517 Res.second = X86::GR32RegisterClass; 13518 } 13519 } else if (VT == MVT::i64) { 13520 unsigned DestReg = 0; 13521 switch (Res.first) { 13522 default: break; 13523 case X86::AX: DestReg = X86::RAX; break; 13524 case X86::DX: DestReg = X86::RDX; break; 13525 case X86::CX: DestReg = X86::RCX; break; 13526 case X86::BX: DestReg = X86::RBX; break; 13527 case X86::SI: DestReg = X86::RSI; break; 13528 case X86::DI: DestReg = X86::RDI; break; 13529 case X86::BP: DestReg = X86::RBP; break; 13530 case X86::SP: DestReg = X86::RSP; break; 13531 } 13532 if (DestReg) { 13533 Res.first = DestReg; 13534 Res.second = X86::GR64RegisterClass; 13535 } 13536 } 13537 } else if (Res.second == X86::FR32RegisterClass || 13538 Res.second == X86::FR64RegisterClass || 13539 Res.second == X86::VR128RegisterClass) { 13540 // Handle references to XMM physical registers that got mapped into the 13541 // wrong class. This can happen with constraints like {xmm0} where the 13542 // target independent register mapper will just pick the first match it can 13543 // find, ignoring the required type. 13544 if (VT == MVT::f32) 13545 Res.second = X86::FR32RegisterClass; 13546 else if (VT == MVT::f64) 13547 Res.second = X86::FR64RegisterClass; 13548 else if (X86::VR128RegisterClass->hasType(VT)) 13549 Res.second = X86::VR128RegisterClass; 13550 } 13551 13552 return Res; 13553} 13554